1#![deny(missing_debug_implementations, missing_docs, rust_2018_idioms)]
4
5#[cfg(test)]
6mod tests;
7
8use nohash_hasher::IntMap;
9
10pub use text_size::{TextRange, TextSize};
11
/// `(line, column)` information in the native, UTF-8 encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct LineCol {
    /// Zero-based line number.
    pub line: u32,
    /// Zero-based offset from the start of the line, counted in UTF-8 bytes.
    pub col: u32,
}
20
/// A kind of wide character encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum WideEncoding {
    /// UTF-16: columns are counted in 16-bit code units.
    Utf16,
    /// UTF-32: columns are counted in Unicode scalar values.
    Utf32,
}

impl WideEncoding {
    /// Returns the number of code units it takes to encode `text` in this encoding.
    ///
    /// For [`WideEncoding::Utf16`] this is the number of UTF-16 code units (a character
    /// outside the Basic Multilingual Plane counts as two); for [`WideEncoding::Utf32`]
    /// it is the number of `char`s.
    pub fn measure(&self, text: &str) -> usize {
        match self {
            WideEncoding::Utf16 => text.encode_utf16().count(),
            WideEncoding::Utf32 => text.chars().count(),
        }
    }
}
40
/// `(line, column)` information in wide encodings.
///
/// See [`WideEncoding`] for the kinds of wide encodings available.
//
// Deliberately a separate, non-generic type so it cannot be mixed up with `LineCol`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct WideLineCol {
    /// Zero-based line number.
    pub line: u32,
    /// Zero-based column, counted in code units of the wide encoding in use.
    pub col: u32,
}
53
54#[derive(Debug, Clone, Copy, PartialEq, Eq)]
55struct WideChar {
56 start: TextSize,
58 end: TextSize,
60}
61
62impl WideChar {
63 fn len(&self) -> TextSize {
65 self.end - self.start
66 }
67
68 fn wide_len(&self, enc: WideEncoding) -> u32 {
70 match enc {
71 WideEncoding::Utf16 => {
72 if self.len() == TextSize::from(4) {
73 2
74 } else {
75 1
76 }
77 }
78 WideEncoding::Utf32 => 1,
79 }
80 }
81}
82
83#[derive(Debug, Clone, PartialEq, Eq)]
85pub struct LineIndex {
86 newlines: Box<[TextSize]>,
88 line_wide_chars: IntMap<u32, Box<[WideChar]>>,
90 len: TextSize,
92}
93
94impl LineIndex {
95 pub fn new(text: &str) -> LineIndex {
97 let (newlines, line_wide_chars) = analyze_source_file(text);
98 LineIndex {
99 newlines: newlines.into_boxed_slice(),
100 line_wide_chars,
101 len: TextSize::of(text),
102 }
103 }
104
105 pub fn line_col(&self, offset: TextSize) -> LineCol {
111 self.try_line_col(offset).expect("invalid offset")
112 }
113
114 pub fn try_line_col(&self, offset: TextSize) -> Option<LineCol> {
119 if offset > self.len {
120 return None;
121 }
122 let line = self.newlines.partition_point(|&it| it <= offset);
123 let start = self.start_offset(line)?;
124 let col = offset - start;
125 let ret = LineCol { line: line as u32, col: col.into() };
126 self.line_wide_chars
127 .get(&ret.line)
128 .into_iter()
129 .flat_map(|it| it.iter())
130 .all(|it| col <= it.start || it.end <= col)
131 .then_some(ret)
132 }
133
134 pub fn offset(&self, line_col: LineCol) -> Option<TextSize> {
136 self.start_offset(line_col.line as usize).map(|start| start + TextSize::from(line_col.col))
137 }
138
139 fn start_offset(&self, line: usize) -> Option<TextSize> {
140 match line.checked_sub(1) {
141 None => Some(TextSize::from(0)),
142 Some(it) => self.newlines.get(it).copied(),
143 }
144 }
145
146 pub fn to_wide(&self, enc: WideEncoding, line_col: LineCol) -> Option<WideLineCol> {
148 let mut col = line_col.col;
149 if let Some(wide_chars) = self.line_wide_chars.get(&line_col.line) {
150 for c in wide_chars.iter() {
151 if u32::from(c.end) <= line_col.col {
152 col = col.checked_sub(u32::from(c.len()) - c.wide_len(enc))?;
153 } else {
154 break;
157 }
158 }
159 }
160 Some(WideLineCol { line: line_col.line, col })
161 }
162
163 pub fn to_utf8(&self, enc: WideEncoding, line_col: WideLineCol) -> Option<LineCol> {
165 let mut col = line_col.col;
166 if let Some(wide_chars) = self.line_wide_chars.get(&line_col.line) {
167 for c in wide_chars.iter() {
168 if col > u32::from(c.start) {
169 col = col.checked_add(u32::from(c.len()) - c.wide_len(enc))?;
170 } else {
171 break;
174 }
175 }
176 }
177 Some(LineCol { line: line_col.line, col })
178 }
179
180 pub fn line(&self, line: u32) -> Option<TextRange> {
182 let start = self.start_offset(line as usize)?;
183 let next_newline = self.newlines.get(line as usize).copied().unwrap_or(self.len);
184 let line_length = next_newline - start;
185 Some(TextRange::new(start, start + line_length))
186 }
187
188 pub fn lines(&self, range: TextRange) -> impl Iterator<Item = TextRange> + '_ {
192 let lo = self.newlines.partition_point(|&it| it < range.start());
193 let hi = self.newlines.partition_point(|&it| it <= range.end());
194 let all = std::iter::once(range.start())
195 .chain(self.newlines[lo..hi].iter().copied())
196 .chain(std::iter::once(range.end()));
197
198 all.clone()
199 .zip(all.skip(1))
200 .map(|(lo, hi)| TextRange::new(lo, hi))
201 .filter(|it| !it.is_empty())
202 }
203
204 pub fn len(&self) -> TextSize {
206 self.len
207 }
208}
209
210fn analyze_source_file(src: &str) -> (Vec<TextSize>, IntMap<u32, Box<[WideChar]>>) {
212 assert!(src.len() < !0u32 as usize);
213 let mut lines = vec![];
214 let mut line_wide_chars = IntMap::<u32, Vec<WideChar>>::default();
215
216 analyze_source_file_dispatch(src, &mut lines, &mut line_wide_chars);
218
219 (lines, line_wide_chars.into_iter().map(|(k, v)| (k, v.into_boxed_slice())).collect())
220}
221
222#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
223fn analyze_source_file_dispatch(
224 src: &str,
225 lines: &mut Vec<TextSize>,
226 multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
227) {
228 if is_x86_feature_detected!("sse2") {
229 unsafe {
231 analyze_source_file_sse2(src, lines, multi_byte_chars);
232 }
233 } else {
234 analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
235 }
236}
237
/// Little-endian aarch64 entry point: uses the NEON scanner when the CPU supports it and the
/// generic byte-by-byte scanner otherwise.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
fn analyze_source_file_dispatch(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    if !std::arch::is_aarch64_feature_detected!("neon") {
        analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
        return;
    }

    // SAFETY: NEON support was checked just above.
    unsafe {
        analyze_source_file_neon(src, lines, multi_byte_chars);
    }
}
253
254#[target_feature(enable = "sse2")]
259#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
260#[allow(unsafe_op_in_unsafe_fn)]
262unsafe fn analyze_source_file_sse2(
263 src: &str,
264 lines: &mut Vec<TextSize>,
265 multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
266) {
267 #[cfg(target_arch = "x86")]
268 use std::arch::x86::*;
269 #[cfg(target_arch = "x86_64")]
270 use std::arch::x86_64::*;
271
272 const CHUNK_SIZE: usize = 16;
273
274 let src_bytes = src.as_bytes();
275
276 let chunk_count = src.len() / CHUNK_SIZE;
277
278 let mut intra_chunk_offset = 0;
283
284 for chunk_index in 0..chunk_count {
285 let ptr = src_bytes.as_ptr() as *const __m128i;
286 let chunk = unsafe { _mm_loadu_si128(ptr.add(chunk_index)) };
289
290 let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
293 let multibyte_mask = _mm_movemask_epi8(multibyte_test);
295
296 if multibyte_mask == 0 {
298 assert!(intra_chunk_offset == 0);
299
300 let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
302 let newlines_mask = _mm_movemask_epi8(newlines_test);
303
304 if newlines_mask != 0 {
305 let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
307 let output_offset = TextSize::from((chunk_index * CHUNK_SIZE + 1) as u32);
308
309 loop {
310 let index = newlines_mask.trailing_zeros();
311
312 if index >= CHUNK_SIZE as u32 {
313 break;
315 }
316
317 lines.push(TextSize::from(index) + output_offset);
318
319 newlines_mask &= (!1) << index;
321 }
322 }
323 continue;
324 }
325
326 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
329 intra_chunk_offset = analyze_source_file_generic(
330 &src[scan_start..],
331 CHUNK_SIZE - intra_chunk_offset,
332 TextSize::from(scan_start as u32),
333 lines,
334 multi_byte_chars,
335 );
336 }
337
338 let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
340 if tail_start < src.len() {
341 analyze_source_file_generic(
342 &src[tail_start..],
343 src.len() - tail_start,
344 TextSize::from(tail_start as u32),
345 lines,
346 multi_byte_chars,
347 );
348 }
349}
350
/// Packs a 16-byte vector into a 64-bit mask by shifting each 16-bit lane right by four and
/// narrowing it to 8 bits.
///
/// When every input byte is either `0x00` or `0xFF`, each 4-bit nibble of the result
/// corresponds to one input byte, with the least significant nibble corresponding to the
/// first byte of the vector.
///
/// # Safety
///
/// The caller must ensure the CPU supports NEON.
#[target_feature(enable = "neon")]
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[inline]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn move_mask(v: std::arch::aarch64::uint8x16_t) -> u64 {
    use std::arch::aarch64::*;

    let as_u16_lanes = vreinterpretq_u16_u8(v);
    let nibbles = vshrn_n_u16(as_u16_lanes, 4);
    let packed = vreinterpret_u64_u8(nibbles);
    vget_lane_u64(packed, 0)
}
367
/// Checks 16-byte chunks of text at a time. A chunk that contains any byte with the high bit
/// set is handed to the generic implementation; otherwise NEON intrinsics are used to find
/// all newlines in the chunk at once.
///
/// # Safety
///
/// The caller must ensure the CPU supports NEON.
#[target_feature(enable = "neon")]
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn analyze_source_file_neon(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    use std::arch::aarch64::*;

    const CHUNK_SIZE: usize = 16;

    let bytes = src.as_bytes();
    let full_chunks = bytes.len() / CHUNK_SIZE;

    let newline = vdupq_n_s8(b'\n' as i8);

    // Where decoding of the next chunk should start. If a multi-byte character spans a chunk
    // boundary, its trailing bytes were already handled and must be skipped in the next chunk.
    let mut intra_chunk_offset = 0;

    for (chunk_index, chunk_bytes) in bytes.chunks_exact(CHUNK_SIZE).enumerate() {
        let chunk_start = chunk_index * CHUNK_SIZE;

        // SAFETY: `chunk_bytes` is exactly `CHUNK_SIZE` (16) readable bytes.
        let chunk = unsafe { vld1q_s8(chunk_bytes.as_ptr() as *const i8) };

        // A byte that is negative as `i8` has its high bit set, i.e. it belongs to a
        // multi-byte UTF-8 sequence.
        let multibyte_lanes = vcltzq_s8(chunk);
        let multibyte_mask = unsafe { move_mask(multibyte_lanes) };

        if multibyte_mask != 0 {
            // The slow path: fall back to generic decoding for (the rest of) this chunk.
            let scan_start = chunk_start + intra_chunk_offset;
            intra_chunk_offset = analyze_source_file_generic(
                &src[scan_start..],
                CHUNK_SIZE - intra_chunk_offset,
                TextSize::from(scan_start as u32),
                lines,
                multi_byte_chars,
            );
            continue;
        }

        // Pure ASCII chunk: no character from the previous chunk can have spilled into it.
        assert!(intra_chunk_offset == 0);

        let newline_lanes = vceqq_s8(chunk, newline);
        let mut pending = unsafe { move_mask(newline_lanes) };

        // A line starts one byte *after* its preceding newline.
        let after_newline = TextSize::from((chunk_start + 1) as u32);

        while pending != 0 {
            // The mask holds four bits per byte, so the byte index is a quarter of the bit index.
            let low_bit = pending.trailing_zeros();
            lines.push(TextSize::from(low_bit / 4) + after_newline);

            // Clear the nibble just handled so the next newline becomes the lowest set one.
            pending &= (!0xF) << low_bit;
        }
    }

    // There might still be a tail shorter than a full chunk left to analyze.
    let tail_start = full_chunks * CHUNK_SIZE + intra_chunk_offset;
    if tail_start < src.len() {
        analyze_source_file_generic(
            &src[tail_start..],
            src.len() - tail_start,
            TextSize::from(tail_start as u32),
            lines,
            multi_byte_chars,
        );
    }
}
449
/// Entry point for targets without a vectorized scanner: always runs the generic
/// byte-by-byte implementation over the whole of `src`.
#[cfg(not(any(
    target_arch = "x86",
    target_arch = "x86_64",
    all(target_arch = "aarch64", target_endian = "little")
)))]
fn analyze_source_file_dispatch(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    let whole_len = src.len();
    let first_offset = TextSize::from(0);
    // The returned overshoot is irrelevant here because the entire text is scanned in one go.
    analyze_source_file_generic(src, whole_len, first_offset, lines, multi_byte_chars);
}
463
464fn analyze_source_file_generic(
468 src: &str,
469 scan_len: usize,
470 output_offset: TextSize,
471 lines: &mut Vec<TextSize>,
472 multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
473) -> usize {
474 assert!(src.len() >= scan_len);
475 let mut i = 0;
476 let src_bytes = src.as_bytes();
477
478 while i < scan_len {
479 let byte = unsafe {
480 *src_bytes.get_unchecked(i)
482 };
483
484 let mut char_len = 1;
487
488 if byte == b'\n' {
489 lines.push(TextSize::from(i as u32 + 1) + output_offset);
490 } else if byte >= 127 {
491 let c = src[i..].chars().next().unwrap();
493 char_len = c.len_utf8();
494
495 let pos = TextSize::from(i as u32) + output_offset
498 - lines.last().unwrap_or(&TextSize::default());
499
500 if char_len > 1 {
501 assert!((2..=4).contains(&char_len));
502 let mbc = WideChar { start: pos, end: pos + TextSize::from(char_len as u32) };
503 multi_byte_chars.entry(lines.len() as u32).or_default().push(mbc);
504 }
505 }
506
507 i += char_len;
508 }
509
510 i - scan_len
511}