@@ -95,59 +95,32 @@ cfg_match! {
9595 if multibyte_mask == 0 {
9696 assert!( intra_chunk_offset == 0 ) ;
9797
98- // Check if there are any control characters in the chunk. All
99- // control characters that we can encounter at this point have a
100- // byte value less than 32 or ...
101- let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
102- let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
103-
104- // ... it's the ASCII 'DEL' character with a value of 127.
105- let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
106- let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
107-
108- let control_char_mask = control_char_mask0 | control_char_mask1;
109-
110- if control_char_mask != 0 {
111- // Check for newlines in the chunk
112- let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
113- let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
114-
115- if control_char_mask == newlines_mask {
116- // All control characters are newlines, record them
117- let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
118-
119- while newlines_mask != 0 {
120- let index = newlines_mask. trailing_zeros( ) ;
121-
122- lines. push( RelativeBytePos ( index) + output_offset) ;
123-
124- // Clear the bit, so we can find the next one.
125- newlines_mask &= newlines_mask - 1 ;
126- }
127-
128- // We are done for this chunk. All control characters were
129- // newlines and we took care of those.
130- continue ;
131- } else {
132- // Some of the control characters are not newlines,
133- // fall through to the slow path below.
134- }
135- } else {
136- // No control characters, nothing to record for this chunk
137- continue ;
98+ // Check for newlines in the chunk
99+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
100+ let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
101+
102+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
103+
104+ while newlines_mask != 0 {
105+ let index = newlines_mask. trailing_zeros( ) ;
106+
107+ lines. push( RelativeBytePos ( index) + output_offset) ;
108+
109+ // Clear the bit, so we can find the next one.
110+ newlines_mask &= newlines_mask - 1 ;
138111 }
112+ } else {
113+ // The slow path.
114+ // There are multibyte chars in here, fallback to generic decoding.
115+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
116+ intra_chunk_offset = analyze_source_file_generic(
117+ & src[ scan_start..] ,
118+ CHUNK_SIZE - intra_chunk_offset,
119+ RelativeBytePos :: from_usize( scan_start) ,
120+ lines,
121+ multi_byte_chars,
122+ ) ;
139123 }
140-
141- // The slow path.
142- // There are control chars in here, fallback to generic decoding.
143- let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
144- intra_chunk_offset = analyze_source_file_generic(
145- & src[ scan_start..] ,
146- CHUNK_SIZE - intra_chunk_offset,
147- RelativeBytePos :: from_usize( scan_start) ,
148- lines,
149- multi_byte_chars,
150- ) ;
151124 }
152125
153126 // There might still be a tail left to analyze
@@ -247,59 +220,32 @@ cfg_match! {
247220 if multibyte_mask == 0 {
248221 assert!( intra_chunk_offset == 0 ) ;
249222
250- // Check if there are any control characters in the chunk. All
251- // control characters that we can encounter at this point have a
252- // byte value less than 32 or ...
253- let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
254- let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
255-
256- // ... it's the ASCII 'DEL' character with a value of 127.
257- let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
258- let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
259-
260- let control_char_mask = control_char_mask0 | control_char_mask1;
261-
262- if control_char_mask != 0 {
263- // Check for newlines in the chunk
264- let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
265- let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
266-
267- if control_char_mask == newlines_mask {
268- // All control characters are newlines, record them
269- let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
270-
271- while newlines_mask != 0 {
272- let index = newlines_mask. trailing_zeros( ) ;
273-
274- lines. push( RelativeBytePos ( index) + output_offset) ;
275-
276- // Clear the bit, so we can find the next one.
277- newlines_mask &= newlines_mask - 1 ;
278- }
279-
280- // We are done for this chunk. All control characters were
281- // newlines and we took care of those.
282- continue ;
283- } else {
284- // Some of the control characters are not newlines,
285- // fall through to the slow path below.
286- }
287- } else {
288- // No control characters, nothing to record for this chunk
289- continue ;
223+ // Check for newlines in the chunk
224+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
225+ let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
226+
227+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
228+
229+ while newlines_mask != 0 {
230+ let index = newlines_mask. trailing_zeros( ) ;
231+
232+ lines. push( RelativeBytePos ( index) + output_offset) ;
233+
234+ // Clear the bit, so we can find the next one.
235+ newlines_mask &= newlines_mask - 1 ;
290236 }
237+ } else {
238+ // The slow path.
239+ // There are multibyte chars in here, fallback to generic decoding.
240+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
241+ intra_chunk_offset = analyze_source_file_generic(
242+ & src[ scan_start..] ,
243+ CHUNK_SIZE - intra_chunk_offset,
244+ RelativeBytePos :: from_usize( scan_start) ,
245+ lines,
246+ multi_byte_chars,
247+ ) ;
291248 }
292-
293- // The slow path.
294- // There are control chars in here, fallback to generic decoding.
295- let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
296- intra_chunk_offset = analyze_source_file_generic(
297- & src[ scan_start..] ,
298- CHUNK_SIZE - intra_chunk_offset,
299- RelativeBytePos :: from_usize( scan_start) ,
300- lines,
301- multi_byte_chars,
302- ) ;
303249 }
304250
305251 // There might still be a tail left to analyze
@@ -357,29 +303,18 @@ fn analyze_source_file_generic(
357303 // string.
358304 let mut char_len = 1 ;
359305
360- if byte < 32 {
361- // This is an ASCII control character, it could be one of the cases
362- // that are interesting to us.
363-
306+ if byte == b'\n' {
364307 let pos = RelativeBytePos :: from_usize ( i) + output_offset;
365-
366- if let b'\n' = byte {
367- lines. push ( pos + RelativeBytePos ( 1 ) ) ;
368- }
369- } else if byte >= 127 {
370- // The slow path:
371- // This is either ASCII control character "DEL" or the beginning of
372- // a multibyte char. Just decode to `char`.
308+ lines. push ( pos + RelativeBytePos ( 1 ) ) ;
309+ } else if byte >= 128 {
310+ // This is the beginning of a multibyte char. Just decode to `char`.
373311 let c = src[ i..] . chars ( ) . next ( ) . unwrap ( ) ;
374312 char_len = c. len_utf8 ( ) ;
375313
376314 let pos = RelativeBytePos :: from_usize ( i) + output_offset;
377-
378- if char_len > 1 {
379- assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ;
380- let mbc = MultiByteChar { pos, bytes : char_len as u8 } ;
381- multi_byte_chars. push ( mbc) ;
382- }
315+ assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ;
316+ let mbc = MultiByteChar { pos, bytes : char_len as u8 } ;
317+ multi_byte_chars. push ( mbc) ;
383318 }
384319
385320 i += char_len;
0 commit comments