Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 585f313

Browse files
committedJan 25, 2024
Implement RFC 3349, mixed utf8 literals.
Specifically:
- Allow unicode chars in b"" and br"" literals. This is done by changing `Mode::allow_unicode_chars` to succeed on `ByteStr` and `RawByteStr`.
- Allow unicode escapes in b"" literals. This is done by changing `Mode::allow_unicode_escapes` to succeed on `ByteStr`.

Byte string literals can already have high bytes (`\x80`..`\xff`). Because they now also support unicode chars, they can now be mixed utf8, so we use `unescape_mixed`/`cook_mixed` instead of `unescape_unicode`/`cook_unicode` to process them.

A new type, `Rfc3349`, is used to implement the feature gating. Values of that type are threaded through the unescaping code to track whether rules from rfc3349 are required for unescaping to succeed.

Test changes:
- tests/ui/mixed-utf8-literals/basic.rs: new `check-pass` UI test with various literals exercising the new forms.
- tests/ui/attributes/key-value-non-ascii.rs: changed from a byte string literal to a byte literal; we just need some kind of problem with a literal to preserve the test's intent.
- tests/ui/parser/raw/raw-byte-string-literals.rs: moved the raw byte string literal with a non-ASCII char to `basic.rs`.
- tests/ui/parser/byte-string-literals.rs: similar.
- tests/ui/parser/issues/issue-23620-invalid-escapes.rs: moved one case fully to `basic.rs`, and one partially.
- tests/ui/parser/unicode-control-codepoints.rs: left the code unchanged, but the errors are now about mixed-utf8-literals being feature gated.
- tests/ui/suggestions/multibyte-escapes.rs: moved one case to `basic.rs`.
- compiler/rustc_lexer/src/unescape/tests.rs: various adjustments
  - two cases that previously failed now succeed
  - added some more cases for the newly supported syntax

I wasn't sure how to handle rust-analyzer in general, so I just allowed mixed utf8 literals everywhere without complaint.
1 parent 6be2e56 commit 585f313

File tree

29 files changed

+364
-348
lines changed

29 files changed

+364
-348
lines changed
 

‎compiler/rustc_ast/src/util/literal.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
44
use crate::token::{self, Token};
55
use rustc_lexer::unescape::{
6-
byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
6+
unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
77
};
88
use rustc_span::symbol::{kw, sym, Symbol};
99
use rustc_span::Span;
@@ -49,7 +49,8 @@ impl LitKind {
4949

5050
// For byte/char/string literals, chars and escapes have already been
5151
// checked in the lexer (in `cook_lexer_literal`). So we can assume all
52-
// chars and escapes are valid here.
52+
// chars and escapes are valid here, and ignore `Rfc3349` return
53+
// values.
5354
Ok(match kind {
5455
token::Bool => {
5556
assert!(symbol.is_bool_lit());
@@ -84,7 +85,7 @@ impl LitKind {
8485
// Force-inlining here is aggressive but the closure is
8586
// called on every char in the string, so it can be hot in
8687
// programs with many long strings containing escapes.
87-
unescape_unicode(
88+
_ = unescape_unicode(
8889
s,
8990
Mode::Str,
9091
&mut #[inline(always)]
@@ -108,8 +109,11 @@ impl LitKind {
108109
token::ByteStr => {
109110
let s = symbol.as_str();
110111
let mut buf = Vec::with_capacity(s.len());
111-
unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c {
112-
Ok(c) => buf.push(byte_from_char(c)),
112+
_ = unescape_mixed(s, Mode::ByteStr, &mut |_, c| match c {
113+
Ok(MixedUnit::Char(c)) => {
114+
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
115+
}
116+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
113117
Err(err) => {
114118
assert!(!err.is_fatal(), "failed to unescape string literal")
115119
}
@@ -125,7 +129,7 @@ impl LitKind {
125129
token::CStr => {
126130
let s = symbol.as_str();
127131
let mut buf = Vec::with_capacity(s.len());
128-
unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
132+
_ = unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
129133
Ok(MixedUnit::Char(c)) => {
130134
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
131135
}

‎compiler/rustc_ast_passes/src/feature_gate.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,7 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) {
508508
}
509509
};
510510
}
511+
gate_all!(mixed_utf8_literals, r#"mixed utf8 b"" and br"" literals are experimental"#);
511512
gate_all!(
512513
if_let_guard,
513514
"`if let` guards are experimental",

‎compiler/rustc_feature/src/unstable.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,8 @@ declare_features! (
520520
/// standard library until the soundness issues with specialization
521521
/// are fixed.
522522
(unstable, min_specialization, "1.7.0", Some(31844)),
523+
/// Allows mixed utf8 b"" and br"" literals.
524+
(unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)),
523525
/// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns.
524526
(unstable, more_qualified_paths, "1.54.0", Some(86935)),
525527
/// Allows the `#[must_not_suspend]` attribute.

‎compiler/rustc_lexer/src/unescape.rs

Lines changed: 64 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ impl EscapeError {
8585
///
8686
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
8787
/// the callback will be called exactly once.
88-
pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
88+
pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
8989
where
9090
F: FnMut(Range<usize>, Result<char, EscapeError>),
9191
{
@@ -94,16 +94,17 @@ where
9494
let mut chars = src.chars();
9595
let res = unescape_char_or_byte(&mut chars, mode);
9696
callback(0..(src.len() - chars.as_str().len()), res);
97+
Rfc3349::Unused // rfc3349 not relevant for `Mode::{Char,Byte}`
9798
}
98-
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
99+
Str => unescape_non_raw_common(src, mode, callback),
99100
RawStr | RawByteStr => check_raw_common(src, mode, callback),
100101
RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
101102
if let Ok('\0') = result {
102103
result = Err(EscapeError::NulInCStr);
103104
}
104105
callback(r, result)
105106
}),
106-
CStr => unreachable!(),
107+
ByteStr | CStr => unreachable!(),
107108
}
108109
}
109110

@@ -142,18 +143,19 @@ impl From<u8> for MixedUnit {
142143
/// a sequence of escaped characters or errors.
143144
///
144145
/// Values are returned by invoking `callback`.
145-
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
146+
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
146147
where
147148
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
148149
{
149150
match mode {
151+
ByteStr => unescape_non_raw_common(src, mode, callback),
150152
CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
151153
if let Ok(MixedUnit::Char('\0')) = result {
152154
result = Err(EscapeError::NulInCStr);
153155
}
154156
callback(r, result)
155157
}),
156-
Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
158+
Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable!(),
157159
}
158160
}
159161

@@ -169,6 +171,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
169171
unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char)
170172
}
171173

174+
/// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the
175+
/// literal to be valid. Once rfc3349 is stabilized this type can be removed.
176+
#[derive(Debug, PartialEq)]
177+
#[must_use]
178+
pub enum Rfc3349 {
179+
Used,
180+
Unused,
181+
}
182+
172183
/// What kind of literal do we parse.
173184
#[derive(Debug, Clone, Copy, PartialEq)]
174185
pub enum Mode {
@@ -205,17 +216,25 @@ impl Mode {
205216

206217
/// Are unicode (non-ASCII) chars allowed?
207218
#[inline]
208-
fn allow_unicode_chars(self) -> bool {
219+
fn allow_unicode_chars(self, rfc3349: &mut Rfc3349) -> bool {
209220
match self {
210-
Byte | ByteStr | RawByteStr => false,
221+
Byte => false,
222+
ByteStr | RawByteStr => {
223+
*rfc3349 = Rfc3349::Used;
224+
true
225+
}
211226
Char | Str | RawStr | CStr | RawCStr => true,
212227
}
213228
}
214229

215230
/// Are unicode escapes (`\u`) allowed?
216-
fn allow_unicode_escapes(self) -> bool {
231+
fn allow_unicode_escapes(self, rfc3349: &mut Rfc3349) -> bool {
217232
match self {
218-
Byte | ByteStr => false,
233+
Byte => false,
234+
ByteStr => {
235+
*rfc3349 = Rfc3349::Used;
236+
true
237+
}
219238
Char | Str | CStr => true,
220239
RawByteStr | RawStr | RawCStr => unreachable!(),
221240
}
@@ -233,6 +252,7 @@ impl Mode {
233252
fn scan_escape<T: From<char> + From<u8>>(
234253
chars: &mut Chars<'_>,
235254
mode: Mode,
255+
rfc3349: &mut Rfc3349,
236256
) -> Result<T, EscapeError> {
237257
// Previous character was '\\', unescape what follows.
238258
let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
@@ -262,13 +282,17 @@ fn scan_escape<T: From<char> + From<u8>>(
262282
Ok(T::from(value as u8))
263283
};
264284
}
265-
'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
285+
'u' => return scan_unicode(chars, mode, rfc3349).map(T::from),
266286
_ => return Err(EscapeError::InvalidEscape),
267287
};
268288
Ok(T::from(res))
269289
}
270290

271-
fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
291+
fn scan_unicode(
292+
chars: &mut Chars<'_>,
293+
mode: Mode,
294+
rfc3349: &mut Rfc3349,
295+
) -> Result<char, EscapeError> {
272296
// We've parsed '\u', now we have to parse '{..}'.
273297

274298
if chars.next() != Some('{') {
@@ -296,7 +320,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
296320

297321
// Incorrect syntax has higher priority for error reporting
298322
// than unallowed value for a literal.
299-
if !allow_unicode_escapes {
323+
if !mode.allow_unicode_escapes(rfc3349) {
300324
return Err(EscapeError::UnicodeEscapeInByte);
301325
}
302326

@@ -322,18 +346,27 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
322346
}
323347

324348
#[inline]
325-
fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
326-
if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) }
349+
fn ascii_check(c: char, mode: Mode, rfc3349: &mut Rfc3349) -> Result<char, EscapeError> {
350+
// We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
351+
if c.is_ascii() || mode.allow_unicode_chars(rfc3349) {
352+
Ok(c)
353+
} else {
354+
Err(EscapeError::NonAsciiCharInByte)
355+
}
327356
}
328357

329358
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
330359
let c = chars.next().ok_or(EscapeError::ZeroChars)?;
360+
let mut rfc3349 = Rfc3349::Unused;
331361
let res = match c {
332-
'\\' => scan_escape(chars, mode),
362+
'\\' => scan_escape(chars, mode, &mut rfc3349),
333363
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
334364
'\r' => Err(EscapeError::BareCarriageReturn),
335-
_ => ascii_check(c, mode.allow_unicode_chars()),
365+
_ => ascii_check(c, mode, &mut rfc3349),
336366
}?;
367+
368+
assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::{Char,Byte}`
369+
337370
if chars.next().is_some() {
338371
return Err(EscapeError::MoreThanOneChar);
339372
}
@@ -342,12 +375,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
342375

343376
/// Takes the contents of a string literal (without quotes) and produces a
344377
/// sequence of escaped characters or errors.
345-
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
378+
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(
379+
src: &str,
380+
mode: Mode,
381+
callback: &mut F,
382+
) -> Rfc3349
346383
where
347384
F: FnMut(Range<usize>, Result<T, EscapeError>),
348385
{
349386
let mut chars = src.chars();
350-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
387+
let mut rfc3349 = Rfc3349::Unused;
351388

352389
// The `start` and `end` computation here is complicated because
353390
// `skip_ascii_whitespace` makes us skip over chars without counting
@@ -367,16 +404,17 @@ where
367404
});
368405
continue;
369406
}
370-
_ => scan_escape::<T>(&mut chars, mode),
407+
_ => scan_escape::<T>(&mut chars, mode, &mut rfc3349),
371408
}
372409
}
373410
'"' => Err(EscapeError::EscapeOnlyChar),
374411
'\r' => Err(EscapeError::BareCarriageReturn),
375-
_ => ascii_check(c, allow_unicode_chars).map(T::from),
412+
_ => ascii_check(c, mode, &mut rfc3349).map(T::from),
376413
};
377414
let end = src.len() - chars.as_str().len();
378415
callback(start..end, res);
379416
}
417+
rfc3349
380418
}
381419

382420
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
@@ -409,12 +447,12 @@ where
409447
/// sequence of characters or errors.
410448
/// NOTE: Raw strings do not perform any explicit character escaping, here we
411449
/// only produce errors on bare CR.
412-
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
450+
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
413451
where
414452
F: FnMut(Range<usize>, Result<char, EscapeError>),
415453
{
416454
let mut chars = src.chars();
417-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
455+
let mut rfc3349 = Rfc3349::Unused;
418456

419457
// The `start` and `end` computation here matches the one in
420458
// `unescape_non_raw_common` for consistency, even though this function
@@ -423,16 +461,17 @@ where
423461
let start = src.len() - chars.as_str().len() - c.len_utf8();
424462
let res = match c {
425463
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
426-
_ => ascii_check(c, allow_unicode_chars),
464+
_ => ascii_check(c, mode, &mut rfc3349),
427465
};
428466
let end = src.len() - chars.as_str().len();
429467
callback(start..end, res);
430468
}
469+
rfc3349
431470
}
432471

433472
#[inline]
434-
pub fn byte_from_char(c: char) -> u8 {
473+
pub(crate) fn byte_from_char(c: char) -> u8 {
435474
let res = c as u32;
436-
debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
475+
debug_assert!(res <= u8::MAX as u32, "guaranteed because of Byte");
437476
res as u8
438477
}

‎compiler/rustc_lexer/src/unescape/tests.rs

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@ fn test_unescape_char_good() {
100100
fn test_unescape_str_warn() {
101101
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
102102
let mut unescaped = Vec::with_capacity(literal.len());
103-
unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
103+
let rfc3349 =
104+
unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
105+
assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str`
104106
assert_eq!(unescaped, expected);
105107
}
106108

@@ -124,14 +126,15 @@ fn test_unescape_str_warn() {
124126
fn test_unescape_str_good() {
125127
fn check(literal_text: &str, expected: &str) {
126128
let mut buf = Ok(String::with_capacity(literal_text.len()));
127-
unescape_unicode(literal_text, Mode::Str, &mut |range, c| {
129+
let rfc3349 = unescape_unicode(literal_text, Mode::Str, &mut |range, c| {
128130
if let Ok(b) = &mut buf {
129131
match c {
130132
Ok(c) => b.push(c),
131133
Err(e) => buf = Err((range, e)),
132134
}
133135
}
134136
});
137+
assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str`
135138
assert_eq!(buf.as_deref(), Ok(expected))
136139
}
137140

@@ -239,32 +242,43 @@ fn test_unescape_byte_good() {
239242

240243
#[test]
241244
fn test_unescape_byte_str_good() {
242-
fn check(literal_text: &str, expected: &[u8]) {
243-
let mut buf = Ok(Vec::with_capacity(literal_text.len()));
244-
unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| {
245-
if let Ok(b) = &mut buf {
245+
fn check(literal_text: &str, expected: &[u8], rfc3349_expected: Rfc3349) {
246+
let mut buf_res = Ok(Vec::with_capacity(literal_text.len()));
247+
let rfc3349_actual = unescape_mixed(literal_text, Mode::ByteStr, &mut |range, c| {
248+
if let Ok(buf) = &mut buf_res {
246249
match c {
247-
Ok(c) => b.push(byte_from_char(c)),
248-
Err(e) => buf = Err((range, e)),
250+
Ok(MixedUnit::Char(c)) => {
251+
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
252+
}
253+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
254+
Err(e) => buf_res = Err((range, e)),
249255
}
250256
}
251257
});
252-
assert_eq!(buf.as_deref(), Ok(expected))
258+
assert_eq!(rfc3349_actual, rfc3349_expected);
259+
assert_eq!(buf_res.as_deref(), Ok(expected))
253260
}
254261

255-
check("foo", b"foo");
256-
check("", b"");
257-
check(" \t\n", b" \t\n");
262+
check("foo", b"foo", Rfc3349::Unused);
263+
check("", b"", Rfc3349::Unused);
264+
check(" \t\n", b" \t\n", Rfc3349::Unused);
265+
266+
check("hello \\\n world", b"hello world", Rfc3349::Unused);
267+
check("thread's", b"thread's", Rfc3349::Unused);
258268

259-
check("hello \\\n world", b"hello world");
260-
check("thread's", b"thread's")
269+
let a_pound_up_smiley = &[0x61, 0xc2, 0xa3, 0xe2, 0x86, 0x91, 0xf0, 0x9f, 0x98, 0x80];
270+
check("a£↑😀", a_pound_up_smiley, Rfc3349::Used);
271+
check(r"\u{61}\u{a3}\u{2191}\u{1f600}", a_pound_up_smiley, Rfc3349::Used);
272+
check(r"\x00\x7f\x80\xffa¥", &[0, 0x7f, 0x80, 0xff, 0x61, 0xc2, 0xa5], Rfc3349::Used);
261273
}
262274

263275
#[test]
264276
fn test_unescape_raw_str() {
265277
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
266278
let mut unescaped = Vec::with_capacity(literal.len());
267-
unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
279+
let rfc3349 =
280+
unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
281+
assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::RawStr`
268282
assert_eq!(unescaped, expected);
269283
}
270284

@@ -274,13 +288,20 @@ fn test_unescape_raw_str() {
274288

275289
#[test]
276290
fn test_unescape_raw_byte_str() {
277-
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
291+
fn check(
292+
literal: &str,
293+
expected: &[(Range<usize>, Result<char, EscapeError>)],
294+
rfc3349_expected: Rfc3349,
295+
) {
278296
let mut unescaped = Vec::with_capacity(literal.len());
279-
unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
297+
let rfc3349_actual = unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| {
298+
unescaped.push((range, res))
299+
});
300+
assert_eq!(rfc3349_actual, rfc3349_expected);
280301
assert_eq!(unescaped, expected);
281302
}
282303

283-
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
284-
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]);
285-
check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok('a'))]);
304+
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))], Rfc3349::Unused);
305+
check("🦀", &[(0..4, Ok('🦀'))], Rfc3349::Used);
306+
check("¥a", &[(0..2, Ok('¥')), (2..3, Ok('a'))], Rfc3349::Used);
286307
}

‎compiler/rustc_parse/src/lexer/mod.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
88
use rustc_ast::tokenstream::TokenStream;
99
use rustc_ast::util::unicode::contains_text_flow_control_chars;
1010
use rustc_errors::{error_code, Applicability, DiagCtxt, DiagnosticBuilder, StashKey};
11-
use rustc_lexer::unescape::{self, EscapeError, Mode};
12-
use rustc_lexer::{Base, DocStyle, RawStrError};
13-
use rustc_lexer::{Cursor, LiteralKind};
11+
use rustc_lexer::unescape::{self, EscapeError, Mode, Rfc3349};
12+
use rustc_lexer::{Base, Cursor, DocStyle, LiteralKind, RawStrError};
1413
use rustc_session::lint::builtin::{
1514
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
1615
};
@@ -436,7 +435,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
436435
.with_code(error_code!(E0766))
437436
.emit()
438437
}
439-
self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
438+
self.cook_mixed(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
440439
}
441440
rustc_lexer::LiteralKind::CStr { terminated } => {
442441
if !terminated {
@@ -697,13 +696,13 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
697696
end: BytePos,
698697
prefix_len: u32,
699698
postfix_len: u32,
700-
unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
699+
unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)) -> Rfc3349,
701700
) -> (token::LitKind, Symbol) {
702701
let mut has_fatal_err = false;
703702
let content_start = start + BytePos(prefix_len);
704703
let content_end = end - BytePos(postfix_len);
705704
let lit_content = self.str_from_to(content_start, content_end);
706-
unescape(lit_content, mode, &mut |range, result| {
705+
let rfc3349 = unescape(lit_content, mode, &mut |range, result| {
707706
// Here we only check for errors. The actual unescaping is done later.
708707
if let Err(err) = result {
709708
let span_with_quotes = self.mk_sp(start, end);
@@ -725,6 +724,9 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
725724
);
726725
}
727726
});
727+
if rfc3349 == Rfc3349::Used {
728+
self.sess.gated_spans.gate(sym::mixed_utf8_literals, self.mk_sp(start, end));
729+
}
728730

729731
// We normally exclude the quotes for the symbol, but for errors we
730732
// include it because it results in clearer error messages.

‎compiler/rustc_parse/src/lexer/unescape_error_reporting.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ pub(crate) fn emit_unescape_error(
175175
EscapeError::NonAsciiCharInByte => {
176176
let (c, span) = last_char();
177177
let desc = match mode {
178+
// Note: once rfc3349 stabilizes, only `Mode::Byte` will be reachable here.
178179
Mode::Byte => "byte literal",
179180
Mode::ByteStr => "byte string literal",
180181
Mode::RawByteStr => "raw byte string literal",
@@ -188,7 +189,7 @@ pub(crate) fn emit_unescape_error(
188189
};
189190
err.span_label(span, format!("must be ASCII{postfix}"));
190191
// Note: the \\xHH suggestions are not given for raw byte string
191-
// literals, because they are araw and so cannot use any escapes.
192+
// literals, because they cannot use escapes.
192193
if (c as u32) <= 0xFF && mode != Mode::RawByteStr {
193194
err.span_suggestion(
194195
span,

‎compiler/rustc_parse_format/src/lib.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1056,13 +1056,14 @@ fn find_width_map_from_snippet(
10561056
fn unescape_string(string: &str) -> Option<string::String> {
10571057
let mut buf = string::String::new();
10581058
let mut ok = true;
1059-
unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| {
1060-
match unescaped_char {
1061-
Ok(c) => buf.push(c),
1062-
Err(_) => ok = false,
1063-
}
1064-
});
1065-
1059+
let rfc3349 =
1060+
unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| {
1061+
match unescaped_char {
1062+
Ok(c) => buf.push(c),
1063+
Err(_) => ok = false,
1064+
}
1065+
});
1066+
assert_eq!(rfc3349, unescape::Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str`
10661067
ok.then_some(buf)
10671068
}
10681069

‎compiler/rustc_span/src/symbol.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,6 +1060,7 @@ symbols! {
10601060
mir_unwind_unreachable,
10611061
mir_variant,
10621062
miri,
1063+
mixed_utf8_literals,
10631064
mmx_reg,
10641065
modifiers,
10651066
module,
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# `mixed_utf8_literals`
2+
3+
The tracking issue for this feature is: [#116907]
4+
5+
[#116907]: https://github.com/rust-lang/rust/issues/116907
6+
7+
------------------------
8+
9+
This feature extends the syntax of string literals in the following ways.
10+
- Byte string literals can contain unicode chars (e.g. `b"🦀"`) and unicode
11+
escapes (e.g. `b"\u{1f980}"`).
12+
- Raw byte string literals can contain unicode chars (e.g. `br"🦀"`).
13+
14+
This makes it easier to work with strings that are mostly UTF-8 encoded but
15+
also contain some non-UTF-8 bytes, which are sometimes called "conventionally
16+
UTF-8" strings.

‎src/tools/rust-analyzer/crates/parser/src/lexed_str.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -365,9 +365,11 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str {
365365
EscapeError::NonAsciiCharInByte if mode == Mode::Byte => {
366366
"non-ASCII character in byte literal"
367367
}
368+
// Note: once rfc3349 stabilizes, this arm will be unreachable.
368369
EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => {
369370
"non-ASCII character in byte string literal"
370371
}
372+
// Note: once rfc3349 stabilizes, this arm will be unreachable.
371373
EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal",
372374
EscapeError::NulInCStr => "null character in C string literal",
373375
EscapeError::UnskippedWhitespaceWarning => "",
@@ -378,15 +380,17 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str {
378380
fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str {
379381
let mut error_message = "";
380382
match mode {
381-
Mode::CStr => {
382-
rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| {
383+
Mode::ByteStr | Mode::CStr => {
384+
// Can ignore the `Rfc3349` return value.
385+
_ = rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| {
383386
if let Err(e) = res {
384387
error_message = error_to_diagnostic_message(e, mode);
385388
}
386389
});
387390
}
388-
Mode::ByteStr | Mode::Str => {
389-
rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| {
391+
Mode::Str => {
392+
// Can ignore the `Rfc3349` return value.
393+
_ = rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| {
390394
if let Err(e) = res {
391395
error_message = error_to_diagnostic_message(e, mode);
392396
}

‎src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs

Lines changed: 79 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,8 @@ pub trait IsString: AstToken {
193193
let text = &self.text()[text_range_no_quotes - start];
194194
let offset = text_range_no_quotes.start() - start;
195195

196-
unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| {
196+
// Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals.
197+
_ = unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| {
197198
let text_range =
198199
TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap());
199200
cb(text_range + offset, unescaped_char);
@@ -226,7 +227,8 @@ impl ast::String {
226227
let mut buf = String::new();
227228
let mut prev_end = 0;
228229
let mut has_error = false;
229-
unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match (
230+
// Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals.
231+
_ = unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match (
230232
unescaped_char,
231233
buf.capacity() == 0,
232234
) {
@@ -253,44 +255,18 @@ impl ast::String {
253255
impl IsString for ast::ByteString {
254256
const RAW_PREFIX: &'static str = "br";
255257
const MODE: Mode = Mode::ByteStr;
258+
259+
fn escaped_char_ranges(
260+
&self,
261+
cb: &mut dyn FnMut(TextRange, Result<char, rustc_lexer::unescape::EscapeError>),
262+
) {
263+
escaped_char_ranges_impl(self, cb);
264+
}
256265
}
257266

258267
impl ast::ByteString {
259268
pub fn value(&self) -> Option<Cow<'_, [u8]>> {
260-
if self.is_raw() {
261-
let text = self.text();
262-
let text =
263-
&text[self.text_range_between_quotes()? - self.syntax().text_range().start()];
264-
return Some(Cow::Borrowed(text.as_bytes()));
265-
}
266-
267-
let text = self.text();
268-
let text = &text[self.text_range_between_quotes()? - self.syntax().text_range().start()];
269-
270-
let mut buf: Vec<u8> = Vec::new();
271-
let mut prev_end = 0;
272-
let mut has_error = false;
273-
unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match (
274-
unescaped_char,
275-
buf.capacity() == 0,
276-
) {
277-
(Ok(c), false) => buf.push(c as u8),
278-
(Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => {
279-
prev_end = char_range.end
280-
}
281-
(Ok(c), true) => {
282-
buf.reserve_exact(text.len());
283-
buf.extend_from_slice(text[..prev_end].as_bytes());
284-
buf.push(c as u8);
285-
}
286-
(Err(_), _) => has_error = true,
287-
});
288-
289-
match (has_error, buf.capacity() == 0) {
290-
(true, _) => None,
291-
(false, true) => Some(Cow::Borrowed(text.as_bytes())),
292-
(false, false) => Some(Cow::Owned(buf)),
293-
}
269+
value_impl(self)
294270
}
295271
}
296272

@@ -302,65 +278,13 @@ impl IsString for ast::CString {
302278
&self,
303279
cb: &mut dyn FnMut(TextRange, Result<char, rustc_lexer::unescape::EscapeError>),
304280
) {
305-
let text_range_no_quotes = match self.text_range_between_quotes() {
306-
Some(it) => it,
307-
None => return,
308-
};
309-
310-
let start = self.syntax().text_range().start();
311-
let text = &self.text()[text_range_no_quotes - start];
312-
let offset = text_range_no_quotes.start() - start;
313-
314-
unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| {
315-
let text_range =
316-
TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap());
317-
// XXX: This method should only be used for highlighting ranges. The unescaped
318-
// char/byte is not used. For simplicity, we return an arbitrary placeholder char.
319-
cb(text_range + offset, unescaped_char.map(|_| ' '));
320-
});
281+
escaped_char_ranges_impl(self, cb);
321282
}
322283
}
323284

324285
impl ast::CString {
325286
pub fn value(&self) -> Option<Cow<'_, [u8]>> {
326-
if self.is_raw() {
327-
let text = self.text();
328-
let text =
329-
&text[self.text_range_between_quotes()? - self.syntax().text_range().start()];
330-
return Some(Cow::Borrowed(text.as_bytes()));
331-
}
332-
333-
let text = self.text();
334-
let text = &text[self.text_range_between_quotes()? - self.syntax().text_range().start()];
335-
336-
let mut buf = Vec::new();
337-
let mut prev_end = 0;
338-
let mut has_error = false;
339-
let extend_unit = |buf: &mut Vec<u8>, unit: MixedUnit| match unit {
340-
MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()),
341-
MixedUnit::HighByte(b) => buf.push(b),
342-
};
343-
unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match (
344-
unescaped,
345-
buf.capacity() == 0,
346-
) {
347-
(Ok(u), false) => extend_unit(&mut buf, u),
348-
(Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => {
349-
prev_end = char_range.end
350-
}
351-
(Ok(u), true) => {
352-
buf.reserve_exact(text.len());
353-
buf.extend(text[..prev_end].as_bytes());
354-
extend_unit(&mut buf, u);
355-
}
356-
(Err(_), _) => has_error = true,
357-
});
358-
359-
match (has_error, buf.capacity() == 0) {
360-
(true, _) => None,
361-
(false, true) => Some(Cow::Borrowed(text.as_bytes())),
362-
(false, false) => Some(Cow::Owned(buf)),
363-
}
287+
value_impl(self)
364288
}
365289
}
366290

@@ -457,6 +381,71 @@ impl ast::FloatNumber {
457381
}
458382
}
459383

384+
/// Walks the contents between the quotes of `this` and reports every unit the
/// unescaper finds to `cb`.
///
/// Each range handed to `cb` is shifted by the distance between the start of the
/// token and the start of the quoted contents, i.e. it is relative to the token's
/// own start. The `Result` passed to `cb` only carries success or the
/// `EscapeError`; on success the payload is always the placeholder `' '`.
///
/// Does nothing when `text_range_between_quotes` yields `None`.
fn escaped_char_ranges_impl<I: IsString>(
    this: &I,
    cb: &mut dyn FnMut(TextRange, Result<char, rustc_lexer::unescape::EscapeError>),
) {
    let Some(inner_range) = this.text_range_between_quotes() else {
        return;
    };

    let token_start = this.syntax().text_range().start();
    let contents = &this.text()[inner_range - token_start];
    // Distance from the token start to the first byte after the opening quote.
    let offset = inner_range.start() - token_start;

    // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals.
    _ = unescape_mixed(contents, I::MODE, &mut |range, unescaped| {
        // The unescaper reports `usize` byte ranges; the conversion panics if a
        // bound does not fit the target text-size type.
        let reported =
            TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap());
        // XXX: This method should only be used for highlighting ranges. The unescaped
        // char/byte is not used. For simplicity, we return an arbitrary placeholder char.
        let placeholder = unescaped.map(|_| ' ');
        cb(reported + offset, placeholder);
    });
}
406+
407+
fn value_impl<I: IsString>(this: &I) -> Option<Cow<'_, [u8]>> {
408+
if this.is_raw() {
409+
let text = this.text();
410+
let text =
411+
&text[this.text_range_between_quotes()? - this.syntax().text_range().start()];
412+
return Some(Cow::Borrowed(text.as_bytes()));
413+
}
414+
415+
let text = this.text();
416+
let text = &text[this.text_range_between_quotes()? - this.syntax().text_range().start()];
417+
418+
let mut buf: Vec<u8> = Vec::new();
419+
let mut prev_end = 0;
420+
let mut has_error = false;
421+
let extend_unit = |buf: &mut Vec<u8>, unit: MixedUnit| match unit {
422+
MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()),
423+
MixedUnit::HighByte(b) => buf.push(b),
424+
};
425+
// Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals.
426+
_ = unescape_mixed(text, I::MODE, &mut |char_range, unescaped_char| match (
427+
unescaped_char,
428+
buf.capacity() == 0,
429+
) {
430+
(Ok(u), false) => extend_unit(&mut buf, u),
431+
(Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => {
432+
prev_end = char_range.end
433+
}
434+
(Ok(u), true) => {
435+
buf.reserve_exact(text.len());
436+
buf.extend(text[..prev_end].as_bytes());
437+
extend_unit(&mut buf, u);
438+
}
439+
(Err(_), _) => has_error = true,
440+
});
441+
442+
match (has_error, buf.capacity() == 0) {
443+
(true, _) => None,
444+
(false, true) => Some(Cow::Borrowed(text.as_bytes())),
445+
(false, false) => Some(Cow::Owned(buf)),
446+
}
447+
}
448+
460449
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
461450
pub enum Radix {
462451
Binary = 2,

‎src/tools/rust-analyzer/crates/syntax/src/validation.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -136,11 +136,13 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
136136
}
137137
};
138138

139+
// Ignores the `Rfc3349` return value from the `unescape_*` functions, thus
140+
// permitting mixed utf8 literals.
139141
match literal.kind() {
140142
ast::LiteralKind::String(s) => {
141143
if !s.is_raw() {
142144
if let Some(without_quotes) = unquote(text, 1, '"') {
143-
unescape_unicode(without_quotes, Mode::Str, &mut |range, char| {
145+
_ = unescape_unicode(without_quotes, Mode::Str, &mut |range, char| {
144146
if let Err(err) = char {
145147
push_err(1, range.start, err);
146148
}
@@ -151,7 +153,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
151153
ast::LiteralKind::ByteString(s) => {
152154
if !s.is_raw() {
153155
if let Some(without_quotes) = unquote(text, 2, '"') {
154-
unescape_unicode(without_quotes, Mode::ByteStr, &mut |range, char| {
156+
_ = unescape_mixed(without_quotes, Mode::ByteStr, &mut |range, char| {
155157
if let Err(err) = char {
156158
push_err(1, range.start, err);
157159
}
@@ -162,7 +164,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
162164
ast::LiteralKind::CString(s) => {
163165
if !s.is_raw() {
164166
if let Some(without_quotes) = unquote(text, 2, '"') {
165-
unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| {
167+
_ = unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| {
166168
if let Err(err) = char {
167169
push_err(1, range.start, err);
168170
}
@@ -172,7 +174,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
172174
}
173175
ast::LiteralKind::Char(_) => {
174176
if let Some(without_quotes) = unquote(text, 1, '\'') {
175-
unescape_unicode(without_quotes, Mode::Char, &mut |range, char| {
177+
_ = unescape_unicode(without_quotes, Mode::Char, &mut |range, char| {
176178
if let Err(err) = char {
177179
push_err(1, range.start, err);
178180
}
@@ -181,7 +183,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
181183
}
182184
ast::LiteralKind::Byte(_) => {
183185
if let Some(without_quotes) = unquote(text, 2, '\'') {
184-
unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| {
186+
_ = unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| {
185187
if let Err(err) = char {
186188
push_err(2, range.start, err);
187189
}

‎src/tools/tidy/src/ui_tests.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use std::path::{Path, PathBuf};
1111
const ENTRY_LIMIT: usize = 900;
1212
// FIXME: The following limits should be reduced eventually.
1313
const ISSUES_ENTRY_LIMIT: usize = 1849;
14-
const ROOT_ENTRY_LIMIT: usize = 870;
14+
const ROOT_ENTRY_LIMIT: usize = 871;
1515

1616
const EXPECTED_TEST_FILE_EXTENSIONS: &[&str] = &[
1717
"rs", // test source files
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
#![feature(rustc_attrs)]
22

3-
#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte string literal
3+
#[rustc_dummy = b'ffi'] //~ ERROR non-ASCII character in byte literal
44
fn main() {}
Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
error: non-ASCII character in byte string literal
1+
error: non-ASCII character in byte literal
22
--> $DIR/key-value-non-ascii.rs:3:19
33
|
4-
LL | #[rustc_dummy = b"ffi.rs"]
5-
| ^ must be ASCII
6-
|
7-
help: if you meant to use the UTF-8 encoding of 'ffi', use \xHH escapes
8-
|
9-
LL | #[rustc_dummy = b"/xEF/xAC/x83.rs"]
10-
| ~~~~~~~~~~~~
4+
LL | #[rustc_dummy = b'ffi']
5+
| ^
6+
| |
7+
| must be ASCII
8+
| this multibyte character does not fit into a single byte
119

1210
error: aborting due to 1 previous error
1311

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
fn main() {
2+
_ = b"a¥🦀"; //~ ERROR mixed utf8
3+
_ = br"a¥🦀"; //~ ERROR mixed utf8
4+
_ = b"a\u{a5}\u{1f980}"; //~ ERROR mixed utf8
5+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
error[E0658]: mixed utf8 b"" and br"" literals are experimental
2+
--> $DIR/feature-gate-mixed-utf8-literals.rs:2:9
3+
|
4+
LL | _ = b"a¥🦀";
5+
| ^^^^^^^
6+
|
7+
= note: see issue #116907 <https://github.com/rust-lang/rust/issues/116907> for more information
8+
= help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable
9+
= note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date
10+
11+
error[E0658]: mixed utf8 b"" and br"" literals are experimental
12+
--> $DIR/feature-gate-mixed-utf8-literals.rs:3:9
13+
|
14+
LL | _ = br"a¥🦀";
15+
| ^^^^^^^^
16+
|
17+
= note: see issue #116907 <https://github.com/rust-lang/rust/issues/116907> for more information
18+
= help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable
19+
= note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date
20+
21+
error[E0658]: mixed utf8 b"" and br"" literals are experimental
22+
--> $DIR/feature-gate-mixed-utf8-literals.rs:4:9
23+
|
24+
LL | _ = b"a\u{a5}\u{1f980}";
25+
| ^^^^^^^^^^^^^^^^^^^
26+
|
27+
= note: see issue #116907 <https://github.com/rust-lang/rust/issues/116907> for more information
28+
= help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable
29+
= note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date
30+
31+
error: aborting due to 3 previous errors
32+
33+
For more information about this error, try `rustc --explain E0658`.

‎tests/ui/mixed-utf8-literals/basic.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// check-pass
2+
3+
#![feature(mixed_utf8_literals)]
4+
5+
fn main() {
6+
b"a¥🦀";
7+
b"é";
8+
b"字";
9+
10+
br"a¥🦀";
11+
br"é";
12+
br##"é"##;
13+
14+
b"\u{a66e}";
15+
b"a\u{a5}\u{1f980}";
16+
b"\u{a4a4}";
17+
18+
b"hello\xff我叫\u{1F980}";
19+
}

‎tests/ui/parser/byte-string-literals.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,5 @@ static FOO: &'static [u8] = b"\f"; //~ ERROR unknown byte escape
33
pub fn main() {
44
b"\f"; //~ ERROR unknown byte escape
55
b"\x0Z"; //~ ERROR invalid character in numeric character escape: `Z`
6-
b"é"; //~ ERROR non-ASCII character in byte string literal
7-
br##"é"##; //~ ERROR non-ASCII character in raw byte string literal
86
b"a //~ ERROR unterminated double quote byte string
97
}

‎tests/ui/parser/byte-string-literals.stderr

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,31 +20,14 @@ error: invalid character in numeric character escape: `Z`
2020
LL | b"\x0Z";
2121
| ^ invalid character in numeric character escape
2222

23-
error: non-ASCII character in byte string literal
24-
--> $DIR/byte-string-literals.rs:6:7
25-
|
26-
LL | b"é";
27-
| ^ must be ASCII
28-
|
29-
help: if you meant to use the unicode code point for 'é', use a \xHH escape
30-
|
31-
LL | b"\xE9";
32-
| ~~~~
33-
34-
error: non-ASCII character in raw byte string literal
35-
--> $DIR/byte-string-literals.rs:7:10
36-
|
37-
LL | br##"é"##;
38-
| ^ must be ASCII
39-
4023
error[E0766]: unterminated double quote byte string
41-
--> $DIR/byte-string-literals.rs:8:6
24+
--> $DIR/byte-string-literals.rs:6:6
4225
|
4326
LL | b"a
4427
| ______^
4528
LL | | }
4629
| |__^
4730

48-
error: aborting due to 6 previous errors
31+
error: aborting due to 4 previous errors
4932

5033
For more information about this error, try `rustc --explain E0766`.

‎tests/ui/parser/issues/issue-23620-invalid-escapes.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
11
fn main() {
2-
let _ = b"\u{a66e}";
3-
//~^ ERROR unicode escape in byte string
4-
52
let _ = b'\u{a66e}';
63
//~^ ERROR unicode escape in byte string
74

@@ -20,10 +17,9 @@ fn main() {
2017
let _ = '\xxy';
2118
//~^ ERROR invalid character in numeric character escape: `x`
2219

23-
let _ = b"\u{a4a4} \xf \u";
24-
//~^ ERROR unicode escape in byte string
25-
//~^^ ERROR invalid character in numeric character escape: ` `
26-
//~^^^ ERROR incorrect unicode escape sequence
20+
let _ = b"\xf \u";
21+
//~^ ERROR invalid character in numeric character escape: ` `
22+
//~^^ ERROR incorrect unicode escape sequence
2723

2824
let _ = "\xf \u";
2925
//~^ ERROR invalid character in numeric character escape: ` `
Lines changed: 15 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,94 +1,78 @@
11
error: unicode escape in byte string
22
--> $DIR/issue-23620-invalid-escapes.rs:2:15
33
|
4-
LL | let _ = b"\u{a66e}";
5-
| ^^^^^^^^ unicode escape in byte string
6-
|
7-
= help: unicode escape sequences cannot be used as a byte or in a byte string
8-
9-
error: unicode escape in byte string
10-
--> $DIR/issue-23620-invalid-escapes.rs:5:15
11-
|
124
LL | let _ = b'\u{a66e}';
135
| ^^^^^^^^ unicode escape in byte string
146
|
157
= help: unicode escape sequences cannot be used as a byte or in a byte string
168

179
error: incorrect unicode escape sequence
18-
--> $DIR/issue-23620-invalid-escapes.rs:8:15
10+
--> $DIR/issue-23620-invalid-escapes.rs:5:15
1911
|
2012
LL | let _ = b'\u';
2113
| ^^ incorrect unicode escape sequence
2214
|
2315
= help: format of unicode escape sequences is `\u{...}`
2416

2517
error: numeric character escape is too short
26-
--> $DIR/issue-23620-invalid-escapes.rs:11:15
18+
--> $DIR/issue-23620-invalid-escapes.rs:8:15
2719
|
2820
LL | let _ = b'\x5';
2921
| ^^^
3022

3123
error: invalid character in numeric character escape: `x`
32-
--> $DIR/issue-23620-invalid-escapes.rs:14:17
24+
--> $DIR/issue-23620-invalid-escapes.rs:11:17
3325
|
3426
LL | let _ = b'\xxy';
3527
| ^ invalid character in numeric character escape
3628

3729
error: numeric character escape is too short
38-
--> $DIR/issue-23620-invalid-escapes.rs:17:14
30+
--> $DIR/issue-23620-invalid-escapes.rs:14:14
3931
|
4032
LL | let _ = '\x5';
4133
| ^^^
4234

4335
error: invalid character in numeric character escape: `x`
44-
--> $DIR/issue-23620-invalid-escapes.rs:20:16
36+
--> $DIR/issue-23620-invalid-escapes.rs:17:16
4537
|
4638
LL | let _ = '\xxy';
4739
| ^ invalid character in numeric character escape
4840

49-
error: unicode escape in byte string
50-
--> $DIR/issue-23620-invalid-escapes.rs:23:15
51-
|
52-
LL | let _ = b"\u{a4a4} \xf \u";
53-
| ^^^^^^^^ unicode escape in byte string
54-
|
55-
= help: unicode escape sequences cannot be used as a byte or in a byte string
56-
5741
error: invalid character in numeric character escape: ` `
58-
--> $DIR/issue-23620-invalid-escapes.rs:23:27
42+
--> $DIR/issue-23620-invalid-escapes.rs:20:18
5943
|
60-
LL | let _ = b"\u{a4a4} \xf \u";
61-
| ^ invalid character in numeric character escape
44+
LL | let _ = b"\xf \u";
45+
| ^ invalid character in numeric character escape
6246

6347
error: incorrect unicode escape sequence
64-
--> $DIR/issue-23620-invalid-escapes.rs:23:28
48+
--> $DIR/issue-23620-invalid-escapes.rs:20:19
6549
|
66-
LL | let _ = b"\u{a4a4} \xf \u";
67-
| ^^ incorrect unicode escape sequence
50+
LL | let _ = b"\xf \u";
51+
| ^^ incorrect unicode escape sequence
6852
|
6953
= help: format of unicode escape sequences is `\u{...}`
7054

7155
error: invalid character in numeric character escape: ` `
72-
--> $DIR/issue-23620-invalid-escapes.rs:28:17
56+
--> $DIR/issue-23620-invalid-escapes.rs:24:17
7357
|
7458
LL | let _ = "\xf \u";
7559
| ^ invalid character in numeric character escape
7660

7761
error: incorrect unicode escape sequence
78-
--> $DIR/issue-23620-invalid-escapes.rs:28:18
62+
--> $DIR/issue-23620-invalid-escapes.rs:24:18
7963
|
8064
LL | let _ = "\xf \u";
8165
| ^^ incorrect unicode escape sequence
8266
|
8367
= help: format of unicode escape sequences is `\u{...}`
8468

8569
error: incorrect unicode escape sequence
86-
--> $DIR/issue-23620-invalid-escapes.rs:32:14
70+
--> $DIR/issue-23620-invalid-escapes.rs:28:14
8771
|
8872
LL | let _ = "\u8f";
8973
| ^^^-
9074
| |
9175
| help: format of unicode escape sequences uses braces: `\u{8f}`
9276

93-
error: aborting due to 13 previous errors
77+
error: aborting due to 11 previous errors
9478

‎tests/ui/parser/raw/raw-byte-string-literals.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,5 @@
22

33
pub fn main() {
44
br"a"; //~ ERROR bare CR not allowed in raw string
5-
br"é"; //~ ERROR non-ASCII character in raw byte string literal
65
br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation
76
}

‎tests/ui/parser/raw/raw-byte-string-literals.stderr

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,11 @@ error: bare CR not allowed in raw string
44
LL | br"a";
55
| ^
66

7-
error: non-ASCII character in raw byte string literal
8-
--> $DIR/raw-byte-string-literals.rs:5:8
9-
|
10-
LL | br"é";
11-
| ^ must be ASCII
12-
137
error: found invalid character; only `#` is allowed in raw string delimitation: ~
14-
--> $DIR/raw-byte-string-literals.rs:6:5
8+
--> $DIR/raw-byte-string-literals.rs:5:5
159
|
1610
LL | br##~"a"~##;
1711
| ^^^^^
1812

19-
error: aborting due to 3 previous errors
13+
error: aborting due to 2 previous errors
2014

‎tests/ui/parser/unicode-control-codepoints.rs

Lines changed: 3 additions & 10 deletions
This file contains bidirectional or hidden Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@ fn main() {
44
println!("us\u{202B}e\u{202A}r");
55
println!("{:?}", r#"us\u{202B}e\u{202A}r"#);
66
println!("{:?}", b"us\u{202B}e\u{202A}r");
7-
//~^ ERROR unicode escape in byte string
8-
//~| ERROR unicode escape in byte string
7+
//~^ ERROR mixed utf8 b"" and br"" literals are experimental
98
println!("{:?}", br##"us\u{202B}e\u{202A}r"##);
109

1110
println!("{:?}", "/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only ");
@@ -14,15 +13,9 @@ fn main() {
1413
println!("{:?}", r##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##);
1514
//~^ ERROR unicode codepoint changing visible direction of text present in literal
1615
println!("{:?}", b"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only ");
17-
//~^ ERROR non-ASCII character in byte string literal
18-
//~| ERROR non-ASCII character in byte string literal
19-
//~| ERROR non-ASCII character in byte string literal
20-
//~| ERROR non-ASCII character in byte string literal
16+
//~^ ERROR mixed utf8 b"" and br"" literals are experimental
2117
println!("{:?}", br##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##);
22-
//~^ ERROR non-ASCII character in raw byte string literal
23-
//~| ERROR non-ASCII character in raw byte string literal
24-
//~| ERROR non-ASCII character in raw byte string literal
25-
//~| ERROR non-ASCII character in raw byte string literal
18+
//~^ ERROR mixed utf8 b"" and br"" literals are experimental
2619
println!("{:?}", '‮');
2720
//~^ ERROR unicode codepoint changing visible direction of text present in literal
2821
}
Lines changed: 26 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,32 @@
1-
error: unicode escape in byte string
2-
--> $DIR/unicode-control-codepoints.rs:6:26
1+
error[E0658]: mixed utf8 b"" and br"" literals are experimental
2+
--> $DIR/unicode-control-codepoints.rs:6:22
33
|
44
LL | println!("{:?}", b"us\u{202B}e\u{202A}r");
5-
| ^^^^^^^^ unicode escape in byte string
5+
| ^^^^^^^^^^^^^^^^^^^^^^^
66
|
7-
= help: unicode escape sequences cannot be used as a byte or in a byte string
7+
= note: see issue #116907 <https://github.com/rust-lang/rust/issues/116907> for more information
8+
= help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable
9+
= note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date
810

9-
error: unicode escape in byte string
10-
--> $DIR/unicode-control-codepoints.rs:6:35
11-
|
12-
LL | println!("{:?}", b"us\u{202B}e\u{202A}r");
13-
| ^^^^^^^^ unicode escape in byte string
14-
|
15-
= help: unicode escape sequences cannot be used as a byte or in a byte string
16-
17-
error: non-ASCII character in byte string literal
18-
--> $DIR/unicode-control-codepoints.rs:16:26
19-
|
20-
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
21-
| ^ must be ASCII but is '\u{202e}'
22-
|
23-
help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes
24-
|
25-
LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only ");
26-
| ~~~~~~~~~~~~
27-
28-
error: non-ASCII character in byte string literal
29-
--> $DIR/unicode-control-codepoints.rs:16:30
30-
|
31-
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
32-
| ^ must be ASCII but is '\u{2066}'
33-
|
34-
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
35-
|
36-
LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only ");
37-
| ~~~~~~~~~~~~
38-
39-
error: non-ASCII character in byte string literal
40-
--> $DIR/unicode-control-codepoints.rs:16:41
11+
error[E0658]: mixed utf8 b"" and br"" literals are experimental
12+
--> $DIR/unicode-control-codepoints.rs:15:22
4113
|
4214
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
43-
| ^ must be ASCII but is '\u{2069}'
15+
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4416
|
45-
help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes
46-
|
47-
LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only ");
48-
| ~~~~~~~~~~~~
49-
50-
error: non-ASCII character in byte string literal
51-
--> $DIR/unicode-control-codepoints.rs:16:43
52-
|
53-
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
54-
| ^ must be ASCII but is '\u{2066}'
55-
|
56-
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
57-
|
58-
LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
59-
| ~~~~~~~~~~~~
60-
61-
error: non-ASCII character in raw byte string literal
62-
--> $DIR/unicode-control-codepoints.rs:21:29
63-
|
64-
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
65-
| ^ must be ASCII but is '\u{202e}'
17+
= note: see issue #116907 <https://github.com/rust-lang/rust/issues/116907> for more information
18+
= help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable
19+
= note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date
6620

67-
error: non-ASCII character in raw byte string literal
68-
--> $DIR/unicode-control-codepoints.rs:21:33
21+
error[E0658]: mixed utf8 b"" and br"" literals are experimental
22+
--> $DIR/unicode-control-codepoints.rs:17:22
6923
|
7024
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
71-
| ^ must be ASCII but is '\u{2066}'
72-
73-
error: non-ASCII character in raw byte string literal
74-
--> $DIR/unicode-control-codepoints.rs:21:44
25+
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
7526
|
76-
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
77-
| ^ must be ASCII but is '\u{2069}'
78-
79-
error: non-ASCII character in raw byte string literal
80-
--> $DIR/unicode-control-codepoints.rs:21:46
81-
|
82-
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
83-
| ^ must be ASCII but is '\u{2066}'
27+
= note: see issue #116907 <https://github.com/rust-lang/rust/issues/116907> for more information
28+
= help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable
29+
= note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date
8430

8531
error: unicode codepoint changing visible direction of text present in comment
8632
--> $DIR/unicode-control-codepoints.rs:2:5
@@ -97,7 +43,7 @@ LL | // if access_level != "user" { // Check if admin
9743
= help: if their presence wasn't intentional, you can remove them
9844

9945
error: unicode codepoint changing visible direction of text present in comment
100-
--> $DIR/unicode-control-codepoints.rs:30:1
46+
--> $DIR/unicode-control-codepoints.rs:23:1
10147
|
10248
LL | //"/* } if isAdmin begin admins only */"
10349
| ^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
@@ -112,7 +58,7 @@ LL | //"/* } if isAdmin begin admins only */"
11258
= help: if their presence wasn't intentional, you can remove them
11359

11460
error: unicode codepoint changing visible direction of text present in literal
115-
--> $DIR/unicode-control-codepoints.rs:11:22
61+
--> $DIR/unicode-control-codepoints.rs:10:22
11662
|
11763
LL | println!("{:?}", "/* } if isAdmin begin admins only ");
11864
| ^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^
@@ -132,7 +78,7 @@ LL | println!("{:?}", "/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} begi
13278
| ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~
13379

13480
error: unicode codepoint changing visible direction of text present in literal
135-
--> $DIR/unicode-control-codepoints.rs:14:22
81+
--> $DIR/unicode-control-codepoints.rs:13:22
13682
|
13783
LL | println!("{:?}", r##"/* } if isAdmin begin admins only "##);
13884
| ^^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
@@ -151,7 +97,7 @@ LL | println!("{:?}", r##"/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} b
15197
| ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~
15298

15399
error: unicode codepoint changing visible direction of text present in literal
154-
--> $DIR/unicode-control-codepoints.rs:26:22
100+
--> $DIR/unicode-control-codepoints.rs:19:22
155101
|
156102
LL | println!("{:?}", '');
157103
| ^-
@@ -167,7 +113,7 @@ LL | println!("{:?}", '\u{202e}');
167113
| ~~~~~~~~
168114

169115
error: unicode codepoint changing visible direction of text present in doc comment
170-
--> $DIR/unicode-control-codepoints.rs:33:1
116+
--> $DIR/unicode-control-codepoints.rs:26:1
171117
|
172118
LL | /** ''); */fn foo() {}
173119
| ^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint
@@ -177,7 +123,7 @@ LL | /** ''); */fn foo() {}
177123
= note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}'
178124

179125
error: unicode codepoint changing visible direction of text present in doc comment
180-
--> $DIR/unicode-control-codepoints.rs:36:1
126+
--> $DIR/unicode-control-codepoints.rs:29:1
181127
|
182128
LL | / /**
183129
LL | | *
@@ -188,5 +134,6 @@ LL | | * ''); */fn bar() {}
188134
= note: if their presence wasn't intentional, you can remove them
189135
= note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}'
190136

191-
error: aborting due to 17 previous errors
137+
error: aborting due to 10 previous errors
192138

139+
For more information about this error, try `rustc --explain E0658`.

‎tests/ui/suggestions/multibyte-escapes.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,4 @@ fn main() {
1010
//~^ ERROR: non-ASCII character in byte literal
1111
//~| NOTE: this multibyte character does not fit into a single byte
1212
//~| NOTE: must be ASCII
13-
14-
b"字";
15-
//~^ ERROR: non-ASCII character in byte string literal
16-
//~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes
17-
//~| NOTE: must be ASCII
1813
}

‎tests/ui/suggestions/multibyte-escapes.stderr

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,5 @@ LL | b'字';
1818
| must be ASCII
1919
| this multibyte character does not fit into a single byte
2020

21-
error: non-ASCII character in byte string literal
22-
--> $DIR/multibyte-escapes.rs:14:7
23-
|
24-
LL | b"字";
25-
| ^^ must be ASCII
26-
|
27-
help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes
28-
|
29-
LL | b"\xE5\xAD\x97";
30-
| ~~~~~~~~~~~~
31-
32-
error: aborting due to 3 previous errors
21+
error: aborting due to 2 previous errors
3322

0 commit comments

Comments
 (0)
Please sign in to comment.