Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tests for source code with invalid utf-8 bytes #135854

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compiler/rustc_builtin_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#![feature(proc_macro_internals)]
#![feature(proc_macro_quote)]
#![feature(rustdoc_internals)]
#![feature(string_from_utf8_lossy_owned)]
#![feature(try_blocks)]
#![warn(unreachable_pub)]
// tidy-alphabetical-end
Expand Down
11 changes: 6 additions & 5 deletions compiler/rustc_builtin_macros/src/source_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use rustc_expand::base::{
use rustc_expand::module::DirOwnership;
use rustc_lint_defs::BuiltinLintDiag;
use rustc_parse::parser::{ForceCollect, Parser};
use rustc_parse::{new_parser_from_file, unwrap_or_emit_fatal};
use rustc_parse::{new_parser_from_file, unwrap_or_emit_fatal, utf8_error};
use rustc_session::lint::builtin::INCOMPLETE_INCLUDE;
use rustc_span::source_map::SourceMap;
use rustc_span::{Pos, Span, Symbol};
Expand Down Expand Up @@ -209,9 +209,10 @@ pub(crate) fn expand_include_str(
let interned_src = Symbol::intern(src);
MacEager::expr(cx.expr_str(cx.with_def_site_ctxt(bsp), interned_src))
}
Err(_) => {
let guar = cx.dcx().span_err(sp, format!("`{path}` wasn't a utf-8 file"));
DummyResult::any(sp, guar)
Err(utf8err) => {
let mut err = cx.dcx().struct_span_err(sp, format!("`{path}` wasn't a utf-8 file"));
utf8_error(cx.source_map(), path.as_str(), None, &mut err, utf8err, &bytes[..]);
DummyResult::any(sp, err.emit())
}
},
Err(dummy) => dummy,
Expand Down Expand Up @@ -273,7 +274,7 @@ fn load_binary_file(
.and_then(|path| path.into_os_string().into_string().ok());

if let Some(new_path) = new_path {
err.span_suggestion(
err.span_suggestion_verbose(
path_span,
"there is a file with the same name in a different directory",
format!("\"{}\"", new_path.replace('\\', "/").escape_debug()),
Expand Down
67 changes: 63 additions & 4 deletions compiler/rustc_parse/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,21 @@
#![feature(if_let_guard)]
#![feature(iter_intersperse)]
#![feature(let_chains)]
#![feature(string_from_utf8_lossy_owned)]
#![warn(unreachable_pub)]
// tidy-alphabetical-end

use std::path::Path;
use std::path::{Path, PathBuf};
use std::str::Utf8Error;

use rustc_ast as ast;
use rustc_ast::tokenstream::TokenStream;
use rustc_ast::{AttrItem, Attribute, MetaItemInner, token};
use rustc_ast_pretty::pprust;
use rustc_data_structures::sync::Lrc;
use rustc_errors::{Diag, FatalError, PResult};
use rustc_errors::{Diag, EmissionGuarantee, FatalError, PResult, pluralize};
use rustc_session::parse::ParseSess;
use rustc_span::source_map::SourceMap;
use rustc_span::{FileName, SourceFile, Span};
pub use unicode_normalization::UNICODE_VERSION as UNICODE_NORMALIZATION_VERSION;

Expand Down Expand Up @@ -73,9 +76,22 @@ pub fn new_parser_from_file<'a>(
path: &Path,
sp: Option<Span>,
) -> Result<Parser<'a>, Vec<Diag<'a>>> {
let source_file = psess.source_map().load_file(path).unwrap_or_else(|e| {
let msg = format!("couldn't read {}: {}", path.display(), e);
let sm = psess.source_map();
let source_file = sm.load_file(path).unwrap_or_else(|e| {
let msg = format!("couldn't read `{}`: {}", path.display(), e);
let mut err = psess.dcx().struct_fatal(msg);
if let Ok(contents) = std::fs::read(path)
&& let Err(utf8err) = String::from_utf8(contents.clone())
{
utf8_error(
sm,
&path.display().to_string(),
sp,
&mut err,
utf8err.utf8_error(),
&contents,
);
}
if let Some(sp) = sp {
err.span(sp);
}
Expand All @@ -84,6 +100,49 @@ pub fn new_parser_from_file<'a>(
new_parser_from_source_file(psess, source_file)
}

/// Annotates `err` with the location of the first invalid UTF-8 sequence in `contents`.
///
/// `contents` are the raw bytes associated with `path`, and `utf8err` is the error obtained
/// from validating them as UTF-8. The bytes are converted lossily and registered with `sm` as
/// a new source file so the diagnostic can point at the offending position.
///
/// How the position is reported:
/// * if the computed span is a dummy span, a plain note carrying the byte offset is added;
/// * if `sp` is `Some`, the position is attached as a separate spanned note;
/// * otherwise the position becomes the span of `err` itself and is labelled.
pub fn utf8_error<E: EmissionGuarantee>(
    sm: &SourceMap,
    path: &str,
    sp: Option<Span>,
    err: &mut Diag<'_, E>,
    utf8err: Utf8Error,
    contents: &[u8],
) {
    // The file exists, but it wasn't valid UTF-8.
    let start = utf8err.valid_up_to();
    let note = format!("invalid utf-8 at byte `{start}`");

    // The offending bytes, when the error reports a length. `get` keeps this panic-free even
    // if `contents` does not correspond to `utf8err`; in that case we fall back to `note`.
    let invalid = utf8err.error_len().and_then(|len| contents.get(start..start.saturating_add(len)));
    let msg = match invalid {
        Some(bytes) => format!(
            "byte{s} `{shown}` {are} not valid utf-8",
            // A single byte is shown bare (`193`), several as a list (`[193, 200]`).
            shown = match bytes {
                [byte] => format!("{byte:?}"),
                _ => format!("{bytes:?}"),
            },
            s = pluralize!(bytes.len()),
            are = if bytes.len() == 1 { "is" } else { "are" },
        ),
        None => note.clone(),
    };

    // Everything before `start` is valid UTF-8, so the lossy conversion leaves the first
    // `start` bytes untouched. `into_owned` reuses the buffer when the `Cow` is already owned.
    let lossy = String::from_utf8_lossy(contents).into_owned();
    let source = sm.new_source_file(PathBuf::from(path).into(), lossy);
    // NOTE(review): `start as u32` assumes the offset fits in 32 bits — confirm the source map
    // rejects files large enough to violate this.
    let pos = source.normalized_byte_pos(start as u32);
    let span = Span::with_root_ctxt(pos, pos);

    if span.is_dummy() {
        err.note(note);
    } else if sp.is_some() {
        err.span_note(span, msg);
    } else {
        err.span(span);
        err.span_label(span, msg);
    }
}

/// Given a session and a `source_file`, return a parser. Returns any buffered errors from lexing
/// the initial token stream.
fn new_parser_from_source_file(
Expand Down
2 changes: 2 additions & 0 deletions src/tools/compiletest/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ pub fn load_errors(testfile: &Path, revision: Option<&str>) -> Vec<Error> {

rdr.lines()
.enumerate()
// We want to ignore utf-8 failures in tests during collection of annotations.
.filter(|(_, line)| line.is_ok())
.filter_map(|(line_num, line)| {
parse_expected(last_nonfollow_error, line_num + 1, &line.unwrap(), revision).map(
|(which, error)| {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ pub fn check(tests_path: impl AsRef<Path>, bad: &mut bool) {

let mut expected_revisions = BTreeSet::new();

let contents = std::fs::read_to_string(test).unwrap();
let Ok(contents) = std::fs::read_to_string(test) else { continue };

// Collect directives.
iter_header(&contents, &mut |HeaderLine { revision, directive, .. }| {
Expand Down
7 changes: 7 additions & 0 deletions tests/ui/macros/not-utf8-2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
//@ error-pattern: did not contain valid UTF-8
//@ reference: input.encoding.utf8
//@ reference: input.encoding.invalid

fn foo() {
include!("not-utf8-bin-file.rs");
}
15 changes: 15 additions & 0 deletions tests/ui/macros/not-utf8-2.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
error: couldn't read `$DIR/not-utf8-bin-file.rs`: stream did not contain valid UTF-8
--> $DIR/not-utf8-2.rs:6:5
|
LL | include!("not-utf8-bin-file.rs");
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
note: byte `193` is not valid utf-8
--> $DIR/not-utf8-bin-file.rs:2:14
|
LL | let _ = "�|�␂!5�cc␕␂��";
| ^
= note: this error originates in the macro `include` (in Nightly builds, run with -Z macro-backtrace for more info)

error: aborting due to 1 previous error

4 changes: 4 additions & 0 deletions tests/ui/macros/not-utf8-bin-file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
fn main() {
let _ = "Á|Õ!5¢ccŒÓ";
//~^ ERROR stream did not contain valid UTF-8
}
8 changes: 8 additions & 0 deletions tests/ui/macros/not-utf8-bin-file.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
error: couldn't read `$DIR/not-utf8-bin-file.rs`: stream did not contain valid UTF-8
--> $DIR/not-utf8-bin-file.rs:2:14
|
LL | let _ = "�|�␂!5�cc␕␂��";
| ^ byte `193` is not valid utf-8

error: aborting due to 1 previous error

2 changes: 1 addition & 1 deletion tests/ui/macros/not-utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
//@ reference: input.encoding.invalid

fn foo() {
include!("not-utf8.bin")
include!("not-utf8.bin");
}
9 changes: 7 additions & 2 deletions tests/ui/macros/not-utf8.stderr
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
error: couldn't read $DIR/not-utf8.bin: stream did not contain valid UTF-8
error: couldn't read `$DIR/not-utf8.bin`: stream did not contain valid UTF-8
--> $DIR/not-utf8.rs:6:5
|
LL | include!("not-utf8.bin")
LL | include!("not-utf8.bin");
| ^^^^^^^^^^^^^^^^^^^^^^^^
|
note: byte `193` is not valid utf-8
--> $DIR/not-utf8.bin:1:1
|
LL | �|�␂!5�cc␕␂�Ӻi��WWj�ȥ�'�}�␒�J�ȉ��W�␞O�@����␜w�V���LO����␔[ ␃_�'���SQ�~ذ��ų&��- ��lN~��!@␌ _#���kQ��h�␝�:�...
| ^
= note: this error originates in the macro `include` (in Nightly builds, run with -Z macro-backtrace for more info)

error: aborting due to 1 previous error
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/modules/path-no-file-name.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//@ normalize-stderr: "\.:.*\(" -> ".: $$ACCESS_DENIED_MSG ("
//@ normalize-stderr: "\.`:.*\(" -> ".`: $$ACCESS_DENIED_MSG ("
//@ normalize-stderr: "os error \d+" -> "os error $$ACCESS_DENIED_CODE"

#[path = "."]
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/modules/path-no-file-name.stderr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
error: couldn't read $DIR/.: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE)
error: couldn't read `$DIR/.`: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE)
--> $DIR/path-no-file-name.rs:5:1
|
LL | mod m;
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/parser/issues/issue-5806.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//@ normalize-stderr: "parser:.*\(" -> "parser: $$ACCESS_DENIED_MSG ("
//@ normalize-stderr: "parser`:.*\(" -> "parser`: $$ACCESS_DENIED_MSG ("
//@ normalize-stderr: "os error \d+" -> "os error $$ACCESS_DENIED_CODE"

#[path = "../parser"]
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/parser/issues/issue-5806.stderr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
error: couldn't read $DIR/../parser: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE)
error: couldn't read `$DIR/../parser`: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE)
--> $DIR/issue-5806.rs:5:1
|
LL | mod foo;
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/parser/mod_file_with_path_attr.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//@ normalize-stderr: "not_a_real_file.rs:.*\(" -> "not_a_real_file.rs: $$FILE_NOT_FOUND_MSG ("
//@ normalize-stderr: "not_a_real_file.rs`:.*\(" -> "not_a_real_file.rs`: $$FILE_NOT_FOUND_MSG ("

#[path = "not_a_real_file.rs"]
mod m; //~ ERROR not_a_real_file.rs
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/parser/mod_file_with_path_attr.stderr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
error: couldn't read $DIR/not_a_real_file.rs: $FILE_NOT_FOUND_MSG (os error 2)
error: couldn't read `$DIR/not_a_real_file.rs`: $FILE_NOT_FOUND_MSG (os error 2)
--> $DIR/mod_file_with_path_attr.rs:4:1
|
LL | mod m;
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/unpretty/staged-api-invalid-path-108697.stderr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
error: couldn't read $DIR/lol: No such file or directory (os error 2)
error: couldn't read `$DIR/lol`: No such file or directory (os error 2)
--> $DIR/staged-api-invalid-path-108697.rs:8:1
|
LL | mod foo;
Expand Down
Loading