-
Notifications
You must be signed in to change notification settings - Fork 15.7k
Open
Labels
Description
After commit 4109bac3301e ("[IR] Do not store Function inside BlockAddress (#137958)"), I am seeing a (leftover?) dead jump in radeon_bo_list_validate() when building the Linux kernel, which causes the kernel's objtool to complain about an unreachable instruction.
Full radeon_object.i
$ install/llvm-good/bin/clang --version | head -1
ClangBuiltLinux clang version 21.0.0git (https://github.com/llvm/llvm-project.git ff28e1a5a92da380c2869aba09971687c26d2f0f)
$ install/llvm-good/bin/clang --target=x86_64-linux-gnu -O2 -w -c -o good-radeon_object.o radeon_object.i
$ build/linux/good/tools/objtool/objtool --uaccess good-radeon_object.o
$ install/llvm-bad/bin/clang --version | head -1
ClangBuiltLinux clang version 21.0.0git (https://github.com/llvm/llvm-project.git 4109bac3301eb7b7033eec3c8e8107be8cad9bc9)
$ install/llvm-bad/bin/clang --target=x86_64-linux-gnu -O2 -w -c -o bad-radeon_object.o radeon_object.i
$ build/linux/good/tools/objtool/objtool --uaccess bad-radeon_object.o
bad-radeon_object.o: warning: objtool: radeon_bo_list_validate+0x228: unreachable instruction
The diff of llvm-objdump -dr --disassemble-symbols=radeon_bo_list_validate between the good and bad object files:
diff --git a/tmp/.psub.1YIAH3 b/tmp/.psub.4rVRuG
index 2773db7..b49bcc1 100644
--- a/tmp/.psub.1YIAH3
+++ b/tmp/.psub.4rVRuG
@@ -1,5 +1,5 @@
-good-radeon_object.o: file format elf64-x86-64
+bad-radeon_object.o: file format elf64-x86-64
Disassembly of section .text:
@@ -157,4 +157,5 @@ Disassembly of section .text:
ce4: 41 5f popq %r15
ce6: 5d popq %rbp
ce7: c3 retq
- ce8: 0f 1f 84 00 00 00 00 00 nopl (%rax,%rax)
+ ce8: e9 41 fe ff ff jmp 0xb2e <radeon_bo_list_validate+0x6e>
+ ced: 0f 1f 00 nopl (%rax)
However, the IR does not appear to differ in any meaningful way (only a basic block name and the compiler version string change):
diff --git a/build/linux/good/drivers/gpu/drm/radeon/radeon_object.ll b/build/linux/bad/drivers/gpu/drm/radeon/radeon_object.ll
index cee5120..5ed5015 100644
--- a/build/linux/good/drivers/gpu/drm/radeon/radeon_object.ll
+++ b/build/linux/bad/drivers/gpu/drm/radeon/radeon_object.ll
@@ -1111,7 +1111,7 @@ for.cond3.preheader.lr.ph.lr.ph: ; preds = %entry
%contended.i = getelementptr i8, ptr %exec, i64 48
br label %for.cond3.preheader
-__drm_exec_476.loopexit: ; preds = %for.body5
+__drm_exec_476: ; preds = %for.body5
%call1147 = tail call zeroext i1 @drm_exec_cleanup(ptr noundef %exec) #11
br i1 %call1147, label %for.cond3.preheader.backedge, label %for.cond32.preheader
@@ -1119,7 +1119,7 @@ for.cond.loopexit: ; preds = %for.inc, %for.cond3
%call1 = tail call zeroext i1 @drm_exec_cleanup(ptr noundef %exec) #11
br i1 %call1, label %for.cond3.preheader.backedge, label %for.cond32.preheader
-for.cond3.preheader.backedge: ; preds = %for.cond.loopexit, %__drm_exec_476.loopexit
+for.cond3.preheader.backedge: ; preds = %for.cond.loopexit, %__drm_exec_476
br label %for.cond3.preheader, !llvm.loop !17
for.cond3.preheader: ; preds = %for.cond3.preheader.backedge, %for.cond3.preheader.lr.ph.lr.ph
@@ -1127,7 +1127,7 @@ for.cond3.preheader: ; preds = %for.cond3.preheader
%cmp.i.not145 = icmp eq ptr %.pn129144, %head
br i1 %cmp.i.not145, label %for.cond.loopexit, label %for.body5
-for.cond32.preheader: ; preds = %__drm_exec_476.loopexit, %for.cond.loopexit, %entry
+for.cond32.preheader: ; preds = %__drm_exec_476, %for.cond.loopexit, %entry
%.pn149 = load ptr, ptr %head, align 8
%cmp.i136.not150 = icmp eq ptr %.pn149, %head
br i1 %cmp.i136.not150, label %return, label %for.body38.lr.ph
@@ -1145,7 +1145,7 @@ for.body5: ; preds = %for.cond3.preheader
%call6 = tail call i32 @drm_exec_prepare_obj(ptr noundef %exec, ptr noundef %tbo, i32 noundef 1) #11
%3 = load ptr, ptr %contended.i, align 8
%tobool.i.not = icmp eq ptr %3, null
- br i1 %tobool.i.not, label %do.end, label %__drm_exec_476.loopexit, !prof !5
+ br i1 %tobool.i.not, label %do.end, label %__drm_exec_476, !prof !5
do.end: ; preds = %for.body5
switch i32 %call6, label %return [
@@ -1941,7 +1941,7 @@ attributes #15 = { cold noredzone nounwind "no-builtin-wcslen" }
!1 = !{i32 1, !"Code Model", i32 2}
!2 = !{i32 1, !"override-stack-alignment", i32 8}
!3 = !{i32 4, !"SkipRaxSetup", i32 1}
-!4 = !{!"ClangBuiltLinux clang version 21.0.0git (https://github.com/llvm/llvm-project.git ff28e1a5a92da380c2869aba09971687c26d2f0f)"}
+!4 = !{!"ClangBuiltLinux clang version 21.0.0git (https://github.com/llvm/llvm-project.git 4109bac3301eb7b7033eec3c8e8107be8cad9bc9)"}
!5 = !{!"branch_weights", !"expected", i32 2000, i32 1}
!6 = !{i64 2158211981, i64 2158212007, i64 2158212327, i64 2158212382, i64 2158212434, i64 2158212486, i64 2158212087, i64 2158212119, i64 2158212193, i64 2158212248, i64 2158212273, i64 2158212298, i64 2158212322}
!7 = distinct !{!7, !8}
I was able to reduce this down to the following test case with cvise:
/*
 * Automatically reduced test case. Every declaration and statement is kept
 * exactly as reduced; only comments have been added. The code relies on GNU C
 * extensions (empty structs, statement expressions, label addresses and
 * computed goto), so it is not valid ISO C.
 */
/* In this reduction u64 is a plain long. */
typedef long u64;
/* Both structs are empty. */
struct ttm_placement {};
struct ttm_buffer_object {};
/* Tested as the outer loop condition below and passed to drm_exec_is_contended(). */
int *radeon_bo_list_validate_exec;
struct {
struct ttm_placement placement;
struct ttm_buffer_object tbo;
} radeon_bo_list_validate_bo;
_Bool radeon_bo_list_validate___trans_tmp_1;
u64 radeon_bo_list_validate_bytes_moved = 0,
radeon_bo_list_validate_initial_bytes_moved,
radeon_bo_list_validate_bytes_moved_threshold = 0;
/* Anonymous struct; the lone ';' is an empty member declaration left by the reducer. */
struct {
;
unsigned preferred_domains;
unsigned allowed_domains;
} *radeon_bo_list_validate_lobj;
int radeon_bo_list_validate_ctx, radeon_bo_list_validate_r;
/* Prototypes only: neither function is defined in this snippet. */
int ttm_bo_validate(struct ttm_buffer_object *, struct ttm_placement *, int *);
_Bool drm_exec_is_contended(int *);
/*
 * The function under test. It consists of two loop nests:
 *   1. a retry loop that re-enters itself through a computed goto, and
 *   2. a validation loop that re-enters a guarded statement through `goto retry`.
 */
void radeon_bo_list_validate() {
/* Target of the computed goto below; its address is taken in the loop condition. */
__drm_exec_476:
/*
 * The loop condition is a statement expression: it stores the address of
 * __drm_exec_476 into __drm_exec_retry_ptr and then evaluates to
 * radeon_bo_list_validate_exec, so the loop body runs only while that pointer
 * is non-null. There is no iteration expression.
 */
for (void *__drm_exec_retry_ptr; ({
__drm_exec_retry_ptr = &&__drm_exec_476;
radeon_bo_list_validate_exec;
});)
/* No condition and no break: the only exit written here is the computed goto. */
for (;;)
do {
radeon_bo_list_validate___trans_tmp_1 =
drm_exec_is_contended(radeon_bo_list_validate_exec);
/* __builtin_expect(..., 0) marks the contended case as unlikely. */
if (__builtin_expect(radeon_bo_list_validate___trans_tmp_1, 0))
/* Indirect jump; the only value ever stored in the pointer is &&__drm_exec_476. */
goto *__drm_exec_retry_ptr;
} while (0);
/*
 * Reached when the outer loop condition above is false. The pointer is set to
 * 0 and then read through on the next lines with no assignment in between,
 * and the loop has no condition or iteration expression.
 */
for (radeon_bo_list_validate_lobj = 0;;) {
int domain = radeon_bo_list_validate_lobj->preferred_domains,
allowed = radeon_bo_list_validate_lobj->allowed_domains,
current_domain = 0;
/*
 * current_domain is 0, so (allowed & current_domain) is always 0 and this
 * condition is never true. The statement it guards carries the `retry` label
 * and is therefore only entered via `goto retry` below.
 */
if (allowed & current_domain &&
radeon_bo_list_validate_bytes_moved >
radeon_bo_list_validate_bytes_moved_threshold)
retry:
/* The ttm_bo_validate() result is stored in _r and then in _initial_bytes_moved. */
radeon_bo_list_validate_initial_bytes_moved = radeon_bo_list_validate_r =
ttm_bo_validate(&radeon_bo_list_validate_bo.tbo,
&radeon_bo_list_validate_bo.placement,
&radeon_bo_list_validate_ctx);
radeon_bo_list_validate_bytes_moved +=
radeon_bo_list_validate_initial_bytes_moved;
/* A non-zero domain jumps back into the guarded statement above. */
if (domain)
goto retry;
}
}
This reduced test case exhibits the same behavior as above:
$ install/llvm-good/bin/clang --target=x86_64-linux-gnu -O2 -w -c -o good-radeon_object.o radeon_object.i
$ install/llvm-bad/bin/clang --target=x86_64-linux-gnu -O2 -w -c -o bad-radeon_object.o radeon_object.i
$ build/linux/good/tools/objtool/objtool --uaccess good-radeon_object.o
$ build/linux/good/tools/objtool/objtool --uaccess bad-radeon_object.o
bad-radeon_object.o: warning: objtool: radeon_bo_list_validate+0x62: unreachable instruction
diff --git a/tmp/.psub.8s3ccz b/tmp/.psub.fPWVlL
index 9684da6..10aab15 100644
--- a/tmp/.psub.8s3ccz
+++ b/tmp/.psub.fPWVlL
@@ -1,5 +1,5 @@
-good-radeon_object.o: file format elf64-x86-64
+bad-radeon_object.o: file format elf64-x86-64
Disassembly of section .text:
@@ -27,3 +27,4 @@ Disassembly of section .text:
000000000000004d: R_X86_64_PC32 radeon_bo_list_validate_bytes_moved-0x4
51: 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)
60: eb fe jmp 0x60 <radeon_bo_list_validate+0x60>
+ 62: eb ac jmp 0x10 <radeon_bo_list_validate+0x10>
It may be possible to work around this in objtool, but it seems suboptimal to have this dead code around?
cc @nikic