cg_llvm: use index-based loop in write_operand_repeatedly #112516

erikdesjardins · 2023-06-11T04:06:57Z

This should be easier for LLVM to analyze.

Fixes #111603

This needs a perf run.

cc @caojoshua

This is easier for LLVM to analyze.

rustbot · 2023-06-11T04:07:04Z

r? @davidtwco

(rustbot has picked a reviewer for you, use r? to override)

caojoshua · 2023-06-11T04:42:25Z

compiler/rustc_codegen_llvm/src/builder.rs

+        let dest_elem = dest.project_index(&mut body_bx, i);
+        cg_elem.val.store(&mut body_bx, dest_elem);
+
+        let next = body_bx.unchecked_uadd(i, self.const_usize(1));


Should we add the no-unsigned-wrap (nuw) flag here, or do we let LLVM infer it instead?

unchecked_uadd (vs. add) adds nuw

caojoshua · 2023-06-11T04:45:03Z

compiler/rustc_codegen_llvm/src/builder.rs

-            current,
-            &[self.const_usize(1)],
-        );
+        let dest_elem = dest.project_index(&mut body_bx, i);


Does project_index emit a GEP? Will it emit the inbounds and align info that was previously there?

Yes:

rust/compiler/rustc_codegen_ssa/src/mir/place.rs

Lines 378 to 402 in 970058e

pub fn project_index<Bx: BuilderMethods<'a, 'tcx, Value = V>>(

&self,

bx: &mut Bx,

llindex: V,

) -> Self {

// Statically compute the offset if we can, otherwise just use the element size,

// as this will yield the lowest alignment.

let layout = self.layout.field(bx, 0);

let offset = if let Some(llindex) = bx.const_to_opt_uint(llindex) {

layout.size.checked_mul(llindex, bx).unwrap_or(layout.size)

} else {

layout.size

};

PlaceRef {

llval: bx.inbounds_gep(

bx.cx().backend_type(self.layout),

self.llval,

&[bx.cx().const_usize(0), llindex],

),

llextra: None,

layout,

align: self.align.restrict_for_offset(offset),

}

}

What was there before was basically a reimplementation of project_index (which was sort of necessary because the GEP was threaded through the phi)

caojoshua · 2023-06-11T04:47:21Z

tests/codegen/issues/issue-111603.rs

@@ -5,6 +5,18 @@

 use std::sync::Arc;

+// CHECK-LABEL: @new_from_array


Could we emit the entire IR here? I think it's helpful for reviewers, test coverage, and people trying to understand the codebase better.

Because we don't have something like LLVM's update_test_checks script, it is painful to update codegen tests. So we tend to use the minimal number of CHECK lines so we don't have to update tests due to unrelated changes. (Also, most tests don't specify a target, so they generate slightly different IR per platform due to differences in ABI, vectorization, etc., which would have to be dealt with if we did this in general.)

In this case, this is the full IR of new_from_array:

IR

define { ptr, i64 } @new_from_array(i64 noundef %x) unnamed_addr #0 personality ptr @rust_eh_personality { start: %array = alloca [1000 x i64], align 8 %broadcast.splatinsert = insertelement <2 x i64> poison, i64 %x, i64 0 %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer %broadcast.splatinsert1 = insertelement <2 x i64> poison, i64 %x, i64 0 %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %start %index = phi i64 [ 0, %start ], [ %index.next.4, %vector.body ] %0 = getelementptr inbounds [1000 x i64], ptr %array, i64 0, i64 %index store <2 x i64> %broadcast.splat, ptr %0, align 8 %1 = getelementptr inbounds i64, ptr %0, i64 2 store <2 x i64> %broadcast.splat2, ptr %1, align 8 %index.next = add nuw nsw i64 %index, 4 %2 = getelementptr inbounds [1000 x i64], ptr %array, i64 0, i64 %index.next store <2 x i64> %broadcast.splat, ptr %2, align 8 %3 = getelementptr inbounds i64, ptr %2, i64 2 store <2 x i64> %broadcast.splat2, ptr %3, align 8 %index.next.1 = add nuw nsw i64 %index, 8 %4 = getelementptr inbounds [1000 x i64], ptr %array, i64 0, i64 %index.next.1 store <2 x i64> %broadcast.splat, ptr %4, align 8 %5 = getelementptr inbounds i64, ptr %4, i64 2 store <2 x i64> %broadcast.splat2, ptr %5, align 8 %index.next.2 = add nuw nsw i64 %index, 12 %6 = getelementptr inbounds [1000 x i64], ptr %array, i64 0, i64 %index.next.2 store <2 x i64> %broadcast.splat, ptr %6, align 8 %7 = getelementptr inbounds i64, ptr %6, i64 2 store <2 x i64> %broadcast.splat2, ptr %7, align 8 %index.next.3 = add nuw nsw i64 %index, 16 %8 = getelementptr inbounds [1000 x i64], ptr %array, i64 0, i64 %index.next.3 store <2 x i64> %broadcast.splat, ptr %8, align 8 %9 = getelementptr inbounds i64, ptr %8, i64 2 store <2 x i64> %broadcast.splat2, ptr %9, align 8 %index.next.4 = add nuw nsw i64 %index, 20 %10 = icmp eq 
i64 %index.next.4, 1000 br i1 %10, label %repeat_loop_next, label %vector.body, !llvm.loop !2 repeat_loop_next: ; preds = %vector.body %11 = load volatile i8, ptr @__rust_no_alloc_shim_is_unstable, align 1, !noalias !5 %12 = tail call noundef align 8 dereferenceable_or_null(8016) ptr @__rust_alloc(i64 noundef 8016, i64 noundef 8) #6, !noalias !5 %13 = icmp eq ptr %12, null br i1 %13, label %bb1.i.i, label %"_ZN5alloc4sync12Arc$LT$T$GT$3new17hc22c917a7edefd8bE.exit" bb1.i.i: ; preds = %repeat_loop_next ; call alloc::alloc::handle_alloc_error tail call void @_ZN5alloc5alloc18handle_alloc_error17h5a822ff2e844764dE(i64 noundef 8, i64 noundef 8016) #7, !noalias !5 unreachable "_ZN5alloc4sync12Arc$LT$T$GT$3new17hc22c917a7edefd8bE.exit": ; preds = %repeat_loop_next store i64 1, ptr %12, align 8, !noalias !5 %x.sroa.4.0._14.sroa_idx.i = getelementptr inbounds i8, ptr %12, i64 8 store i64 1, ptr %x.sroa.4.0._14.sroa_idx.i, align 8, !noalias !5 %x.sroa.5.0._14.sroa_idx.i = getelementptr inbounds i8, ptr %12, i64 16 call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(8000) %x.sroa.5.0._14.sroa_idx.i, ptr noundef nonnull align 8 dereferenceable(8000) %array, i64 8000, i1 false) %14 = insertvalue { ptr, i64 } poison, ptr %12, 0 %15 = insertvalue { ptr, i64 } %14, i64 1000, 1 ret { ptr, i64 } %15 }

Thanks for sharing IR. Looks like the redundant alloca was removed. Nice!

caojoshua · 2023-06-11T04:52:13Z

I can confirm that this approach is what I was suggesting in the issue. I have been working on various changes within LLVM to improve analysis on pointer comparisons that would have resolved this issue. However, I still think it makes sense to make this change in rustc.

I have not worked on this project and do not want to dive deep in the code base right now. Please excuse my noob questions.

caojoshua

LGTM, but of course there should be other reviewers

the8472 · 2023-06-11T05:30:29Z

@bors try @rust-timer queue

bors · 2023-06-11T05:30:39Z

⌛ Trying commit bd0aae9 with merge c4c156d920b0e876d380a9464958a5e90d2d1d48...

bors · 2023-06-11T07:06:11Z

☀️ Try build successful - checks-actions
Build commit: c4c156d920b0e876d380a9464958a5e90d2d1d48 (c4c156d920b0e876d380a9464958a5e90d2d1d48)

rust-timer · 2023-06-11T09:09:25Z

Finished benchmarking commit (c4c156d920b0e876d380a9464958a5e90d2d1d48): comparison URL.

Overall result: ❌ regressions - no action needed

Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR may lead to changes in compiler perf.

@bors rollup=never
@rustbot label: -S-waiting-on-perf -perf-regression

Instruction count

This is a highly reliable metric that was used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	-	-	0
Regressions ❌ (secondary)	0.5%	[0.3%, 0.5%]	4
Improvements ✅ (primary)	-	-	0
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	-	-	0

Max RSS (memory usage)

Results

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	3.2%	[3.2%, 3.2%]	1
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-	-	0
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	3.2%	[3.2%, 3.2%]	1

Cycles

Results

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	-	-	0
Regressions ❌ (secondary)	2.1%	[1.8%, 2.3%]	9
Improvements ✅ (primary)	-	-	0
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	-	-	0

Binary size

This benchmark run did not return any relevant results for this metric.

Bootstrap: 648.85s -> 647.756s (-0.17%)

erikdesjardins · 2023-06-11T18:45:43Z

Judging from a local cachegrind diff, this looks like inlining noise:

--------------------------------------------------------------------------------
Ir       file:function
--------------------------------------------------------------------------------
 58,613  ???:rustc_middle::ty::codec::encode_with_shorthand::<rustc_middle::query::on_disk_cache::CacheEncoder, rustc_middle::ty::Ty, <rustc_middle::query::on_disk_cache::CacheEncoder as rustc_type_ir::codec::TyEncoder>::type_shorthands>
-43,266  ???:<[rustc_middle::mir::LocalDecl] as rustc_serialize::serialize::Encodable<rustc_middle::query::on_disk_cache::CacheEncoder>>::encode
 23,234  ???:<rustc_metadata::creader::CStore as rustc_session::cstore::CrateStore>::def_path_hash
-18,325  ???:<rustc_span::def_id::DefId as rustc_data_structures::stable_hasher::HashStable<rustc_query_system::ich::hcx::StableHashingContext>>::hash_stable
-13,446  ./elf/dl-lookup.c:_dl_lookup_symbol_x
-12,600  ???:<alloc::vec::Vec<rustc_middle::ty::adjustment::Adjustment> as rustc_serialize::serialize::Decodable<rustc_middle::query::on_disk_cache::CacheDecoder>>::decode
 11,456  ???:<std::collections::hash::map::HashMap<rustc_hir::hir_id::ItemLocalId, alloc::vec::Vec<rustc_middle::ty::adjustment::Adjustment>, core::hash::BuildHasherDefault<rustc_hash::FxHasher>> as rustc_serialize::serialize::Decodable<rustc_middle::query::on_disk_cache::CacheDecoder>>::decode
-11,256  ???:<&mut <rustc_middle::mir::syntax::Place as rustc_serialize::serialize::Decodable<rustc_middle::query::on_disk_cache::CacheDecoder>>::decode::{closure
 11,256  ???:<rustc_middle::mir::syntax::ProjectionElem<rustc_middle::mir::Local, rustc_middle::ty::Ty> as rustc_serialize::serialize::Decodable<rustc_middle::query::on_disk_cache::CacheDecoder>>::decode
...

It seems like await-call-tree is noisy in general:

davidtwco

LGTM

davidtwco · 2023-06-27T14:32:40Z

@bors r+

bors · 2023-06-27T14:32:42Z

📌 Commit bd0aae9 has been approved by davidtwco

It is now in the queue for this repository.

bors · 2023-06-27T15:01:59Z

⌛ Testing commit bd0aae9 with merge 3c554f5...

bors · 2023-06-27T17:54:18Z

☀️ Test successful - checks-actions
Approved by: davidtwco
Pushing 3c554f5 to master...

cg_llvm: use index-based loop in write_operand_repeatedly

bd0aae9

This is easier for LLVM to analyze.

rustbot assigned davidtwco Jun 11, 2023

rustbot added S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. labels Jun 11, 2023

caojoshua reviewed Jun 11, 2023

View reviewed changes

caojoshua approved these changes Jun 11, 2023

View reviewed changes

This comment has been minimized.

Sign in to view

rustbot added the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Jun 11, 2023

This comment has been minimized.

Sign in to view

rustbot removed the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Jun 11, 2023

davidtwco approved these changes Jun 27, 2023

View reviewed changes

bors added S-waiting-on-bors Status: Waiting on bors to run and complete tests. Bors will change the label on completion. and removed S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. labels Jun 27, 2023

bors added the merged-by-bors This PR was explicitly merged by bors. label Jun 27, 2023

bors merged commit 3c554f5 into rust-lang:master Jun 27, 2023

rustbot added this to the 1.72.0 milestone Jun 27, 2023

erikdesjardins deleted the loop branch July 1, 2023 21:20

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

cg_llvm: use index-based loop in write_operand_repeatedly #112516

cg_llvm: use index-based loop in write_operand_repeatedly #112516

erikdesjardins commented Jun 11, 2023 •

edited

Loading

rustbot commented Jun 11, 2023

caojoshua Jun 11, 2023

erikdesjardins Jun 11, 2023 •

edited

Loading

caojoshua Jun 11, 2023

erikdesjardins Jun 11, 2023 •

edited

Loading

caojoshua Jun 11, 2023

erikdesjardins Jun 11, 2023 •

edited

Loading

caojoshua Jun 11, 2023

caojoshua commented Jun 11, 2023

caojoshua left a comment

the8472 commented Jun 11, 2023

This comment has been minimized.

bors commented Jun 11, 2023

bors commented Jun 11, 2023

This comment has been minimized.

rust-timer commented Jun 11, 2023

erikdesjardins commented Jun 11, 2023

davidtwco left a comment

davidtwco commented Jun 27, 2023

bors commented Jun 27, 2023

bors commented Jun 27, 2023

bors commented Jun 27, 2023

	/// Builds the place that refers to the element at `llindex` within this place.
	///
	/// The element layout is field 0 of `self.layout`, and the address is produced by an
	/// inbounds GEP over `self.llval` with the indices `[0, llindex]`. The resulting
	/// place carries no `llextra`, and its alignment is `self.align` restricted for the
	/// byte offset computed below.
	pub fn project_index<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
	    &self,
	    bx: &mut Bx,
	    llindex: V,
	) -> Self {
	    let layout = self.layout.field(bx, 0);

	    // When the index is a known constant, use the exact byte offset of the element
	    // (falling back to the element size if the multiplication overflows). For a
	    // dynamic index only the element size is known, which yields the lowest
	    // alignment.
	    let offset = match bx.const_to_opt_uint(llindex) {
	        Some(llindex) => layout.size.checked_mul(llindex, bx).unwrap_or(layout.size),
	        None => layout.size,
	    };

	    // Emit the GEP first, then derive the alignment, mirroring field order.
	    let place_ty = bx.cx().backend_type(self.layout);
	    let zero = bx.cx().const_usize(0);
	    let llval = bx.inbounds_gep(place_ty, self.llval, &[zero, llindex]);
	    let align = self.align.restrict_for_offset(offset);

	    PlaceRef { llval, llextra: None, layout, align }
	}

		@@ -5,6 +5,18 @@

		use std::sync::Arc;

		// CHECK-LABEL: @new_from_array

cg_llvm: use index-based loop in write_operand_repeatedly #112516

cg_llvm: use index-based loop in write_operand_repeatedly #112516

Conversation

erikdesjardins commented Jun 11, 2023 • edited Loading

rustbot commented Jun 11, 2023

Choose a reason for hiding this comment

erikdesjardins Jun 11, 2023 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

erikdesjardins Jun 11, 2023 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

erikdesjardins Jun 11, 2023 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

caojoshua commented Jun 11, 2023

caojoshua left a comment

Choose a reason for hiding this comment

the8472 commented Jun 11, 2023

This comment has been minimized.

bors commented Jun 11, 2023

bors commented Jun 11, 2023

This comment has been minimized.

rust-timer commented Jun 11, 2023

Overall result: ❌ regressions - no action needed

erikdesjardins commented Jun 11, 2023

davidtwco left a comment

Choose a reason for hiding this comment

davidtwco commented Jun 27, 2023

bors commented Jun 27, 2023

bors commented Jun 27, 2023

bors commented Jun 27, 2023

erikdesjardins commented Jun 11, 2023 •

edited

Loading

erikdesjardins Jun 11, 2023 •

edited

Loading

erikdesjardins Jun 11, 2023 •

edited

Loading

erikdesjardins Jun 11, 2023 •

edited

Loading