Consider the following CUDA code:
enum { p = 5 };
__device__ float adjust_mul(float x) { return x * (1 << p); }
__device__ float adjust_ldexpf(float x) { return ldexpf(x, p); }
I would expect NVCC to optimize the first and the second function into the exact same code, as ldexpf(x, p) is defined to "multiply x by 2 to the power of p".
But the PTX code for these two functions is quite different (GodBolt)... the first function becomes:
.visible .func (.param .b32 func_retval0) adjust_mul(float)(
.param .b32 adjust_mul(float)_param_0
)
{
ld.param.f32 %f1, [adjust_mul(float)_param_0];
mul.f32 %f2, %f1, 0f42000000;
st.param.f32 [func_retval0+0], %f2;
ret;
}
and the second becomes:
.visible .func (.param .b32 func_retval0) adjust_ldexpf(float)(
.param .b32 adjust_ldexpf(float)_param_0
)
{
ld.param.f32 %f5, [adjust_ldexpf(float)_param_0];
abs.f32 %f1, %f5;
setp.eq.f32 %p1, %f1, 0f00000000;
setp.eq.f32 %p2, %f1, 0f7F800000;
or.pred %p3, %p1, %p2;
@%p3 bra $L__BB1_2;
bra.uni $L__BB1_1;
$L__BB1_2:
setp.gt.f32 %p4, %f1, 0f00000000;
add.f32 %f8, %f5, %f5;
selp.f32 %f9, %f5, %f8, %p4;
bra.uni $L__BB1_3;
$L__BB1_1:
mov.f32 %f6, 0f40A00000;
ex2.approx.ftz.f32 %f7, %f6;
mul.f32 %f9, %f7, %f5;
$L__BB1_3:
st.param.f32 [func_retval0+0], %f9;
ret;
}
Why doesn't NVCC optimize the second function into the same PTX code as the first one?