Skip to main content
added 4 characters in body
Source Link
einpoklum
  • 137.6k
  • 86
  • 448
  • 919

Why does NVCC not optimize ldexpf with a constexpr power-of-two exponent into a simple fmul?

Consider the following CUDA code:

enum { p = 5 };
__device__ float adjust_mul(float x) { return x * (1 << p); }
__device__ float adjust_ldexpf(float x) { return ldexpf(x, p); }

I would expect NVCC to optimize the first and the second function into the exact same code, as ldexpf(x, p) is defined to "multiply x by 2 to the power of p".

but the PTX code for these two functions is quite different (GodBolt)... the first function becomes:

.visible .func  (.param .b32 func_retval0) adjust_mul(float)(
    .param .b32 adjust_mul(float)_param_0
)
{
    ld.param.f32    %f1, [adjust_mul(float)_param_0];
    mul.f32     %f2, %f1, 0f42000000;
    st.param.f32    [func_retval0+0], %f2;
    ret;
}

and the second becomes:

.visible .func  (.param .b32 func_retval0) adjust_ldexpf(float)(
    .param .b32 adjust_ldexpf(float)_param_0
)
{
    ld.param.f32    %f5, [adjust_ldexpf(float)_param_0];
    abs.f32     %f1, %f5;
    setp.eq.f32     %p1, %f1, 0f00000000;
    setp.eq.f32     %p2, %f1, 0f7F800000;
    or.pred     %p3, %p1, %p2;
    @%p3 bra    $L__BB1_2;
    bra.uni     $L__BB1_1;
$L__BB1_2:
    setp.gt.f32     %p4, %f1, 0f00000000;
    add.f32     %f8, %f5, %f5;
    selp.f32    %f9, %f5, %f8, %p4;
    bra.uni     $L__BB1_3;
$L__BB1_1:
    mov.f32     %f6, 0f40A00000;
    ex2.approx.ftz.f32  %f7, %f6;
    mul.f32     %f9, %f7, %f5;
$L__BB1_3:
    st.param.f32    [func_retval0+0], %f9;
    ret;
}

Why doesn't NVCC optimize the second function into the same PTX code as the first one?

Why does NVCC not optimize ldexp with a constexpr power-of-two exponent into a simple fmul?

Consider the following CUDA code:

enum { p = 5 };
__device__ float adjust_mul(float x) { return x * (1 << p); }
__device__ float adjust_ldexp(float x) { return ldexpf(x, p); }

I would expect NVCC to optimize the first and the second function into the exact same code, as ldexp(x, p) is defined to "multiply x by 2 to the power of p".

but the PTX code for these two functions is quite different (GodBolt)... the first function becomes:

.visible .func  (.param .b32 func_retval0) adjust_mul(float)(
    .param .b32 adjust_mul(float)_param_0
)
{
    ld.param.f32    %f1, [adjust_mul(float)_param_0];
    mul.f32     %f2, %f1, 0f42000000;
    st.param.f32    [func_retval0+0], %f2;
    ret;
}

and the second becomes:

.visible .func  (.param .b32 func_retval0) adjust_ldexp(float)(
    .param .b32 adjust_ldexp(float)_param_0
)
{
    ld.param.f32    %f5, [adjust_ldexp(float)_param_0];
    abs.f32     %f1, %f5;
    setp.eq.f32     %p1, %f1, 0f00000000;
    setp.eq.f32     %p2, %f1, 0f7F800000;
    or.pred     %p3, %p1, %p2;
    @%p3 bra    $L__BB1_2;
    bra.uni     $L__BB1_1;
$L__BB1_2:
    setp.gt.f32     %p4, %f1, 0f00000000;
    add.f32     %f8, %f5, %f5;
    selp.f32    %f9, %f5, %f8, %p4;
    bra.uni     $L__BB1_3;
$L__BB1_1:
    mov.f32     %f6, 0f40A00000;
    ex2.approx.ftz.f32  %f7, %f6;
    mul.f32     %f9, %f7, %f5;
$L__BB1_3:
    st.param.f32    [func_retval0+0], %f9;
    ret;
}

Why doesn't NVCC optimize the second function into the same PTX code as the first one?

Why does NVCC not optimize ldexpf with a constexpr power-of-two exponent into a simple fmul?

Consider the following CUDA code:

enum { p = 5 };
__device__ float adjust_mul(float x) { return x * (1 << p); }
__device__ float adjust_ldexpf(float x) { return ldexpf(x, p); }

I would expect NVCC to optimize the first and the second function into the exact same code, as ldexpf(x, p) is defined to "multiply x by 2 to the power of p".

but the PTX code for these two functions is quite different (GodBolt)... the first function becomes:

.visible .func  (.param .b32 func_retval0) adjust_mul(float)(
    .param .b32 adjust_mul(float)_param_0
)
{
    ld.param.f32    %f1, [adjust_mul(float)_param_0];
    mul.f32     %f2, %f1, 0f42000000;
    st.param.f32    [func_retval0+0], %f2;
    ret;
}

and the second becomes:

.visible .func  (.param .b32 func_retval0) adjust_ldexpf(float)(
    .param .b32 adjust_ldexpf(float)_param_0
)
{
    ld.param.f32    %f5, [adjust_ldexpf(float)_param_0];
    abs.f32     %f1, %f5;
    setp.eq.f32     %p1, %f1, 0f00000000;
    setp.eq.f32     %p2, %f1, 0f7F800000;
    or.pred     %p3, %p1, %p2;
    @%p3 bra    $L__BB1_2;
    bra.uni     $L__BB1_1;
$L__BB1_2:
    setp.gt.f32     %p4, %f1, 0f00000000;
    add.f32     %f8, %f5, %f5;
    selp.f32    %f9, %f5, %f8, %p4;
    bra.uni     $L__BB1_3;
$L__BB1_1:
    mov.f32     %f6, 0f40A00000;
    ex2.approx.ftz.f32  %f7, %f6;
    mul.f32     %f9, %f7, %f5;
$L__BB1_3:
    st.param.f32    [func_retval0+0], %f9;
    ret;
}

Why doesn't NVCC optimize the second function into the same PTX code as the first one?

Source Link
einpoklum
  • 137.6k
  • 86
  • 448
  • 919

Why does NVCC not optimize ldexp with a constexpr power-of-two exponent into a simple fmul?

Consider the following CUDA code:

enum { p = 5 };
__device__ float adjust_mul(float x) { return x * (1 << p); }
__device__ float adjust_ldexp(float x) { return ldexpf(x, p); }

I would expect NVCC to optimize the first and the second function into the exact same code, as ldexp(x, p) is defined to "multiply x by 2 to the power of p".

but the PTX code for these two functions is quite different (GodBolt)... the first function becomes:

.visible .func  (.param .b32 func_retval0) adjust_mul(float)(
    .param .b32 adjust_mul(float)_param_0
)
{
    ld.param.f32    %f1, [adjust_mul(float)_param_0];
    mul.f32     %f2, %f1, 0f42000000;
    st.param.f32    [func_retval0+0], %f2;
    ret;
}

and the second becomes:

.visible .func  (.param .b32 func_retval0) adjust_ldexp(float)(
    .param .b32 adjust_ldexp(float)_param_0
)
{
    ld.param.f32    %f5, [adjust_ldexp(float)_param_0];
    abs.f32     %f1, %f5;
    setp.eq.f32     %p1, %f1, 0f00000000;
    setp.eq.f32     %p2, %f1, 0f7F800000;
    or.pred     %p3, %p1, %p2;
    @%p3 bra    $L__BB1_2;
    bra.uni     $L__BB1_1;
$L__BB1_2:
    setp.gt.f32     %p4, %f1, 0f00000000;
    add.f32     %f8, %f5, %f5;
    selp.f32    %f9, %f5, %f8, %p4;
    bra.uni     $L__BB1_3;
$L__BB1_1:
    mov.f32     %f6, 0f40A00000;
    ex2.approx.ftz.f32  %f7, %f6;
    mul.f32     %f9, %f7, %f5;
$L__BB1_3:
    st.param.f32    [func_retval0+0], %f9;
    ret;
}

Why doesn't NVCC optimize the second function into the same PTX code as the first one?