1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
//! [F16C intrinsics].
//!
//! [F16C intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fp16&expand=1769

use crate::{
    core_arch::{simd::*, x86::*},
    hint::unreachable_unchecked,
    mem::transmute,
};

#[cfg(test)]
use stdarch_test::assert_instr;

#[allow(improper_ctypes)]
extern "unadjusted" {
    #[link_name = "llvm.x86.vcvtph2ps.128"]
    fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4;
    #[link_name = "llvm.x86.vcvtph2ps.256"]
    fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8;
    #[link_name = "llvm.x86.vcvtps2ph.128"]
    fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8;
    #[link_name = "llvm.x86.vcvtps2ph.256"]
    fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8;
}

/// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of
/// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide
/// vector.
#[inline]
#[target_feature(enable = "f16c")]
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 {
    transmute(llvm_vcvtph2ps_128(transmute(a)))
}

/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector
/// `a` into 8 x 32-bit float values stored in a 256-bit wide vector.
#[inline]
#[target_feature(enable = "f16c")]
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
    transmute(llvm_vcvtph2ps_256(transmute(a)))
}

macro_rules! dispatch_rounding {
    ($rounding:ident, $call:ident) => {{
        match $rounding {
            0 => call!(0),
            1 => call!(1),
            2 => call!(2),
            3 => call!(3),
            4 => call!(4),
            5 => call!(5),
            6 => call!(6),
            7 => call!(7),
            _ => unreachable_unchecked(),
        }
    }};
}

/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x
/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit
/// vector.
///
/// Rounding is done according to the `imm_rounding` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions,
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions,
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions,
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions,
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
#[inline]
#[target_feature(enable = "f16c")]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i {
    let a = transmute(a);
    macro_rules! call {
        ($rounding:expr) => {
            llvm_vcvtps2ph_128(a, $rounding)
        };
    }
    transmute(dispatch_rounding!(imm_rounding, call))
}

/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x
/// 16-bit half-precision float values stored in a 128-bit wide vector.
///
/// Rounding is done according to the `imm_rounding` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions,
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions,
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions,
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions,
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
#[inline]
#[target_feature(enable = "f16c")]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i {
    let a = transmute(a);
    macro_rules! call {
        ($rounding:expr) => {
            llvm_vcvtps2ph_256(a, $rounding)
        };
    }
    transmute(dispatch_rounding!(imm_rounding, call))
}

#[cfg(test)]
mod tests {
    use crate::{core_arch::x86::*, mem::transmute};
    use stdarch_test::simd_test;

    #[simd_test(enable = "f16c")]
    unsafe fn test_mm_cvtph_ps() {
        let array = [1_f32, 2_f32, 3_f32, 4_f32];
        let float_vec: __m128 = transmute(array);
        let halfs: __m128i = _mm_cvtps_ph(float_vec, 0);
        let floats: __m128 = _mm_cvtph_ps(halfs);
        let result: [f32; 4] = transmute(floats);
        assert_eq!(result, array);
    }

    #[simd_test(enable = "f16c")]
    unsafe fn test_mm256_cvtph_ps() {
        let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32];
        let float_vec: __m256 = transmute(array);
        let halfs: __m128i = _mm256_cvtps_ph(float_vec, 0);
        let floats: __m256 = _mm256_cvtph_ps(halfs);
        let result: [f32; 8] = transmute(floats);
        assert_eq!(result, array);
    }
}