-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86][AVX10.2] Add comments for the avx10_2convertintrin.h file #120766
base: main
Are you sure you want to change the base?
[X86][AVX10.2] Add comments for the avx10_2convertintrin.h file #120766
Conversation
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-clang Author: Mikołaj Piróg (mikolaj-pirog) ChangesAs in title. I will create a sibling pr with comments to the 512 in a separate PR. Patch is 111.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/120766.diff 1 Files Affected:
diff --git a/clang/lib/Headers/avx10_2convertintrin.h b/clang/lib/Headers/avx10_2convertintrin.h
index 134adb2850c8de..52a64a5309763d 100644
--- a/clang/lib/Headers/avx10_2convertintrin.h
+++ b/clang/lib/Headers/avx10_2convertintrin.h
@@ -24,24 +24,146 @@
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
__min_vector_width__(256)))
+/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
+/// single-precision (32-bit) floating-point elements to a 128-bit vector
+/// containing FP16 elements.
+///
+/// \code{.operation}
+/// FOR i := 0 to 7
+/// IF i < 4
+/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
+/// ELSE
+/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
+///
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \param __B
+/// A 128-bit vector of [4 x float].
+/// \returns
+/// A 128-bit vector of [8 x fp16]. Lower elements correspond to the
+/// (converted) elements from \a __B; higher order elements correspond to the
+/// (converted) elements from \a __A.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtx2ps_ph(__m128 __A,
__m128 __B) {
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
(__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)(-1));
}
+/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
+/// single-precision (32-bit) floating-point elements to a 128-bit vector
+/// containing FP16 elements. Merging mask \a __U is used to determine if given
+/// element should be taken from \a __W instead.
+///
+/// \code{.operation}
+/// FOR i := 0 to 7
+/// IF mask[i]
+/// dst.fp16[i] := __W[i]
+/// ELSE
+/// IF i < 4
+/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
+/// ELSE
+/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
+/// FI
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
+///
+/// \param __W
+/// A 128-bit vector of [8 x fp16].
+/// \param __U
+/// A 8-bit merging mask.
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \param __B
+/// A 128-bit vector of [4 x float].
+/// \returns
+/// A 128-bit vector of [8 x fp16]. Lower elements correspond to the
+/// (converted) elements from \a __B; higher order elements correspond to the
+/// (converted) elements from \a __A. If corresponding mask bit is set, then
+/// element from \a __W is taken instead.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
(__v4sf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U);
}
+/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
+/// single-precision (32-bit) floating-point elements to a 128-bit vector
+/// containing FP16 elements. Zeroing mask \a __U is used to determine if given
+/// element should be zeroed instead.
+///
+/// \code{.operation}
+/// FOR i := 0 to 7
+/// IF mask[i]
+/// IF i < 4
+/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
+/// ELSE
+/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
+/// FI
+/// ELSE
+/// dst.fp16[i] := 0
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
+///
+/// \param __U
+/// A 8-bit zeroing mask.
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \param __B
+/// A 128-bit vector of [4 x float].
+/// \returns
+/// A 128-bit vector of [8 x fp16]. Lower elements correspond to the
+/// (converted) elements from \a __B; higher order elements correspond to the
+/// (converted) elements from \a __A. If corresponding mask bit is not set,
+/// then zero is taken instead.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) {
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
(__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
+/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
+/// single-precision (32-bit) floating-point elements to a 256-bit vector
+/// containing FP16 elements.
+///
+/// \code{.operation}
+/// FOR i := 0 to 15
+/// IF i < 8
+/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
+/// ELSE
+/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
+///
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __B
+/// A 256-bit vector of [8 x float].
+/// \returns
+/// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
+/// (converted) elements from \a __B; higher order elements correspond to the
+/// (converted) elements from \a __A.
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
__m256 __B) {
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
@@ -49,6 +171,42 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
_MM_FROUND_CUR_DIRECTION);
}
+/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
+/// single-precision (32-bit) floating-point elements to a 256-bit vector
+/// containing FP16 elements. Merging mask \a __U is used to determine if given
+/// element should be taken from \a __W instead.
+///
+/// \code{.operation}
+/// FOR i := 0 to 15
+/// IF mask[i]
+/// dst.fp16[i] := __W[i]
+/// ELSE
+/// IF i < 8
+/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
+/// ELSE
+/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
+/// FI
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
+///
+/// \param __W
+/// A 256-bit vector of [16 x fp16].
+/// \param __U
+/// A 16-bit merging mask.
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __B
+/// A 256-bit vector of [8 x float].
+/// \returns
+/// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
+/// (converted) elements from \a __B; higher order elements correspond to the
+/// (converted) elements from \a __A. If corresponding mask bit is set, then
+/// element from \a __W is taken instead.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
@@ -56,6 +214,40 @@ _mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
_MM_FROUND_CUR_DIRECTION);
}
+/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
+/// single-precision (32-bit) floating-point elements to a 256-bit vector
+/// containing FP16 elements. Zeroing mask \a __U is used to determine if given
+/// element should be zeroed instead.
+///
+/// \code{.operation}
+/// FOR i := 0 to 15
+/// IF mask[i]
+/// IF i < 8
+/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
+/// ELSE
+/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
+/// FI
+/// ELSE
+/// dst.fp16[i] := 0
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
+///
+/// \param __U
+/// A 16-bit zeroing mask.
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __B
+/// A 256-bit vector of [8 x float].
+/// \returns
+/// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
+/// (converted) elements from \a __B; higher order elements correspond to the
+/// (converted) elements from \a __A. If corresponding mask bit is not set,
+/// then zero is taken instead.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
@@ -63,32 +255,233 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
_MM_FROUND_CUR_DIRECTION);
}
+/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
+/// single-precision (32-bit) floating-point elements to a 256-bit vector
+/// containing FP16 elements. Rounding mode \a __R needs to be provided.
+///
+/// \code{.operation}
+/// FOR i := 0 to 15
+/// IF i < 8
+/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
+/// ELSE
+/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
+///
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __B
+/// A 256-bit vector of [8 x float].
+/// \param __R
+/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
+/// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
+/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
+/// _MM_FROUND_TO_ZERO.
+/// \returns
+/// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
+/// (converted) elements from \a __B; higher order elements correspond to the
+/// (converted) elements from \a __A.
#define _mm256_cvtx_round2ps_ph(A, B, R) \
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
(__v8sf)(A), (__v8sf)(B), (__v16hf)_mm256_undefined_ph(), \
(__mmask16)(-1), (const int)(R)))
+/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
+/// single-precision (32-bit) floating-point elements to a 256-bit vector
+/// containing FP16 elements. Merging mask \a __U is used to determine if given
+/// element should be taken from \a __W instead. Rounding mode \a __R needs to
+/// be provided.
+///
+/// \code{.operation}
+/// FOR i := 0 to 15
+/// IF mask[i]
+/// dst.fp16[i] := __W[i]
+/// ELSE
+/// IF i < 8
+/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
+/// ELSE
+/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
+/// FI
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
+///
+/// \param __W
+/// A 256-bit vector of [16 x fp16].
+/// \param __U
+/// A 16-bit merging mask.
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __B
+/// A 256-bit vector of [8 x float].
+/// \param __R
+/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
+/// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
+/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
+/// _MM_FROUND_TO_ZERO.
+/// \returns
+/// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
+/// (converted) elements from \a __B; higher order elements correspond to the
+/// (converted) elements from \a __A. If corresponding mask bit is set, then
+/// element from \a __W is taken instead.
#define _mm256_mask_cvtx_round2ps_ph(W, U, A, B, R) \
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
(__v8sf)(A), (__v8sf)(B), (__v16hf)(W), (__mmask16)(U), (const int)(R)))
+/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
+/// single-precision (32-bit) floating-point elements to a 256-bit vector
+/// containing FP16 elements. Zeroing mask \a __U is used to determine if given
+/// element should be zeroed instead. Rounding mode \a __R needs to be provided.
+///
+/// \code{.operation}
+/// FOR i := 0 to 15
+/// IF mask[i]
+/// IF i < 8
+/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
+/// ELSE
+/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
+/// FI
+/// ELSE
+/// dst.fp16[i] := 0
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
+///
+/// \param __U
+/// A 16-bit zeroing mask.
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __B
+/// A 256-bit vector of [8 x float].
+/// \param __R
+/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
+/// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
+/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
+/// _MM_FROUND_TO_ZERO.
+/// \returns
+/// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
+/// (converted) elements from \a __B; higher order elements correspond to the
+/// (converted) elements from \a __A. If corresponding mask bit is not set,
+/// then zero is taken instead.
#define _mm256_maskz_cvtx_round2ps_ph(U, A, B, R) \
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
(__v8sf)(A), (__v8sf)(B), (__v16hf)(_mm256_setzero_ph()), \
(__mmask16)(U), (const int)(R)))
+/// Add two 128-bit vectors, \a __A and \a __B, containing packed FP16
+/// floating-point elements and 8-bit integers stored in the lower half of
+/// packed 16-bit integers, respectively. Results are converted to FP8 E5M2.
+///
+/// \code{.operation}
+/// FOR i := 0 to 7
+/// dst.fp8[i] := add_convert_fp16_to_fp8_bias(__A.fp16[i], __B.int8[2i])
+/// ENDFOR
+///
+/// dst[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
+///
+/// \param __A
+/// A 128-bit vector of [8 x fp16].
+/// \param __B
+/// A 128-bit vector of [8 x int16].
+/// \returns
+/// A 128-bit vector of [16 x fp8]. Lower elements correspond to the
+/// sum of elements from \a __A and \a __B; higher order elements are zeroed.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtbiasph_pbf8(__m128i __A, __m128h __B) {
return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
(__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
}
+/// Add two 128-bit vectors, \a __A and \a __B, containing packed FP16
+/// floating-point elements and 8-bit integers stored in the lower half of
+/// packed 16-bit integers, respectively. Results are converted to FP8 E5M2.
+/// Merging mask \a __U is used to determine if given element should be taken
+/// from \a __W instead.
+///
+/// \code{.operation}
+/// FOR i := 0 to 7
+/// IF __U[i]
+/// dst.fp8[i] := _W[i]
+/// ELSE
+/// dst.fp8[i] := add_convert_fp16_to_fp8_bias(__A.fp16[i], __B.int8[2i])
+/// FI
+/// ENDFOR
+///
+/// dst[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
+///
+/// \param __W
+/// A 128-bit vector of [16 x fp8].
+/// \param __U
+/// A 8-bit merging mask.
+/// \param __A
+/// A 128-bit vector of [8 x fp16].
+/// \param __B
+/// A 128-bit vector of [8 x int16].
+/// \returns
+/// A 128-bit vector of [16 x fp8]. Lower elements correspond to the sum of
+/// elements from \a __A and \a __B; higher order elements are zeroed. If
+/// corresponding mask bit is set, then element from \a __W is taken instead.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtbiasph_pbf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
(__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
}
+/// Add two 128-bit vectors, \a __A and \a __B, containing packed FP16
+/// floating-point elements and 8-bit integers stored in the lower half of
+/// packed 16-bit integers, respectively. Results are converted to FP8 E5M2.
+/// Zeroing mask \a __U is used to determine if given element should be zeroed
+/// instead.
+///
+/// \code{.operation}
+/// FOR i := 0 to 7
+/// IF __U[i]
+/// dst.fp8[i] := add_convert_fp16_to_fp8_bias(__A.fp16[i], __B.int8[2i])
+/// ELSE
+/// dst.fp8[i] := 0
+/// FI
+/// ENDFOR
+///
+/// dst[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
+///
+/// \param __U
+/// A 8-bit zeroing mask.
+/// \param __A
+/// A 128-bit vector of [8 x fp16].
+/// \param __B
+/// A 128-bit vector of [8 x int16].
+/// \returns
+/// A 128-bit vector of [16 x fp8]. Lower elements correspond to the sum of
+/// elements from \a __A and \a __B; higher order elements are zeroed. If
+/// corresponding mask bit is not set, then element is zeroed.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbiasph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) {
return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
@@ -96,6 +489,27 @@ _mm_maskz_cvtbiasph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) {
(__mmask8)__U);
}
+/// Add two 256-bit vectors, \a __A and \a __B, containing packed FP16
+/// floating-point elements and 8-bit integers stored in the lower half of
+/// packed 16-bit integers, respectively. Results are converted to FP8 E5M2.
+///
+/// \code{.operation}
+/// FOR i := 0 to 15
+/// dst.fp8[i] := add_convert_fp16_to_fp8_bias(__A.fp16[i], __B.int8[2i])
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
+///
+/// \param __A
+/// A 256-bit vector of [16 x fp16].
+/// \param __B
+/// A 256-bit vector of [16 x int16].
+/// \returns
+/// A 128-bit vector of [16 x fp8]. Elements correspond to the
+/// sum of elements from \a __A and \a __B.
static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtbiasph_pbf8(__m256i __A, __m256h __B) {
return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
@@ -103,12 +517,74 @@ _mm256_cvtbiasph_pbf8(__m256i __A, __m256h __B) {
(__mmask16)-1);
}
+/// Add two 256-bit vectors, \a __A and \a __B, containing packed FP16
+/// floating-point elements and 8-bit integers stored in the lower half of
+/// packed 16-bit integers, respectively. Results are converted to FP8 E5M2.
+/// Merging mask \a __U is used to determine if given element should be taken
+/// from \a __W instead.
+///
+/// \code{.operation}
+/// FOR i := 0 to 15
+/// IF __U[i]
+/// dst.fp8[i] := _W[i]
+/// ELSE
+/// dst.fp8[i] := add_convert_fp16_to_fp8_bias(__A.fp16[i], __B.int8[2i])
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
+///
+/// \param __W
+/// A 128-bit vector of [16 x fp8].
+/// \param __U
+/// A 16-bit merging mask.
+/// \param __A
+/// A 256-bit vector of [16 x fp16].
+/// \param __B
+/// A 256-bit vector of [16 x int16].
+/// \returns
+/// A 128-bit vector of [16 x fp8]. Elements correspond to the sum of
+/// elements from \a __A and \a __B. If corresponding mask bit is set, then
+/// element from \a __W is taken instead.
static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_pbf8(
__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
(__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
}
+/// Add two 256-bit vectors, \a __A and \a __B, containing packed FP16
+/// floating-point elements and 8-bit integers stored in the lower half of
+/// packed 16-bit integers, respectively. Results are converted to FP8 E5M2.
+/// Merging mask \a __U is used to determine if given element should be taken
+/// from \a __W instead.
+///
+/// \code{.operation}
+/// FOR i := 0 to 15
+/// IF __U[i]
+/// dst.fp8[i] := add_convert_fp16_to_fp8_bias(__A.fp16[i], __B.int8[2i])
+/// ELSE
+/// dst.fp8[i] := 0
+/// FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
+///
+/// \param __U
+/// A 16-bit zeroing mask.
+/// \param __A
+/// A 256-bit vector of [16 x fp16].
+/// \param __B
+/// A 256-bit vector of [16 x int16].
+/// \returns
+/// A 128-bit vector of ...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
We cannot let clang-format to format it. Maybe use |
31339a1
to
b069337
Compare
Found during review llvm#120766
Found during review llvm#120766
Found during review #120766
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This would be the last round review for this patch, but we cannot expect human eyes catch all mistakes. IIRC, the internal tool can verify some document problems. So I suggest to verify it before land.
/// | ||
/// ENDFOR | ||
/// | ||
/// dst[MAX:127] := 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
MAX:128
As in title. I will create a sibling pr with comments to the 512 variant.