diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 73544826809c..083c90908836 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1009,6 +1009,7 @@ to ``float``; see below for more information on this emulation.
 * 64-bit ARM (AArch64)
 * RISC-V
 * X86 (when SSE2 is available)
+* LoongArch
 
 (For X86, SSE2 is available on 64-bit and all recent 32-bit processors.)
diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h
index 11636fa55cab..3ad5abca927b 100644
--- a/clang/lib/Basic/Targets/LoongArch.h
+++ b/clang/lib/Basic/Targets/LoongArch.h
@@ -49,6 +49,9 @@ public:
     HasFeatureLD_SEQ_SA = false;
     HasFeatureDiv32 = false;
     HasFeatureSCQ = false;
+    BFloat16Width = 16;
+    BFloat16Align = 16;
+    BFloat16Format = &llvm::APFloat::BFloat();
     LongDoubleWidth = 128;
     LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::IEEEquad();
@@ -99,6 +102,8 @@ public:
 
   bool hasBitIntType() const override { return true; }
 
+  bool hasBFloat16Type() const override { return true; }
+
   bool useFP16ConversionIntrinsics() const override { return false; }
 
   bool handleTargetFeatures(std::vector<std::string> &Features,
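With the two Clang changes above, LoongArch joins the architectures on which `__bf16` (and C++23 `std::bfloat16_t`) is available: the type is 16 bits wide and 16-bit aligned, uses the BFloat APFloat semantics, and arithmetic on it is emulated through promotion to `float`. A minimal usage sketch (illustrative only, not part of the patch; `scale` is a hypothetical function):

  __bf16 scale(__bf16 x) {
    /* Promoted to float, multiplied there, then rounded back to bf16
       via the __truncsfbf2 helper from compiler-rt. */
    return x * (__bf16)2.0f;
  }

The new test below pins down how single `__bf16` values and small structs containing them are passed and returned under the LoongArch calling conventions.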
diff --git a/clang/test/CodeGen/LoongArch/bfloat-abi.c b/clang/test/CodeGen/LoongArch/bfloat-abi.c
new file mode 100644
index 000000000000..a8a252919ef3
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/bfloat-abi.c
@@ -0,0 +1,532 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// RUN: %clang_cc1 -triple loongarch64 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LA64
+// RUN: %clang_cc1 -triple loongarch32 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LA32
+
+struct bfloat1 {
+  __bf16 a;
+};
+
+// CHECK-LABEL: define dso_local bfloat @h1
+// CHECK-SAME: (bfloat noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT1:%.*]], align 2
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT1]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { bfloat }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load bfloat, ptr [[TMP1]], align 2
+// CHECK-NEXT:    ret bfloat [[TMP2]]
+//
+struct bfloat1 h1(__bf16 a) {
+  struct bfloat1 x;
+  x.a = a;
+  return x;
+}
+
+struct bfloat2 {
+  __bf16 a;
+  __bf16 b;
+};
+
+// CHECK-LABEL: define dso_local { bfloat, bfloat } @h2
+// CHECK-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2:%.*]], align 2
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-NEXT:    store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load bfloat, ptr [[TMP2]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load bfloat, ptr [[TMP4]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { bfloat, bfloat } poison, bfloat [[TMP3]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { bfloat, bfloat } [[TMP6]], bfloat [[TMP5]], 1
+// CHECK-NEXT:    ret { bfloat, bfloat } [[TMP7]]
+//
+struct bfloat2 h2(__bf16 a, __bf16 b) {
+  struct bfloat2 x;
+  x.a = a;
+  x.b = b;
+  return x;
+}
+
+struct bfloat3 {
+  __bf16 a;
+  __bf16 b;
+  __bf16 c;
+};
+
+// CHECK-LA64-LABEL: define dso_local i64 @h3
+// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT:  entry:
+// CHECK-LA64-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT3:%.*]], align 2
+// CHECK-LA64-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca i64, align 8
+// CHECK-LA64-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT:    store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA64-NEXT:    [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT:    store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA64-NEXT:    [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA64-NEXT:    store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i64 6, i1 false)
+// CHECK-LA64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[RETVAL_COERCE]], align 8
+// CHECK-LA64-NEXT:    ret i64 [[TMP3]]
+//
+// CHECK-LA32-LABEL: define dso_local [2 x i32] @h3
+// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT:  entry:
+// CHECK-LA32-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT3:%.*]], align 2
+// CHECK-LA32-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[RETVAL_COERCE:%.*]] = alloca [2 x i32], align 4
+// CHECK-LA32-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT:    store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT:    store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA32-NEXT:    [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[B2:%.*]] =
getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i32 6, i1 false) +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL_COERCE]], align 4 +// CHECK-LA32-NEXT: ret [2 x i32] [[TMP3]] +// +struct bfloat3 h3(__bf16 a, __bf16 b, __bf16 c) { + struct bfloat3 x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct bfloat4 { + __bf16 a; + __bf16 b; + __bf16 c; + __bf16 d; +}; + +// CHECK-LA64-LABEL: define dso_local i64 @h4 +// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT4:%.*]], align 2 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 3 +// CHECK-LA64-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2 +// CHECK-LA64-NEXT: [[TMP4:%.*]] = load i64, ptr [[RETVAL]], align 2 +// CHECK-LA64-NEXT: ret i64 [[TMP4]] +// +// CHECK-LA32-LABEL: define dso_local [2 x i32] @h4 +// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT4:%.*]], align 2 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: 
[[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 3 +// CHECK-LA32-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2 +// CHECK-LA32-NEXT: [[TMP4:%.*]] = load [2 x i32], ptr [[RETVAL]], align 2 +// CHECK-LA32-NEXT: ret [2 x i32] [[TMP4]] +// +struct bfloat4 h4(__bf16 a, __bf16 b, __bf16 c, __bf16 d) { + struct bfloat4 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + return x; +} + +struct floatbfloat { + float a; + __bf16 b; +}; + +// CHECK-LABEL: define dso_local { float, bfloat } @fh +// CHECK-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT:%.*]], align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { float, bfloat }, ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { float, bfloat }, ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load bfloat, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { float, bfloat } poison, float [[TMP3]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { float, bfloat } [[TMP6]], bfloat [[TMP5]], 1 +// CHECK-NEXT: ret { float, bfloat } [[TMP7]] +// +struct floatbfloat fh(float a, __bf16 b) { + struct floatbfloat x; + x.a = a; + x.b = b; + return x; +} + +struct floatbfloat2 { + float a; + __bf16 b; + __bf16 c; +}; + +// CHECK-LA64-LABEL: define dso_local i64 @fh2 +// CHECK-LA64-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT2:%.*]], align 4 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// 
CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load i64, ptr [[RETVAL]], align 4 +// CHECK-LA64-NEXT: ret i64 [[TMP3]] +// +// CHECK-LA32-LABEL: define dso_local [2 x i32] @fh2 +// CHECK-LA32-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT2:%.*]], align 4 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA32-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL]], align 4 +// CHECK-LA32-NEXT: ret [2 x i32] [[TMP3]] +// +struct floatbfloat2 fh2(float a, __bf16 b, __bf16 c) { + struct floatbfloat2 x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct bfloatfloat { + __bf16 a; + float b; +}; + +// CHECK-LABEL: define dso_local { bfloat, float } @hf +// CHECK-SAME: (bfloat noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOATFLOAT:%.*]], align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store float [[B]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOATFLOAT]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOATFLOAT]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: store float [[TMP1]], 
ptr [[B2]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { bfloat, float }, ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { bfloat, float }, ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { bfloat, float } poison, bfloat [[TMP3]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { bfloat, float } [[TMP6]], float [[TMP5]], 1 +// CHECK-NEXT: ret { bfloat, float } [[TMP7]] +// +struct bfloatfloat hf(__bf16 a, float b) { + struct bfloatfloat x; + x.a = a; + x.b = b; + return x; +} + +struct bfloat2float { + __bf16 a; + __bf16 b; + float c; +}; + +// CHECK-LA64-LABEL: define dso_local i64 @h2f +// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2FLOAT:%.*]], align 4 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store float [[C]], ptr [[C_ADDR]], align 4 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load float, ptr [[C_ADDR]], align 4 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store float [[TMP2]], ptr [[C3]], align 4 +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load i64, ptr [[RETVAL]], align 4 +// CHECK-LA64-NEXT: ret i64 [[TMP3]] +// +// CHECK-LA32-LABEL: define dso_local [2 x i32] @h2f +// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2FLOAT:%.*]], align 4 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store float [[C]], ptr [[C_ADDR]], align 4 +// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load float, ptr [[C_ADDR]], align 4 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr 
inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA32-NEXT: store float [[TMP2]], ptr [[C3]], align 4 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL]], align 4 +// CHECK-LA32-NEXT: ret [2 x i32] [[TMP3]] +// +struct bfloat2float h2f(__bf16 a, __bf16 b, float c) { + struct bfloat2float x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct floatbfloat3 { + float a; + __bf16 b; + __bf16 c; + __bf16 d; +}; + +// CHECK-LA64-LABEL: define dso_local [2 x i64] @fh3 +// CHECK-LA64-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT3:%.*]], align 4 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[RETVAL_COERCE:%.*]] = alloca [2 x i64], align 8 +// CHECK-LA64-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 3 +// CHECK-LA64-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 4 +// CHECK-LA64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 4 [[RETVAL]], i64 12, i1 false) +// CHECK-LA64-NEXT: [[TMP4:%.*]] = load [2 x i64], ptr [[RETVAL_COERCE]], align 8 +// CHECK-LA64-NEXT: ret [2 x i64] [[TMP4]] +// +// CHECK-LA32-LABEL: define dso_local void @fh3 +// CHECK-LA32-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_FLOATBFLOAT3:%.*]]) align 4 [[AGG_RESULT:%.*]], float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// CHECK-LA32-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 
+// CHECK-LA32-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 0 +// CHECK-LA32-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 3 +// CHECK-LA32-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 4 +// CHECK-LA32-NEXT: ret void +// +struct floatbfloat3 fh3(float a, __bf16 b, __bf16 c, __bf16 d) { + struct floatbfloat3 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + return x; +} + +struct bfloat5 { + __bf16 a; + __bf16 b; + __bf16 c; + __bf16 d; + __bf16 e; +}; + +// CHECK-LA64-LABEL: define dso_local [2 x i64] @h5 +// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]], bfloat noundef [[E:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT5:%.*]], align 2 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[E_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[RETVAL_COERCE:%.*]] = alloca [2 x i64], align 8 +// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[E]], ptr [[E_ADDR]], align 2 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 3 +// CHECK-LA64-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2 +// CHECK-LA64-NEXT: [[TMP4:%.*]] = load bfloat, ptr [[E_ADDR]], align 2 +// CHECK-LA64-NEXT: [[E5:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 4 +// CHECK-LA64-NEXT: store bfloat [[TMP4]], ptr [[E5]], align 2 +// 
CHECK-LA64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i64 10, i1 false) +// CHECK-LA64-NEXT: [[TMP5:%.*]] = load [2 x i64], ptr [[RETVAL_COERCE]], align 8 +// CHECK-LA64-NEXT: ret [2 x i64] [[TMP5]] +// +// CHECK-LA32-LABEL: define dso_local void @h5 +// CHECK-LA32-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_BFLOAT5:%.*]]) align 2 [[AGG_RESULT:%.*]], bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]], bfloat noundef [[E:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[E_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[E]], ptr [[E_ADDR]], align 2 +// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 0 +// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 3 +// CHECK-LA32-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2 +// CHECK-LA32-NEXT: [[TMP4:%.*]] = load bfloat, ptr [[E_ADDR]], align 2 +// CHECK-LA32-NEXT: [[E5:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 4 +// CHECK-LA32-NEXT: store bfloat [[TMP4]], ptr [[E5]], align 2 +// CHECK-LA32-NEXT: ret void +// +struct bfloat5 h5(__bf16 a, __bf16 b, __bf16 c, __bf16 d, __bf16 e) { + struct bfloat5 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + x.e = e; + return x; +} diff --git a/clang/test/CodeGen/LoongArch/bfloat-mangle.cpp b/clang/test/CodeGen/LoongArch/bfloat-mangle.cpp new file mode 100644 index 000000000000..de4a10dbe44b --- /dev/null +++ b/clang/test/CodeGen/LoongArch/bfloat-mangle.cpp @@ -0,0 +1,12 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// RUN: %clang_cc1 -triple loongarch64 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple loongarch32 -emit-llvm -o - %s | FileCheck %s + +// CHECK-LABEL: define dso_local void @_Z3fooDF16b +// CHECK-SAME: (bfloat noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 
2
+// CHECK-NEXT:    ret void
+//
+void foo(__bf16 b) {}
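The LoongArchISelLowering changes that follow wire up the two standard bf16 tricks: extending bf16 to f32 only needs the 16-bit pattern placed in the high half of a 32-bit value (a shift left by 16, see lowerBF16_TO_FP), while truncating to bf16 has to round and is therefore forwarded to the __truncsfbf2/__truncdfbf2 libcalls (lowerFP_TO_BF16). In portable C the two directions look roughly like this; a sketch of the semantics under round-to-nearest-even with a NaN guard, not the compiler-rt implementation:

  #include <stdint.h>
  #include <string.h>

  static float bf16_to_float(uint16_t b) {
    uint32_t bits = (uint32_t)b << 16; /* bf16 is the high half of an IEEE f32 */
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
  }

  static uint16_t float_to_bf16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    if ((bits & 0x7fffffffu) > 0x7f800000u) /* NaN: truncate but keep it quiet */
      return (uint16_t)((bits >> 16) | 0x0040);
    uint32_t bias = 0x7fffu + ((bits >> 16) & 1); /* round to nearest even */
    return (uint16_t)((bits + bias) >> 16);
  }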
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 8c0d92ab8cd6..b968e051acb0 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -182,6 +182,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
   if (Subtarget.hasBasicF()) {
     setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
     setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+    setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
+    setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
     setCondCodeAction(FPCCToExpand, MVT::f32, Expand);
     setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
 
@@ -203,6 +205,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
                        Subtarget.isSoftFPABI() ? LibCall : Custom);
     setOperationAction(ISD::FP_TO_FP16, MVT::f32,
                        Subtarget.isSoftFPABI() ? LibCall : Custom);
+    setOperationAction(ISD::BF16_TO_FP, MVT::f32, Custom);
+    setOperationAction(ISD::FP_TO_BF16, MVT::f32,
+                       Subtarget.isSoftFPABI() ? LibCall : Custom);
 
     if (Subtarget.is64Bit())
       setOperationAction(ISD::FRINT, MVT::f32, Legal);
@@ -221,6 +226,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
   if (Subtarget.hasBasicD()) {
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
+    setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
     setTruncStoreAction(MVT::f64, MVT::f16, Expand);
     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
     setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
@@ -243,6 +250,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
     setOperationAction(ISD::FP_TO_FP16, MVT::f64,
                        Subtarget.isSoftFPABI() ? LibCall : Custom);
+    setOperationAction(ISD::BF16_TO_FP, MVT::f64, Custom);
+    setOperationAction(ISD::FP_TO_BF16, MVT::f64,
+                       Subtarget.isSoftFPABI() ? LibCall : Custom);
 
     if (Subtarget.is64Bit())
       setOperationAction(ISD::FRINT, MVT::f64, Legal);
@@ -499,6 +509,10 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
     return lowerFP_TO_FP16(Op, DAG);
   case ISD::FP16_TO_FP:
     return lowerFP16_TO_FP(Op, DAG);
+  case ISD::FP_TO_BF16:
+    return lowerFP_TO_BF16(Op, DAG);
+  case ISD::BF16_TO_FP:
+    return lowerBF16_TO_FP(Op, DAG);
   }
   return SDValue();
 }
@@ -2333,6 +2347,36 @@ SDValue LoongArchTargetLowering::lowerFP16_TO_FP(SDValue Op,
   return Res;
 }
 
+SDValue LoongArchTargetLowering::lowerFP_TO_BF16(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  assert(Subtarget.hasBasicF() && "Unexpected custom legalization");
+  SDLoc DL(Op);
+  MakeLibCallOptions CallOptions;
+  RTLIB::Libcall LC =
+      RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
+  SDValue Res =
+      makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
+  if (Subtarget.is64Bit())
+    return DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Res);
+  return DAG.getBitcast(MVT::i32, Res);
+}
+
+SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  assert(Subtarget.hasBasicF() && "Unexpected custom legalization");
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  Op = DAG.getNode(
+      ISD::SHL, DL, Op.getOperand(0).getValueType(), Op.getOperand(0),
+      DAG.getShiftAmountConstant(16, Op.getOperand(0).getValueType(), DL));
+  SDValue Res = Subtarget.is64Bit() ? DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64,
+                                                  DL, MVT::f32, Op)
+                                    : DAG.getBitcast(MVT::f32, Op);
+  if (VT != MVT::f32)
+    return DAG.getNode(ISD::FP_EXTEND, DL, VT, Res);
+  return Res;
+}
+
 static bool isConstantOrUndef(const SDValue Op) {
   if (Op->isUndef())
     return true;
@@ -7993,8 +8037,9 @@ bool LoongArchTargetLowering::splitValueIntoRegisterParts(
   bool IsABIRegCopy = CC.has_value();
   EVT ValueVT = Val.getValueType();
 
-  if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) {
-    // Cast the f16 to i16, extend to i32, pad with ones to make a float
+  if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
+      PartVT == MVT::f32) {
+    // Cast the [b]f16 to i16, extend to i32, pad with ones to make a float
     // nan, and cast to f32.
     Val = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Val);
     Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Val);
@@ -8013,10 +8058,11 @@ SDValue LoongArchTargetLowering::joinRegisterPartsIntoValue(
     MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
   bool IsABIRegCopy = CC.has_value();
 
-  if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) {
+  if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
+      PartVT == MVT::f32) {
     SDValue Val = Parts[0];
 
-    // Cast the f32 to i32, truncate to i16, and cast back to f16.
+    // Cast the f32 to i32, truncate to i16, and cast back to [b]f16.
Val = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Val); Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Val); Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 4b6d3272db2c..53e3f1adb8d2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -373,6 +373,8 @@ private: SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; diff --git a/llvm/test/CodeGen/LoongArch/bf16-promote.ll b/llvm/test/CodeGen/LoongArch/bf16-promote.ll new file mode 100644 index 000000000000..42651eb53ace --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/bf16-promote.ll @@ -0,0 +1,172 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=loongarch64 -mattr=+d -target-abi=lp64d < %s | FileCheck --check-prefixes=CHECK,LA64 %s +; RUN: llc -mtriple=loongarch32 -mattr=+d -target-abi=ilp32d < %s | FileCheck --check-prefixes=CHECK,LA32 %s + +define void @test_load_store(ptr %p, ptr %q) nounwind { +; CHECK-LABEL: test_load_store: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.h $a0, $a0, 0 +; CHECK-NEXT: st.h $a0, $a1, 0 +; CHECK-NEXT: ret + %a = load bfloat, ptr %p + store bfloat %a, ptr %q + ret void +} + +define float @test_fpextend_float(ptr %p) nounwind { +; LA64-LABEL: test_fpextend_float: +; LA64: # %bb.0: +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: slli.d $a0, $a0, 16 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: ret +; +; LA32-LABEL: test_fpextend_float: +; LA32: # %bb.0: +; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: slli.w $a0, $a0, 16 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: ret + %a = load bfloat, ptr %p + %r = fpext bfloat %a to float + ret float %r +} + +define double @test_fpextend_double(ptr %p) nounwind { +; LA64-LABEL: test_fpextend_double: +; LA64: # %bb.0: +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: slli.d $a0, $a0, 16 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: fcvt.d.s $fa0, $fa0 +; LA64-NEXT: ret +; +; LA32-LABEL: test_fpextend_double: +; LA32: # %bb.0: +; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: slli.w $a0, $a0, 16 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: fcvt.d.s $fa0, $fa0 +; LA32-NEXT: ret + %a = load bfloat, ptr %p + %r = fpext bfloat %a to double + ret double %r +} + +define void @test_fptrunc_float(float %f, ptr %p) nounwind { +; LA64-LABEL: test_fptrunc_float: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: move $fp, $a0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: st.h $a0, $fp, 0 +; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32-LABEL: test_fptrunc_float: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: bl __truncsfbf2 +; LA32-NEXT: movfr2gr.s 
$a0, $fa0
+; LA32-NEXT:    st.h $a0, $fp, 0
+; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ret
+  %a = fptrunc float %f to bfloat
+  store bfloat %a, ptr %p
+  ret void
+}
+
+define void @test_fptrunc_double(double %d, ptr %p) nounwind {
+; LA64-LABEL: test_fptrunc_double:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -16
+; LA64-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT:    move $fp, $a0
+; LA64-NEXT:    pcaddu18i $ra, %call36(__truncdfbf2)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    movfr2gr.s $a0, $fa0
+; LA64-NEXT:    st.h $a0, $fp, 0
+; LA64-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 16
+; LA64-NEXT:    ret
+;
+; LA32-LABEL: test_fptrunc_double:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -16
+; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT:    move $fp, $a0
+; LA32-NEXT:    bl __truncdfbf2
+; LA32-NEXT:    movfr2gr.s $a0, $fa0
+; LA32-NEXT:    st.h $a0, $fp, 0
+; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ret
+  %a = fptrunc double %d to bfloat
+  store bfloat %a, ptr %p
+  ret void
+}
+
+define void @test_fadd(ptr %p, ptr %q) nounwind {
+; LA64-LABEL: test_fadd:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -16
+; LA64-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT:    ld.hu $a1, $a1, 0
+; LA64-NEXT:    move $fp, $a0
+; LA64-NEXT:    ld.hu $a0, $a0, 0
+; LA64-NEXT:    slli.d $a1, $a1, 16
+; LA64-NEXT:    movgr2fr.w $fa0, $a1
+; LA64-NEXT:    slli.d $a0, $a0, 16
+; LA64-NEXT:    movgr2fr.w $fa1, $a0
+; LA64-NEXT:    fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT:    pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    movfr2gr.s $a0, $fa0
+; LA64-NEXT:    st.h $a0, $fp, 0
+; LA64-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 16
+; LA64-NEXT:    ret
+;
+; LA32-LABEL: test_fadd:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -16
+; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT:    ld.hu $a1, $a1, 0
+; LA32-NEXT:    move $fp, $a0
+; LA32-NEXT:    ld.hu $a0, $a0, 0
+; LA32-NEXT:    slli.w $a1, $a1, 16
+; LA32-NEXT:    movgr2fr.w $fa0, $a1
+; LA32-NEXT:    slli.w $a0, $a0, 16
+; LA32-NEXT:    movgr2fr.w $fa1, $a0
+; LA32-NEXT:    fadd.s $fa0, $fa1, $fa0
+; LA32-NEXT:    bl __truncsfbf2
+; LA32-NEXT:    movfr2gr.s $a0, $fa0
+; LA32-NEXT:    st.h $a0, $fp, 0
+; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ret
+  %a = load bfloat, ptr %p
+  %b = load bfloat, ptr %q
+  %r = fadd bfloat %a, %b
+  store bfloat %r, ptr %p
+  ret void
+}
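One recurring pattern in the ABI test that follows deserves a note: whenever a bare bfloat is boxed into a floating-point register under the hard-float conventions, the checks contain `lu12i.w $aN, -16` followed by an `or`, i.e. the 16-bit payload is OR-ed with 0xffff0000. That is exactly the register-part boxing from splitValueIntoRegisterParts above ("pad with ones to make a float nan"): the bf16 bits sit in the low half of the 32-bit FPR image and the all-ones upper half makes the register hold an f32 NaN. Roughly, as an illustrative helper rather than code from the patch:

  #include <stdint.h>

  static uint32_t box_bf16_for_fpr(uint16_t payload) {
    /* lu12i.w $a1, -16 loads -16 << 12 == 0xffff0000; or-ing the payload
       into the low half yields an f32 NaN pattern carrying the bf16 bits. */
    return 0xffff0000u | payload;
  }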
diff --git a/llvm/test/CodeGen/LoongArch/bf16.ll b/llvm/test/CodeGen/LoongArch/bf16.ll
new file mode 100644
index 000000000000..e580bcc69f52
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/bf16.ll
@@ -0,0 +1,1048 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;; For `double` parameters and return values, compiling on loongarch32 with `-mattr=+d` and
+;; `-target-abi=ilp32s` is incompatible, resulting in the error 'Passing f64 with GPR on LA32 is undefined'.
+;; Therefore, such cases are currently skipped in testing.
+; RUN: llc -mtriple=loongarch32 -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64
+; RUN: llc -mtriple=loongarch32 -mattr=+f -target-abi=ilp32s -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32F-ILP32S
+; RUN: llc -mtriple=loongarch32 -mattr=+f -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32F-ILP32D
+; RUN: llc -mtriple=loongarch32 -mattr=+d -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32D-ILP32D
+; RUN: llc -mtriple=loongarch64 -mattr=+f -target-abi=lp64s -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64F-LP64S
+; RUN: llc -mtriple=loongarch64 -mattr=+f -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64F-LP64D
+; RUN: llc -mtriple=loongarch64 -mattr=+d -target-abi=lp64s -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64D-LP64S
+; RUN: llc -mtriple=loongarch64 -mattr=+d -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64D-LP64D
+
+define bfloat @float_to_bfloat(float %a) nounwind {
+; LA32-LABEL: float_to_bfloat:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -16
+; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    bl __truncsfbf2
+; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: float_to_bfloat:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -16
+; LA64-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    movfr2gr.s $a0, $fa0
+; LA64-NEXT:    lu12i.w $a1, -16
+; LA64-NEXT:    or $a0, $a0, $a1
+; LA64-NEXT:    movgr2fr.w $fa0, $a0
+; LA64-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 16
+; LA64-NEXT:    ret
+;
+; LA32F-ILP32S-LABEL: float_to_bfloat:
+; LA32F-ILP32S:       # %bb.0:
+; LA32F-ILP32S-NEXT:    addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT:    bl __truncsfbf2
+; LA32F-ILP32S-NEXT:    lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT:    or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT:    addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT:    ret
+;
+; LA32F-ILP32D-LABEL: float_to_bfloat:
+; LA32F-ILP32D:       # %bb.0:
+; LA32F-ILP32D-NEXT:    addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT:    bl __truncsfbf2
+; LA32F-ILP32D-NEXT:    movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT:    lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT:    or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT:    movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT:    addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT:    ret
+;
+; LA32D-ILP32D-LABEL: float_to_bfloat:
+; LA32D-ILP32D:       # %bb.0:
+; LA32D-ILP32D-NEXT:    addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT:    bl __truncsfbf2
+; LA32D-ILP32D-NEXT:    movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT:    lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT:    or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT:    movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT:    addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT:    ret
+;
+; LA64F-LP64S-LABEL: float_to_bfloat:
+; LA64F-LP64S:       # %bb.0:
+; LA64F-LP64S-NEXT:    addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT:    st.d $ra, $sp, 8 #
8-byte Folded Spill +; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64F-LP64S-NEXT: jirl $ra, $ra, 0 +; LA64F-LP64S-NEXT: lu12i.w $a1, -16 +; LA64F-LP64S-NEXT: or $a0, $a0, $a1 +; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16 +; LA64F-LP64S-NEXT: ret +; +; LA64F-LP64D-LABEL: float_to_bfloat: +; LA64F-LP64D: # %bb.0: +; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16 +; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64F-LP64D-NEXT: jirl $ra, $ra, 0 +; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0 +; LA64F-LP64D-NEXT: lu12i.w $a1, -16 +; LA64F-LP64D-NEXT: or $a0, $a0, $a1 +; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0 +; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16 +; LA64F-LP64D-NEXT: ret +; +; LA64D-LP64S-LABEL: float_to_bfloat: +; LA64D-LP64S: # %bb.0: +; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16 +; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64D-LP64S-NEXT: jirl $ra, $ra, 0 +; LA64D-LP64S-NEXT: lu12i.w $a1, -16 +; LA64D-LP64S-NEXT: or $a0, $a0, $a1 +; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16 +; LA64D-LP64S-NEXT: ret +; +; LA64D-LP64D-LABEL: float_to_bfloat: +; LA64D-LP64D: # %bb.0: +; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16 +; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64D-LP64D-NEXT: jirl $ra, $ra, 0 +; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0 +; LA64D-LP64D-NEXT: lu12i.w $a1, -16 +; LA64D-LP64D-NEXT: or $a0, $a0, $a1 +; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0 +; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16 +; LA64D-LP64D-NEXT: ret + %1 = fptrunc float %a to bfloat + ret bfloat %1 +} + +define bfloat @double_to_bfloat(double %a) nounwind { +; LA32-LABEL: double_to_bfloat: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl __truncdfbf2 +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: double_to_bfloat: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: lu12i.w $a1, -16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32F-ILP32S-LABEL: double_to_bfloat: +; LA32F-ILP32S: # %bb.0: +; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16 +; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-ILP32S-NEXT: bl __truncdfbf2 +; LA32F-ILP32S-NEXT: lu12i.w $a1, -16 +; LA32F-ILP32S-NEXT: or $a0, $a0, $a1 +; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16 +; LA32F-ILP32S-NEXT: ret +; +; LA32F-ILP32D-LABEL: double_to_bfloat: +; LA32F-ILP32D: # %bb.0: +; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16 +; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-ILP32D-NEXT: bl __truncdfbf2 +; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0 +; LA32F-ILP32D-NEXT: lu12i.w $a1, -16 +; LA32F-ILP32D-NEXT: or $a0, $a0, $a1 +; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0 +; 
LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16 +; LA32F-ILP32D-NEXT: ret +; +; LA32D-ILP32D-LABEL: double_to_bfloat: +; LA32D-ILP32D: # %bb.0: +; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16 +; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32D-ILP32D-NEXT: bl __truncdfbf2 +; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0 +; LA32D-ILP32D-NEXT: lu12i.w $a1, -16 +; LA32D-ILP32D-NEXT: or $a0, $a0, $a1 +; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0 +; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16 +; LA32D-ILP32D-NEXT: ret +; +; LA64F-LP64S-LABEL: double_to_bfloat: +; LA64F-LP64S: # %bb.0: +; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16 +; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64F-LP64S-NEXT: jirl $ra, $ra, 0 +; LA64F-LP64S-NEXT: lu12i.w $a1, -16 +; LA64F-LP64S-NEXT: or $a0, $a0, $a1 +; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16 +; LA64F-LP64S-NEXT: ret +; +; LA64F-LP64D-LABEL: double_to_bfloat: +; LA64F-LP64D: # %bb.0: +; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16 +; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64F-LP64D-NEXT: jirl $ra, $ra, 0 +; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0 +; LA64F-LP64D-NEXT: lu12i.w $a1, -16 +; LA64F-LP64D-NEXT: or $a0, $a0, $a1 +; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0 +; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16 +; LA64F-LP64D-NEXT: ret +; +; LA64D-LP64S-LABEL: double_to_bfloat: +; LA64D-LP64S: # %bb.0: +; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16 +; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64D-LP64S-NEXT: jirl $ra, $ra, 0 +; LA64D-LP64S-NEXT: lu12i.w $a1, -16 +; LA64D-LP64S-NEXT: or $a0, $a0, $a1 +; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16 +; LA64D-LP64S-NEXT: ret +; +; LA64D-LP64D-LABEL: double_to_bfloat: +; LA64D-LP64D: # %bb.0: +; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16 +; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64D-LP64D-NEXT: jirl $ra, $ra, 0 +; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0 +; LA64D-LP64D-NEXT: lu12i.w $a1, -16 +; LA64D-LP64D-NEXT: or $a0, $a0, $a1 +; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0 +; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16 +; LA64D-LP64D-NEXT: ret + %1 = fptrunc double %a to bfloat + ret bfloat %1 +} + +define float @bfloat_to_float(bfloat %a) nounwind { +; LA32-LABEL: bfloat_to_float: +; LA32: # %bb.0: +; LA32-NEXT: slli.w $a0, $a0, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bfloat_to_float: +; LA64: # %bb.0: +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: slli.d $a0, $a0, 16 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: ret +; +; LA32F-ILP32S-LABEL: bfloat_to_float: +; LA32F-ILP32S: # %bb.0: +; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16 +; LA32F-ILP32S-NEXT: ret +; +; LA32F-ILP32D-LABEL: bfloat_to_float: +; LA32F-ILP32D: # %bb.0: +; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0 +; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16 +; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0 +; LA32F-ILP32D-NEXT: ret +; +; LA32D-ILP32D-LABEL: bfloat_to_float: +; LA32D-ILP32D: # %bb.0: +; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0 +; 
LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_to_float:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_to_float:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_to_float:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_to_float:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ret
+ %1 = fpext bfloat %a to float
+ ret float %1
+}
+
+define double @bfloat_to_double(bfloat %a) nounwind {
+; LA32-LABEL: bfloat_to_double:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: bl __extendsfdf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_to_double:
+; LA64: # %bb.0:
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: fcvt.d.s $fa0, $fa0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_to_double:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: bl __extendsfdf2
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_to_double:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: bl __extendsfdf2
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_to_double:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: fcvt.d.s $fa0, $fa0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_to_double:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64S-NEXT: fcvt.d.s $fa0, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.d $a0, $fa0
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_to_double:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: fcvt.d.s $fa0, $fa0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_to_double:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64S-NEXT: fcvt.d.s $fa0, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.d $a0, $fa0
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_to_double:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: fcvt.d.s $fa0, $fa0
+; LA64D-LP64D-NEXT: ret
+ %1 = fpext bfloat %a to double
+ ret double %1
+}
+
+define bfloat @i16_to_bfloat(i16 %a) nounwind {
+; LA32-LABEL: i16_to_bfloat:
+; LA32: # %bb.0:
+; LA32-NEXT: ret
+;
+; LA64-LABEL: i16_to_bfloat:
+; LA64: # %bb.0:
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: i16_to_bfloat:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: i16_to_bfloat:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: i16_to_bfloat:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: i16_to_bfloat:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: i16_to_bfloat:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: i16_to_bfloat:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: i16_to_bfloat:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ret
+ %1 = bitcast i16 %a to bfloat
+ ret bfloat %1
+}
+
+define i16 @bfloat_to_i16(bfloat %a) nounwind {
+; LA32-LABEL: bfloat_to_i16:
+; LA32: # %bb.0:
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_to_i16:
+; LA64: # %bb.0:
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_to_i16:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_to_i16:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_to_i16:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_to_i16:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_to_i16:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_to_i16:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_to_i16:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: ret
+ %1 = bitcast bfloat %a to i16
+ ret i16 %1
+}
+
+define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
+; LA32-LABEL: bfloat_add:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: slli.w $a1, $a1, 16
+; LA32-NEXT: bl __addsf3
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_add:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: movfr2gr.s $a1, $fa1
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_add:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32S-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_add:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32F-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_add:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32D-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32D-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_add:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_add:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64F-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_add:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_add:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64D-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+ %1 = fadd bfloat %a, %b
+ ret bfloat %1
+}
+
+define bfloat @bfloat_load(ptr %a) nounwind {
+; LA32-LABEL: bfloat_load:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: ld.h $a1, $a0, 0
+; LA32-NEXT: ld.h $a2, $a0, 6
+; LA32-NEXT: slli.w $a0, $a1, 16
+; LA32-NEXT: slli.w $a1, $a2, 16
+; LA32-NEXT: bl __addsf3
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_load:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: ld.hu $a1, $a0, 6
+; LA64-NEXT: ld.hu $a0, $a0, 0
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_load:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: ld.hu $a1, $a0, 6
+; LA32F-ILP32S-NEXT: ld.hu $a0, $a0, 0
+; LA32F-ILP32S-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32S-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_load:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: ld.hu $a1, $a0, 6
+; LA32F-ILP32D-NEXT: ld.hu $a0, $a0, 0
+; LA32F-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_load:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: ld.hu $a1, $a0, 6
+; LA32D-ILP32D-NEXT: ld.hu $a0, $a0, 0
+; LA32D-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32D-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_load:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: ld.hu $a1, $a0, 6
+; LA64F-LP64S-NEXT: ld.hu $a0, $a0, 0
+; LA64F-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_load:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: ld.hu $a1, $a0, 6
+; LA64F-LP64D-NEXT: ld.hu $a0, $a0, 0
+; LA64F-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_load:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: ld.hu $a1, $a0, 6
+; LA64D-LP64S-NEXT: ld.hu $a0, $a0, 0
+; LA64D-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_load:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: ld.hu $a1, $a0, 6
+; LA64D-LP64D-NEXT: ld.hu $a0, $a0, 0
+; LA64D-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+ %1 = load bfloat, ptr %a
+ %2 = getelementptr bfloat, ptr %a, i32 3
+ %3 = load bfloat, ptr %2
+ %4 = fadd bfloat %1, %3
+ ret bfloat %4
+}
+
+define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
+; LA32-LABEL: bfloat_store:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: move $fp, $a0
+; LA32-NEXT: slli.w $a0, $a1, 16
+; LA32-NEXT: slli.w $a1, $a2, 16
+; LA32-NEXT: bl __addsf3
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: st.h $a0, $fp, 0
+; LA32-NEXT: st.h $a0, $fp, 16
+; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_store:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT: move $fp, $a0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: movfr2gr.s $a1, $fa1
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: st.h $a0, $fp, 0
+; LA64-NEXT: st.h $a0, $fp, 16
+; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_store:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: move $fp, $a0
+; LA32F-ILP32S-NEXT: slli.w $a0, $a2, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32S-NEXT: slli.w $a0, $a1, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32S-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: st.h $a0, $fp, 0
+; LA32F-ILP32S-NEXT: st.h $a0, $fp, 16
+; LA32F-ILP32S-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_store:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: move $fp, $a0
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32F-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: st.h $a0, $fp, 0
+; LA32F-ILP32D-NEXT: st.h $a0, $fp, 16
+; LA32F-ILP32D-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_store:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: move $fp, $a0
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32D-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32D-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: st.h $a0, $fp, 0
+; LA32D-ILP32D-NEXT: st.h $a0, $fp, 16
+; LA32D-ILP32D-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_store:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: move $fp, $a0
+; LA64F-LP64S-NEXT: slli.d $a0, $a2, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64S-NEXT: slli.d $a0, $a1, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: st.h $a0, $fp, 0
+; LA64F-LP64S-NEXT: st.h $a0, $fp, 16
+; LA64F-LP64S-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_store:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: move $fp, $a0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64F-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: st.h $a0, $fp, 0
+; LA64F-LP64D-NEXT: st.h $a0, $fp, 16
+; LA64F-LP64D-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_store:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: move $fp, $a0
+; LA64D-LP64S-NEXT: slli.d $a0, $a2, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64S-NEXT: slli.d $a0, $a1, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: st.h $a0, $fp, 0
+; LA64D-LP64S-NEXT: st.h $a0, $fp, 16
+; LA64D-LP64S-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_store:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: move $fp, $a0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64D-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: st.h $a0, $fp, 0
+; LA64D-LP64D-NEXT: st.h $a0, $fp, 16
+; LA64D-LP64D-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+ %1 = fadd bfloat %b, %c
+ store bfloat %1, ptr %a
+ %2 = getelementptr bfloat, ptr %a, i32 8
+ store bfloat %1, ptr %2
+ ret void
+}
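The checks above all exercise the same lowering strategy, since LoongArch has no native bf16 arithmetic instructions: a bfloat travels as its 16-bit bit pattern (boxed with all-ones upper bits before being moved into an FPR on the hard-float ABIs), fpext is a plain 16-bit left shift into a float, and arithmetic is performed in float and rounded back through the __truncsfbf2 libcall. A minimal C-level sketch of the pattern behind the bfloat_add checks, assuming a Clang that accepts __bf16 arithmetic (the function name is illustrative, not part of the patch):

    __bf16 add_bf16(__bf16 a, __bf16 b) {
      /* Each operand is promoted to float (bit pattern shifted left 16),
         the addition is done with fadd.s (or the __addsf3 libcall when no
         FPU is available), and the result is rounded back to bf16 via
         the __truncsfbf2 runtime call, as in the assembly above. */
      return a + b;
    }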