LLVM/Test: Add vectorizing testcases for fmaximumnum and fminimumnum (#133843)

Vectorizing of fmaximumnum and fminimumnum is not supported yet. Let's
add the testcases for them now; we will update the testcases once
support is added.
This commit is contained in:
YunQiang Su
2025-04-02 08:46:02 +08:00
committed by GitHub
parent ad1ca5f4a2
commit e25187bc3e
6 changed files with 2307 additions and 0 deletions

View File

@@ -0,0 +1,255 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
; RUN: opt --passes=loop-vectorize --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s
; Scalar loop: output[i] = llvm.minimumnum.f32(input1[i], input2[i]) for
; i in [0, 4096). The CHECK lines record the still-scalar body emitted by
; loop-vectorize (no vector minimumnum support yet — see FIXME above).
define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmin32(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.minimumnum.f32(float [[IN1]], float [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv
%in1 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv
%in2 = load float, ptr %arrayidx2, align 4
%out = tail call float @llvm.minimumnum.f32(float %in1, float %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv
store float %out, ptr %arrayidx4, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmin32 above.
declare float @llvm.minimumnum.f32(float, float)
; Scalar loop: output[i] = llvm.maximumnum.f32(input1[i], input2[i]) for
; i in [0, 4096). CHECK lines record the unvectorized body.
define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmax32(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.maximumnum.f32(float [[IN1]], float [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv
%in1 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv
%in2 = load float, ptr %arrayidx2, align 4
%out = tail call float @llvm.maximumnum.f32(float %in1, float %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv
store float %out, ptr %arrayidx4, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmax32 above.
declare float @llvm.maximumnum.f32(float, float)
; Same pattern as @fmin32 but at f64: output[i] =
; llvm.minimumnum.f64(input1[i], input2[i]) over 4096 doubles.
define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmin64(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.minimumnum.f64(double [[IN1]], double [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv
%in1 = load double, ptr %arrayidx, align 8
%arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv
%in2 = load double, ptr %arrayidx2, align 8
%out = tail call double @llvm.minimumnum.f64(double %in1, double %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv
store double %out, ptr %arrayidx4, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmin64 above.
declare double @llvm.minimumnum.f64(double, double)
; Same pattern as @fmax32 but at f64: output[i] =
; llvm.maximumnum.f64(input1[i], input2[i]) over 4096 doubles.
define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmax64(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.maximumnum.f64(double [[IN1]], double [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv
%in1 = load double, ptr %arrayidx, align 8
%arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv
%in2 = load double, ptr %arrayidx2, align 8
%out = tail call double @llvm.maximumnum.f64(double %in1, double %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv
store double %out, ptr %arrayidx4, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmax64 above.
declare double @llvm.maximumnum.f64(double, double)
; Same pattern at f16 (half): output[i] =
; llvm.minimumnum.f16(input1[i], input2[i]) over 4096 halves.
define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmin16(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.minimumnum.f16(half [[IN1]], half [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv
%in1 = load half, ptr %arrayidx, align 2
%arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv
%in2 = load half, ptr %arrayidx2, align 2
%out = tail call half @llvm.minimumnum.f16(half %in1, half %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv
store half %out, ptr %arrayidx4, align 2
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmin16 above.
declare half @llvm.minimumnum.f16(half, half)
; Same pattern at f16 (half): output[i] =
; llvm.maximumnum.f16(input1[i], input2[i]) over 4096 halves.
define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmax16(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.maximumnum.f16(half [[IN1]], half [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv
%in1 = load half, ptr %arrayidx, align 2
%arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv
%in2 = load half, ptr %arrayidx2, align 2
%out = tail call half @llvm.maximumnum.f16(half %in1, half %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv
store half %out, ptr %arrayidx4, align 2
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmax16 above.
declare half @llvm.maximumnum.f16(half, half)

View File

@@ -0,0 +1,255 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
; RUN: opt --passes=loop-vectorize --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s
; Scalar loop: output[i] = llvm.minimumnum.f32(input1[i], input2[i]) for
; i in [0, 4096). The CHECK lines record the still-scalar body emitted by
; loop-vectorize (no vector minimumnum support yet — see FIXME above).
define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmin32(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.minimumnum.f32(float [[IN1]], float [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv
%in1 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv
%in2 = load float, ptr %arrayidx2, align 4
%out = tail call float @llvm.minimumnum.f32(float %in1, float %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv
store float %out, ptr %arrayidx4, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmin32 above.
declare float @llvm.minimumnum.f32(float, float)
; Scalar loop: output[i] = llvm.maximumnum.f32(input1[i], input2[i]) for
; i in [0, 4096). CHECK lines record the unvectorized body.
define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmax32(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.maximumnum.f32(float [[IN1]], float [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv
%in1 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv
%in2 = load float, ptr %arrayidx2, align 4
%out = tail call float @llvm.maximumnum.f32(float %in1, float %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv
store float %out, ptr %arrayidx4, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmax32 above.
declare float @llvm.maximumnum.f32(float, float)
; Same pattern as @fmin32 but at f64: output[i] =
; llvm.minimumnum.f64(input1[i], input2[i]) over 4096 doubles.
define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmin64(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.minimumnum.f64(double [[IN1]], double [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv
%in1 = load double, ptr %arrayidx, align 8
%arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv
%in2 = load double, ptr %arrayidx2, align 8
%out = tail call double @llvm.minimumnum.f64(double %in1, double %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv
store double %out, ptr %arrayidx4, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmin64 above.
declare double @llvm.minimumnum.f64(double, double)
; Same pattern as @fmax32 but at f64: output[i] =
; llvm.maximumnum.f64(input1[i], input2[i]) over 4096 doubles.
define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmax64(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.maximumnum.f64(double [[IN1]], double [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv
%in1 = load double, ptr %arrayidx, align 8
%arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv
%in2 = load double, ptr %arrayidx2, align 8
%out = tail call double @llvm.maximumnum.f64(double %in1, double %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv
store double %out, ptr %arrayidx4, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmax64 above.
declare double @llvm.maximumnum.f64(double, double)
; Same pattern at f16 (half): output[i] =
; llvm.minimumnum.f16(input1[i], input2[i]) over 4096 halves.
define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmin16(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.minimumnum.f16(half [[IN1]], half [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv
%in1 = load half, ptr %arrayidx, align 2
%arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv
%in2 = load half, ptr %arrayidx2, align 2
%out = tail call half @llvm.minimumnum.f16(half %in1, half %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv
store half %out, ptr %arrayidx4, align 2
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmin16 above.
declare half @llvm.minimumnum.f16(half, half)
; Same pattern at f16 (half): output[i] =
; llvm.maximumnum.f16(input1[i], input2[i]) over 4096 halves.
define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmax16(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.maximumnum.f16(half [[IN1]], half [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv
%in1 = load half, ptr %arrayidx, align 2
%arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv
%in2 = load half, ptr %arrayidx2, align 2
%out = tail call half @llvm.maximumnum.f16(half %in1, half %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv
store half %out, ptr %arrayidx4, align 2
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
; Scalar intrinsic exercised by @fmax16 above.
declare half @llvm.maximumnum.f16(half, half)

View File

@@ -0,0 +1,255 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
; RUN: opt --passes=loop-vectorize --mtriple=x86_64 -S < %s | FileCheck %s
; Scalar loop: output[i] = llvm.minimumnum.f32(input1[i], input2[i]) for
; i in [0, 4096). Per the FIXME above, the loop vectorizer does not yet
; handle minimumnum, so the checks match the unchanged scalar loop.
define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmin32(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.minimumnum.f32(float [[IN1]], float [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
; Canonical counted loop over a fixed trip count of 4096.
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv
%in1 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv
%in2 = load float, ptr %arrayidx2, align 4
%out = tail call float @llvm.minimumnum.f32(float %in1, float %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv
store float %out, ptr %arrayidx4, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
declare float @llvm.minimumnum.f32(float, float)
; Same shape as @fmin32 above, but exercising llvm.maximumnum.f32.
; The checks match the unchanged scalar loop (no vectorization yet).
define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmax32(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.maximumnum.f32(float [[IN1]], float [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv
%in1 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv
%in2 = load float, ptr %arrayidx2, align 4
%out = tail call float @llvm.maximumnum.f32(float %in1, float %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv
store float %out, ptr %arrayidx4, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
declare float @llvm.maximumnum.f32(float, float)
; f64 variant of @fmin32: output[i] = llvm.minimumnum.f64(input1[i],
; input2[i]) for i in [0, 4096). Checks match the scalar loop.
define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmin64(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.minimumnum.f64(double [[IN1]], double [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv
%in1 = load double, ptr %arrayidx, align 8
%arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv
%in2 = load double, ptr %arrayidx2, align 8
%out = tail call double @llvm.minimumnum.f64(double %in1, double %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv
store double %out, ptr %arrayidx4, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
declare double @llvm.minimumnum.f64(double, double)
; f64 variant of @fmax32, exercising llvm.maximumnum.f64.
; Checks match the unchanged scalar loop.
define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmax64(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.maximumnum.f64(double [[IN1]], double [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv
%in1 = load double, ptr %arrayidx, align 8
%arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv
%in2 = load double, ptr %arrayidx2, align 8
%out = tail call double @llvm.maximumnum.f64(double %in1, double %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv
store double %out, ptr %arrayidx4, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
declare double @llvm.maximumnum.f64(double, double)
; f16 (half) variant of @fmin32, exercising llvm.minimumnum.f16.
; Checks match the unchanged scalar loop.
define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmin16(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.minimumnum.f16(half [[IN1]], half [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv
%in1 = load half, ptr %arrayidx, align 2
%arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv
%in2 = load half, ptr %arrayidx2, align 2
%out = tail call half @llvm.minimumnum.f16(half %in1, half %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv
store half %out, ptr %arrayidx4, align 2
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
declare half @llvm.minimumnum.f16(half, half)
; f16 (half) variant of @fmax32, exercising llvm.maximumnum.f16.
; Checks match the unchanged scalar loop.
define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) {
; CHECK-LABEL: define void @fmax16(
; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.maximumnum.f16(half [[IN1]], half [[IN2]])
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv
%in1 = load half, ptr %arrayidx, align 2
%arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv
%in2 = load half, ptr %arrayidx2, align 2
%out = tail call half @llvm.maximumnum.f16(half %in1, half %in2)
%arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv
store half %out, ptr %arrayidx4, align 2
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 4096
br i1 %exitcond.not, label %exit, label %for.body
exit:
ret void
}
declare half @llvm.maximumnum.f16(half, half)

View File

@@ -0,0 +1,516 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt --passes=slp-vectorizer --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s
; 9-element, 16-byte-aligned global arrays (f32/f64/f16) used as the
; inputs and outputs of the SLP-vectorizer tests below.
@input1_f32 = global [9 x float] zeroinitializer, align 16
@input2_f32 = global [9 x float] zeroinitializer, align 16
@output_f32 = global [9 x float] zeroinitializer, align 16
@input1_f64 = global [9 x double] zeroinitializer, align 16
@input2_f64 = global [9 x double] zeroinitializer, align 16
@output_f64 = global [9 x double] zeroinitializer, align 16
@input1_f16 = global [9 x half] zeroinitializer, align 16
@input2_f16 = global [9 x half] zeroinitializer, align 16
@output_f16 = global [9 x half] zeroinitializer, align 16
; Straight-line code: nine consecutive llvm.minimumnum.f32 ops on the
; global arrays. The SLP vectorizer currently leaves all nine scalar,
; so the checks match the input code one-for-one.
define void @fmin32() {
; CHECK-LABEL: define void @fmin32(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
; CHECK-NEXT: ret void
;
entry:
%input0_0 = load float, ptr @input1_f32, align 16
%input0_1 = load float, ptr @input2_f32, align 16
%output0 = tail call float @llvm.minimumnum.f32(float %input0_0, float %input0_1)
store float %output0, ptr @output_f32, align 16
%input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
%input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
%output1 = tail call float @llvm.minimumnum.f32(float %input1_1, float %input1_2)
store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
%input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
%input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
%output2 = tail call float @llvm.minimumnum.f32(float %input2_1, float %input2_2)
store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
%input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
%input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
%output3 = tail call float @llvm.minimumnum.f32(float %input3_1, float %input3_2)
store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
%input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
%input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
%output4 = tail call float @llvm.minimumnum.f32(float %input4_1, float %input4_2)
store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
%input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
%input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
%output5 = tail call float @llvm.minimumnum.f32(float %input5_1, float %input5_2)
store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
%input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
%input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
%output6 = tail call float @llvm.minimumnum.f32(float %input6_1, float %input6_2)
store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
%input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
%input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
%output7 = tail call float @llvm.minimumnum.f32(float %input7_1, float %input7_2)
store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
%input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
%input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
%output8 = tail call float @llvm.minimumnum.f32(float %input8_1, float %input8_2)
store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
ret void
}
declare float @llvm.minimumnum.f32(float, float)
; Same shape as @fmin32 above, but with llvm.maximumnum.f32: nine
; consecutive scalar ops that the SLP vectorizer currently leaves alone.
define void @fmax32() {
; CHECK-LABEL: define void @fmax32(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
; CHECK-NEXT: ret void
;
entry:
%input0_0 = load float, ptr @input1_f32, align 16
%input0_1 = load float, ptr @input2_f32, align 16
%output0 = tail call float @llvm.maximumnum.f32(float %input0_0, float %input0_1)
store float %output0, ptr @output_f32, align 16
%input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
%input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
%output1 = tail call float @llvm.maximumnum.f32(float %input1_1, float %input1_2)
store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
%input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
%input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
%output2 = tail call float @llvm.maximumnum.f32(float %input2_1, float %input2_2)
store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
%input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
%input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
%output3 = tail call float @llvm.maximumnum.f32(float %input3_1, float %input3_2)
store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
%input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
%input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
%output4 = tail call float @llvm.maximumnum.f32(float %input4_1, float %input4_2)
store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
%input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
%input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
%output5 = tail call float @llvm.maximumnum.f32(float %input5_1, float %input5_2)
store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
%input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
%input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
%output6 = tail call float @llvm.maximumnum.f32(float %input6_1, float %input6_2)
store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
%input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
%input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
%output7 = tail call float @llvm.maximumnum.f32(float %input7_1, float %input7_2)
store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
%input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
%input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
%output8 = tail call float @llvm.maximumnum.f32(float %input8_1, float %input8_2)
store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
ret void
}
declare float @llvm.maximumnum.f32(float, float)
define void @fmin64() {
; CHECK-LABEL: define void @fmin64(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
; CHECK-NEXT: ret void
;
entry:
%input0_0 = load double, ptr @input1_f64, align 16
%input0_1 = load double, ptr @input2_f64, align 16
%output0 = tail call double @llvm.minimumnum.f64(double %input0_0, double %input0_1)
store double %output0, ptr @output_f64, align 16
%input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
%input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
%output1 = tail call double @llvm.minimumnum.f64(double %input1_1, double %input1_2)
store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
%input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
%input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
%output2 = tail call double @llvm.minimumnum.f64(double %input2_1, double %input2_2)
store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
%input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
%input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
%output3 = tail call double @llvm.minimumnum.f64(double %input3_1, double %input3_2)
store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
%input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
%input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
%output4 = tail call double @llvm.minimumnum.f64(double %input4_1, double %input4_2)
store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
%input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
%input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
%output5 = tail call double @llvm.minimumnum.f64(double %input5_1, double %input5_2)
store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
%input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
%input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
%output6 = tail call double @llvm.minimumnum.f64(double %input6_1, double %input6_2)
store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
%input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
%input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
%output7 = tail call double @llvm.minimumnum.f64(double %input7_1, double %input7_2)
store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
%input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
%input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
%output8 = tail call double @llvm.minimumnum.f64(double %input8_1, double %input8_2)
store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
ret void
}
; Scalar declaration of the minimumnum intrinsic exercised by the f64 test above.
declare double @llvm.minimumnum.f64(double, double)
; fmax64: applies llvm.maximumnum.f64 element-wise to the first nine doubles
; of @input1_f64/@input2_f64 (byte offsets 0..64, alternating align 16/8) and
; stores each result into @output_f64 at the matching offset. The autogenerated
; CHECK lines record the current, still fully scalar output; regenerate them
; with update_test_checks.py once maximumnum gains vectorization support
; (see the FIXME at the top of the file).
define void @fmax64() {
; CHECK-LABEL: define void @fmax64(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
; CHECK-NEXT: ret void
;
entry:
; Input IR: nine independent load/load/call/store groups, no loop.
  %input0_0 = load double, ptr @input1_f64, align 16
  %input0_1 = load double, ptr @input2_f64, align 16
  %output0 = tail call double @llvm.maximumnum.f64(double %input0_0, double %input0_1)
  store double %output0, ptr @output_f64, align 16
  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
  %output1 = tail call double @llvm.maximumnum.f64(double %input1_1, double %input1_2)
  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
  %output2 = tail call double @llvm.maximumnum.f64(double %input2_1, double %input2_2)
  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
  %output3 = tail call double @llvm.maximumnum.f64(double %input3_1, double %input3_2)
  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
  %output4 = tail call double @llvm.maximumnum.f64(double %input4_1, double %input4_2)
  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
  %output5 = tail call double @llvm.maximumnum.f64(double %input5_1, double %input5_2)
  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
  %output6 = tail call double @llvm.maximumnum.f64(double %input6_1, double %input6_2)
  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
  %output7 = tail call double @llvm.maximumnum.f64(double %input7_1, double %input7_2)
  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
  %output8 = tail call double @llvm.maximumnum.f64(double %input8_1, double %input8_2)
  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
  ret void
}
; Scalar declaration of the maximumnum intrinsic used by @fmax64.
declare double @llvm.maximumnum.f64(double, double)
; fmin16: applies llvm.minimumnum.f16 element-wise to the first nine halves of
; @input1_f16/@input2_f16 (byte offsets 0..16, step 2) and stores each result
; into @output_f16 at the matching offset. The autogenerated CHECK lines
; record the current, still fully scalar output; regenerate them with
; update_test_checks.py once minimumnum gains vectorization support
; (see the FIXME at the top of the file).
define void @fmin16() {
; CHECK-LABEL: define void @fmin16(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
; CHECK-NEXT: [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
; CHECK-NEXT: store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
; CHECK-NEXT: ret void
;
entry:
; Input IR: nine independent load/load/call/store groups, no loop.
  %input0_0 = load half, ptr @input1_f16, align 16
  %input0_1 = load half, ptr @input2_f16, align 16
  %output0 = tail call half @llvm.minimumnum.f16(half %input0_0, half %input0_1)
  store half %output0, ptr @output_f16, align 16
  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
  %output1 = tail call half @llvm.minimumnum.f16(half %input1_1, half %input1_2)
  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
  %output2 = tail call half @llvm.minimumnum.f16(half %input2_1, half %input2_2)
  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
  %output3 = tail call half @llvm.minimumnum.f16(half %input3_1, half %input3_2)
  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
  %output4 = tail call half @llvm.minimumnum.f16(half %input4_1, half %input4_2)
  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
  %output5 = tail call half @llvm.minimumnum.f16(half %input5_1, half %input5_2)
  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
  %output6 = tail call half @llvm.minimumnum.f16(half %input6_1, half %input6_2)
  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
  %output7 = tail call half @llvm.minimumnum.f16(half %input7_1, half %input7_2)
  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
  %output8 = tail call half @llvm.minimumnum.f16(half %input8_1, half %input8_2)
  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
  ret void
}
; Scalar declaration of the minimumnum intrinsic used by @fmin16.
declare half @llvm.minimumnum.f16(half, half)
; fmax16: applies llvm.maximumnum.f16 element-wise to the first nine halves of
; @input1_f16/@input2_f16 (byte offsets 0..16, step 2) and stores each result
; into @output_f16 at the matching offset. The autogenerated CHECK lines
; record the current, still fully scalar output; regenerate them with
; update_test_checks.py once maximumnum gains vectorization support
; (see the FIXME at the top of the file).
define void @fmax16() {
; CHECK-LABEL: define void @fmax16(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
; CHECK-NEXT: [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
; CHECK-NEXT: store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
; CHECK-NEXT: ret void
;
entry:
; Input IR: nine independent load/load/call/store groups, no loop.
  %input0_0 = load half, ptr @input1_f16, align 16
  %input0_1 = load half, ptr @input2_f16, align 16
  %output0 = tail call half @llvm.maximumnum.f16(half %input0_0, half %input0_1)
  store half %output0, ptr @output_f16, align 16
  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
  %output1 = tail call half @llvm.maximumnum.f16(half %input1_1, half %input1_2)
  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
  %output2 = tail call half @llvm.maximumnum.f16(half %input2_1, half %input2_2)
  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
  %output3 = tail call half @llvm.maximumnum.f16(half %input3_1, half %input3_2)
  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
  %output4 = tail call half @llvm.maximumnum.f16(half %input4_1, half %input4_2)
  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
  %output5 = tail call half @llvm.maximumnum.f16(half %input5_1, half %input5_2)
  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
  %output6 = tail call half @llvm.maximumnum.f16(half %input6_1, half %input6_2)
  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
  %output7 = tail call half @llvm.maximumnum.f16(half %input7_1, half %input7_2)
  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
  %output8 = tail call half @llvm.maximumnum.f16(half %input8_1, half %input8_2)
  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
  ret void
}
; Scalar declaration of the maximumnum intrinsic used by @fmax16.
declare half @llvm.maximumnum.f16(half, half)

; View File (commit-view scrape artifact: a second new test file begins below;
; its RUN line targets the SLP vectorizer on riscv64)

; @@ -0,0 +1,516 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt --passes=slp-vectorizer --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s
; Nine-element global arrays: two input operands and one output buffer for
; each tested element type (f32, f64, f16), all 16-byte aligned.
; NOTE(review): 9 elements presumably leaves a non-power-of-two tail for the
; vectorizer to handle -- confirm against the companion test files.
@input1_f32 = global [9 x float] zeroinitializer, align 16
@input2_f32 = global [9 x float] zeroinitializer, align 16
@output_f32 = global [9 x float] zeroinitializer, align 16
@input1_f64 = global [9 x double] zeroinitializer, align 16
@input2_f64 = global [9 x double] zeroinitializer, align 16
@output_f64 = global [9 x double] zeroinitializer, align 16
@input1_f16 = global [9 x half] zeroinitializer, align 16
@input2_f16 = global [9 x half] zeroinitializer, align 16
@output_f16 = global [9 x half] zeroinitializer, align 16
; Scalar reference pattern for f32 minimumnum: 9 independent
; load/load/llvm.minimumnum.f32/store groups over the 16-byte-aligned
; global arrays (byte offsets 0..32 in 4-byte steps). The autogenerated
; FileCheck assertions below show the calls remain scalar, i.e. the SLP
; vectorizer does not yet combine llvm.minimumnum; regenerate with
; utils/update_test_checks.py once vectorization support lands.
define void @fmin32() {
; CHECK-LABEL: define void @fmin32(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
; CHECK-NEXT: ret void
;
entry:
  ; output_f32[i] = minimumnum(input1_f32[i], input2_f32[i]) for i = 0..8;
  ; alignment of each access reflects its byte offset within the 16-aligned array.
  %input0_0 = load float, ptr @input1_f32, align 16
  %input0_1 = load float, ptr @input2_f32, align 16
  %output0 = tail call float @llvm.minimumnum.f32(float %input0_0, float %input0_1)
  store float %output0, ptr @output_f32, align 16
  %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
  %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
  %output1 = tail call float @llvm.minimumnum.f32(float %input1_1, float %input1_2)
  store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
  %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
  %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
  %output2 = tail call float @llvm.minimumnum.f32(float %input2_1, float %input2_2)
  store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
  %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
  %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
  %output3 = tail call float @llvm.minimumnum.f32(float %input3_1, float %input3_2)
  store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
  %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
  %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
  %output4 = tail call float @llvm.minimumnum.f32(float %input4_1, float %input4_2)
  store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
  %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
  %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
  %output5 = tail call float @llvm.minimumnum.f32(float %input5_1, float %input5_2)
  store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
  %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
  %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
  %output6 = tail call float @llvm.minimumnum.f32(float %input6_1, float %input6_2)
  store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
  %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
  %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
  %output7 = tail call float @llvm.minimumnum.f32(float %input7_1, float %input7_2)
  store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
  %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
  %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
  %output8 = tail call float @llvm.minimumnum.f32(float %input8_1, float %input8_2)
  store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
  ret void
}
declare float @llvm.minimumnum.f32(float, float)
; Scalar reference pattern for f32 maximumnum: identical structure to
; @fmin32 but calling llvm.maximumnum.f32 (9 groups, byte offsets 0..32
; in 4-byte steps). The autogenerated FileCheck assertions show the
; calls remain scalar — no SLP vectorization of llvm.maximumnum yet.
define void @fmax32() {
; CHECK-LABEL: define void @fmax32(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
; CHECK-NEXT: ret void
;
entry:
  ; output_f32[i] = maximumnum(input1_f32[i], input2_f32[i]) for i = 0..8;
  ; alignment of each access reflects its byte offset within the 16-aligned array.
  %input0_0 = load float, ptr @input1_f32, align 16
  %input0_1 = load float, ptr @input2_f32, align 16
  %output0 = tail call float @llvm.maximumnum.f32(float %input0_0, float %input0_1)
  store float %output0, ptr @output_f32, align 16
  %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
  %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
  %output1 = tail call float @llvm.maximumnum.f32(float %input1_1, float %input1_2)
  store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
  %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
  %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
  %output2 = tail call float @llvm.maximumnum.f32(float %input2_1, float %input2_2)
  store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
  %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
  %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
  %output3 = tail call float @llvm.maximumnum.f32(float %input3_1, float %input3_2)
  store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
  %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
  %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
  %output4 = tail call float @llvm.maximumnum.f32(float %input4_1, float %input4_2)
  store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
  %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
  %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
  %output5 = tail call float @llvm.maximumnum.f32(float %input5_1, float %input5_2)
  store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
  %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
  %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
  %output6 = tail call float @llvm.maximumnum.f32(float %input6_1, float %input6_2)
  store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
  %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
  %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
  %output7 = tail call float @llvm.maximumnum.f32(float %input7_1, float %input7_2)
  store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
  %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
  %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
  %output8 = tail call float @llvm.maximumnum.f32(float %input8_1, float %input8_2)
  store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
  ret void
}
declare float @llvm.maximumnum.f32(float, float)
; Scalar reference pattern for f64 minimumnum: 9 independent
; load/load/llvm.minimumnum.f64/store groups over the 16-byte-aligned
; global arrays (byte offsets 0..64 in 8-byte steps). The autogenerated
; FileCheck assertions show the calls remain scalar — no SLP
; vectorization of llvm.minimumnum yet.
define void @fmin64() {
; CHECK-LABEL: define void @fmin64(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
; CHECK-NEXT: ret void
;
entry:
  ; output_f64[i] = minimumnum(input1_f64[i], input2_f64[i]) for i = 0..8;
  ; alignment of each access reflects its byte offset within the 16-aligned array.
  %input0_0 = load double, ptr @input1_f64, align 16
  %input0_1 = load double, ptr @input2_f64, align 16
  %output0 = tail call double @llvm.minimumnum.f64(double %input0_0, double %input0_1)
  store double %output0, ptr @output_f64, align 16
  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
  %output1 = tail call double @llvm.minimumnum.f64(double %input1_1, double %input1_2)
  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
  %output2 = tail call double @llvm.minimumnum.f64(double %input2_1, double %input2_2)
  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
  %output3 = tail call double @llvm.minimumnum.f64(double %input3_1, double %input3_2)
  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
  %output4 = tail call double @llvm.minimumnum.f64(double %input4_1, double %input4_2)
  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
  %output5 = tail call double @llvm.minimumnum.f64(double %input5_1, double %input5_2)
  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
  %output6 = tail call double @llvm.minimumnum.f64(double %input6_1, double %input6_2)
  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
  %output7 = tail call double @llvm.minimumnum.f64(double %input7_1, double %input7_2)
  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
  %output8 = tail call double @llvm.minimumnum.f64(double %input8_1, double %input8_2)
  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
  ret void
}
declare double @llvm.minimumnum.f64(double, double)
; Scalar reference pattern for f64 maximumnum: identical structure to
; @fmin64 but calling llvm.maximumnum.f64 (9 groups, byte offsets 0..64
; in 8-byte steps). The autogenerated FileCheck assertions show the
; calls remain scalar — no SLP vectorization of llvm.maximumnum yet.
define void @fmax64() {
; CHECK-LABEL: define void @fmax64(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
; CHECK-NEXT: ret void
;
entry:
  ; output_f64[i] = maximumnum(input1_f64[i], input2_f64[i]) for i = 0..8;
  ; alignment of each access reflects its byte offset within the 16-aligned array.
  %input0_0 = load double, ptr @input1_f64, align 16
  %input0_1 = load double, ptr @input2_f64, align 16
  %output0 = tail call double @llvm.maximumnum.f64(double %input0_0, double %input0_1)
  store double %output0, ptr @output_f64, align 16
  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
  %output1 = tail call double @llvm.maximumnum.f64(double %input1_1, double %input1_2)
  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
  %output2 = tail call double @llvm.maximumnum.f64(double %input2_1, double %input2_2)
  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
  %output3 = tail call double @llvm.maximumnum.f64(double %input3_1, double %input3_2)
  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
  %output4 = tail call double @llvm.maximumnum.f64(double %input4_1, double %input4_2)
  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
  %output5 = tail call double @llvm.maximumnum.f64(double %input5_1, double %input5_2)
  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
  %output6 = tail call double @llvm.maximumnum.f64(double %input6_1, double %input6_2)
  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
  %output7 = tail call double @llvm.maximumnum.f64(double %input7_1, double %input7_2)
  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
  %output8 = tail call double @llvm.maximumnum.f64(double %input8_1, double %input8_2)
  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
  ret void
}
declare double @llvm.maximumnum.f64(double, double)
define void @fmin16() {
; CHECK-LABEL: define void @fmin16(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
; CHECK-NEXT: [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
; CHECK-NEXT: store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
; CHECK-NEXT: ret void
;
entry:
%input0_0 = load half, ptr @input1_f16, align 16
%input0_1 = load half, ptr @input2_f16, align 16
%output0 = tail call half @llvm.minimumnum.f16(half %input0_0, half %input0_1)
store half %output0, ptr @output_f16, align 16
%input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
%input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
%output1 = tail call half @llvm.minimumnum.f16(half %input1_1, half %input1_2)
store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
%input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
%input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
%output2 = tail call half @llvm.minimumnum.f16(half %input2_1, half %input2_2)
store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
%input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
%input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
%output3 = tail call half @llvm.minimumnum.f16(half %input3_1, half %input3_2)
store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
%input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
%input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
%output4 = tail call half @llvm.minimumnum.f16(half %input4_1, half %input4_2)
store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
%input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
%input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
%output5 = tail call half @llvm.minimumnum.f16(half %input5_1, half %input5_2)
store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
%input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
%input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
%output6 = tail call half @llvm.minimumnum.f16(half %input6_1, half %input6_2)
store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
%input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
%input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
%output7 = tail call half @llvm.minimumnum.f16(half %input7_1, half %input7_2)
store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
%input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
%input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
%output8 = tail call half @llvm.minimumnum.f16(half %input8_1, half %input8_2)
store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
ret void
}
; Scalar f16 minimumnum intrinsic exercised by the tests in this file.
declare half @llvm.minimumnum.f16(half, half)
; fmax16: fully unrolled element-wise maximumnum over the nine half elements
; of @input1_f16 / @input2_f16, storing results to @output_f16.
; The autogenerated CHECK lines show all nine @llvm.maximumnum.f16 calls are
; still scalar, i.e. the vectorizer does not yet combine them into vector ops.
define void @fmax16() {
; CHECK-LABEL: define void @fmax16(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
; CHECK-NEXT: [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
; CHECK-NEXT: store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
; CHECK-NEXT: ret void
;
entry:
; Unrolled loop body: output[i] = maximumnum(input1[i], input2[i]) for
; i = 0..8; byte offsets advance by 2 (sizeof(half)), and the alignment on
; each access reflects the offset modulo the arrays' 16-byte base alignment.
%input0_0 = load half, ptr @input1_f16, align 16
%input0_1 = load half, ptr @input2_f16, align 16
%output0 = tail call half @llvm.maximumnum.f16(half %input0_0, half %input0_1)
store half %output0, ptr @output_f16, align 16
%input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
%input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
%output1 = tail call half @llvm.maximumnum.f16(half %input1_1, half %input1_2)
store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
%input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
%input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
%output2 = tail call half @llvm.maximumnum.f16(half %input2_1, half %input2_2)
store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
%input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
%input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
%output3 = tail call half @llvm.maximumnum.f16(half %input3_1, half %input3_2)
store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
%input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
%input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
%output4 = tail call half @llvm.maximumnum.f16(half %input4_1, half %input4_2)
store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
%input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
%input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
%output5 = tail call half @llvm.maximumnum.f16(half %input5_1, half %input5_2)
store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
%input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
%input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
%output6 = tail call half @llvm.maximumnum.f16(half %input6_1, half %input6_2)
store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
%input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
%input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
%output7 = tail call half @llvm.maximumnum.f16(half %input7_1, half %input7_2)
store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
%input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
%input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
%output8 = tail call half @llvm.maximumnum.f16(half %input8_1, half %input8_2)
store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
ret void
}
; Scalar f16 maximumnum intrinsic exercised by the tests in this file.
declare half @llvm.maximumnum.f16(half, half)

View File

@@ -0,0 +1,510 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt --passes=slp-vectorizer --mtriple=x86_64 -S < %s | FileCheck %s
; Nine-element arrays (an odd, non-power-of-two count) used as the inputs and
; outputs of the fmin*/fmax* tests in this file, one triple per element type.
@input1_f32 = global [9 x float] zeroinitializer, align 16
@input2_f32 = global [9 x float] zeroinitializer, align 16
@output_f32 = global [9 x float] zeroinitializer, align 16
@input1_f64 = global [9 x double] zeroinitializer, align 16
@input2_f64 = global [9 x double] zeroinitializer, align 16
@output_f64 = global [9 x double] zeroinitializer, align 16
@input1_f16 = global [9 x half] zeroinitializer, align 16
@input2_f16 = global [9 x half] zeroinitializer, align 16
@output_f16 = global [9 x half] zeroinitializer, align 16
; fmin32: fully unrolled element-wise minimumnum over the nine float elements
; of @input1_f32 / @input2_f32, storing results to @output_f32.
; The autogenerated CHECK lines show all nine @llvm.minimumnum.f32 calls are
; still scalar, i.e. the SLP vectorizer does not yet combine them.
define void @fmin32() {
; CHECK-LABEL: define void @fmin32() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
; CHECK-NEXT: ret void
;
entry:
; Unrolled loop body: output[i] = minimumnum(input1[i], input2[i]) for
; i = 0..8; byte offsets advance by 4 (sizeof(float)), and the alignment on
; each access reflects the offset modulo the arrays' 16-byte base alignment.
%input0_0 = load float, ptr @input1_f32, align 16
%input0_1 = load float, ptr @input2_f32, align 16
%output0 = tail call float @llvm.minimumnum.f32(float %input0_0, float %input0_1)
store float %output0, ptr @output_f32, align 16
%input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
%input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
%output1 = tail call float @llvm.minimumnum.f32(float %input1_1, float %input1_2)
store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
%input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
%input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
%output2 = tail call float @llvm.minimumnum.f32(float %input2_1, float %input2_2)
store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
%input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
%input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
%output3 = tail call float @llvm.minimumnum.f32(float %input3_1, float %input3_2)
store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
%input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
%input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
%output4 = tail call float @llvm.minimumnum.f32(float %input4_1, float %input4_2)
store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
%input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
%input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
%output5 = tail call float @llvm.minimumnum.f32(float %input5_1, float %input5_2)
store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
%input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
%input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
%output6 = tail call float @llvm.minimumnum.f32(float %input6_1, float %input6_2)
store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
%input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
%input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
%output7 = tail call float @llvm.minimumnum.f32(float %input7_1, float %input7_2)
store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
%input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
%input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
%output8 = tail call float @llvm.minimumnum.f32(float %input8_1, float %input8_2)
store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
ret void
}
; Scalar f32 minimumnum intrinsic exercised by the tests in this file.
declare float @llvm.minimumnum.f32(float, float)
; fmax32: fully unrolled element-wise maximumnum over the nine float elements
; of @input1_f32 / @input2_f32, storing results to @output_f32.
; The autogenerated CHECK lines show all nine @llvm.maximumnum.f32 calls are
; still scalar, i.e. the SLP vectorizer does not yet combine them.
define void @fmax32() {
; CHECK-LABEL: define void @fmax32() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
; CHECK-NEXT: ret void
;
entry:
; Unrolled loop body: output[i] = maximumnum(input1[i], input2[i]) for
; i = 0..8; byte offsets advance by 4 (sizeof(float)), and the alignment on
; each access reflects the offset modulo the arrays' 16-byte base alignment.
%input0_0 = load float, ptr @input1_f32, align 16
%input0_1 = load float, ptr @input2_f32, align 16
%output0 = tail call float @llvm.maximumnum.f32(float %input0_0, float %input0_1)
store float %output0, ptr @output_f32, align 16
%input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
%input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
%output1 = tail call float @llvm.maximumnum.f32(float %input1_1, float %input1_2)
store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
%input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
%input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
%output2 = tail call float @llvm.maximumnum.f32(float %input2_1, float %input2_2)
store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
%input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
%input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
%output3 = tail call float @llvm.maximumnum.f32(float %input3_1, float %input3_2)
store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
%input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
%input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
%output4 = tail call float @llvm.maximumnum.f32(float %input4_1, float %input4_2)
store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
%input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
%input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
%output5 = tail call float @llvm.maximumnum.f32(float %input5_1, float %input5_2)
store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
%input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
%input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
%output6 = tail call float @llvm.maximumnum.f32(float %input6_1, float %input6_2)
store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
%input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
%input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
%output7 = tail call float @llvm.maximumnum.f32(float %input7_1, float %input7_2)
store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
%input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
%input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
%output8 = tail call float @llvm.maximumnum.f32(float %input8_1, float %input8_2)
store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
ret void
}
; Scalar f32 maximumnum intrinsic exercised by the tests in this file.
declare float @llvm.maximumnum.f32(float, float)
; fmin64: fully unrolled element-wise minimumnum over the nine double elements
; of @input1_f64 / @input2_f64, storing results to @output_f64.
; The autogenerated CHECK lines show all nine @llvm.minimumnum.f64 calls are
; still scalar, i.e. the SLP vectorizer does not yet combine them.
define void @fmin64() {
; CHECK-LABEL: define void @fmin64() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
; CHECK-NEXT: ret void
;
entry:
; Unrolled loop body: output[i] = minimumnum(input1[i], input2[i]) for
; i = 0..8; byte offsets advance by 8 (sizeof(double)), and the alignment on
; each access alternates 16/8 against the arrays' 16-byte base alignment.
%input0_0 = load double, ptr @input1_f64, align 16
%input0_1 = load double, ptr @input2_f64, align 16
%output0 = tail call double @llvm.minimumnum.f64(double %input0_0, double %input0_1)
store double %output0, ptr @output_f64, align 16
%input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
%input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
%output1 = tail call double @llvm.minimumnum.f64(double %input1_1, double %input1_2)
store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
%input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
%input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
%output2 = tail call double @llvm.minimumnum.f64(double %input2_1, double %input2_2)
store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
%input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
%input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
%output3 = tail call double @llvm.minimumnum.f64(double %input3_1, double %input3_2)
store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
%input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
%input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
%output4 = tail call double @llvm.minimumnum.f64(double %input4_1, double %input4_2)
store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
%input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
%input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
%output5 = tail call double @llvm.minimumnum.f64(double %input5_1, double %input5_2)
store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
%input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
%input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
%output6 = tail call double @llvm.minimumnum.f64(double %input6_1, double %input6_2)
store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
%input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
%input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
%output7 = tail call double @llvm.minimumnum.f64(double %input7_1, double %input7_2)
store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
%input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
%input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
%output8 = tail call double @llvm.minimumnum.f64(double %input8_1, double %input8_2)
store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
ret void
}
declare double @llvm.minimumnum.f64(double, double)
; Straight-line, fully unrolled elementwise max over 9 consecutive doubles:
;   output_f64[i] = llvm.maximumnum(input1_f64[i], input2_f64[i]) for i = 0..8.
; The CHECK block below is autogenerated (update_test_checks.py) and currently
; shows every scalar maximumnum call surviving untouched, i.e. the vectorizer
; does not yet handle this intrinsic (see the FIXME at the top of the file).
; Regenerate the CHECK lines with UTC once vectorization support lands.
define void @fmax64() {
; CHECK-LABEL: define void @fmax64() {
; CHECK-NEXT:  [[ENTRY:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
; CHECK-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
; CHECK-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
; CHECK-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
; CHECK-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
; CHECK-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
; CHECK-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
; CHECK-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
; CHECK-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
; CHECK-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
; CHECK-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
; CHECK-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
; CHECK-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
; CHECK-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
; CHECK-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
; CHECK-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
; CHECK-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
; CHECK-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
; CHECK-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
; CHECK-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
; CHECK-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
; CHECK-NEXT:    ret void
;
entry:
  ; Each triple below is: load lane i from both inputs, take maximumnum,
  ; store to the matching lane of the output. The load/store alignments
  ; alternate 16/8, consistent with 16-byte-aligned globals — the global
  ; definitions are outside this window, so confirm there if it matters.
  %input0_0 = load double, ptr @input1_f64, align 16
  %input0_1 = load double, ptr @input2_f64, align 16
  %output0 = tail call double @llvm.maximumnum.f64(double %input0_0, double %input0_1)
  store double %output0, ptr @output_f64, align 16
  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
  %output1 = tail call double @llvm.maximumnum.f64(double %input1_1, double %input1_2)
  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
  %output2 = tail call double @llvm.maximumnum.f64(double %input2_1, double %input2_2)
  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
  %output3 = tail call double @llvm.maximumnum.f64(double %input3_1, double %input3_2)
  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
  %output4 = tail call double @llvm.maximumnum.f64(double %input4_1, double %input4_2)
  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
  %output5 = tail call double @llvm.maximumnum.f64(double %input5_1, double %input5_2)
  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
  %output6 = tail call double @llvm.maximumnum.f64(double %input6_1, double %input6_2)
  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
  %output7 = tail call double @llvm.maximumnum.f64(double %input7_1, double %input7_2)
  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
  %output8 = tail call double @llvm.maximumnum.f64(double %input8_1, double %input8_2)
  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
  ret void
}
declare double @llvm.maximumnum.f64(double, double)
; Straight-line, fully unrolled elementwise min over 9 consecutive halves:
;   output_f16[i] = llvm.minimumnum(input1_f16[i], input2_f16[i]) for i = 0..8.
; The CHECK block below is autogenerated (update_test_checks.py) and currently
; shows the scalar minimumnum calls surviving unchanged — the vectorizer does
; not yet support this intrinsic (see the FIXME at the top of the file).
; Regenerate the CHECK lines with UTC once vectorization support lands.
define void @fmin16() {
; CHECK-LABEL: define void @fmin16() {
; CHECK-NEXT:  [[ENTRY:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
; CHECK-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
; CHECK-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
; CHECK-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
; CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
; CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
; CHECK-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
; CHECK-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
; CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
; CHECK-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
; CHECK-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
; CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
; CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
; CHECK-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
; CHECK-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
; CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
; CHECK-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
; CHECK-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
; CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
; CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
; CHECK-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
; CHECK-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
; CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
; CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
; CHECK-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
; CHECK-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
; CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
; CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
; CHECK-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
; CHECK-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
; CHECK-NEXT:    ret void
;
entry:
  ; Per lane: load both f16 inputs, take minimumnum, store the result.
  ; Byte offsets step by 2 (sizeof(half)); alignments cycle 2/4/8/16,
  ; consistent with 16-byte-aligned globals — the global definitions are
  ; outside this window, so confirm there if it matters.
  %input0_0 = load half, ptr @input1_f16, align 16
  %input0_1 = load half, ptr @input2_f16, align 16
  %output0 = tail call half @llvm.minimumnum.f16(half %input0_0, half %input0_1)
  store half %output0, ptr @output_f16, align 16
  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
  %output1 = tail call half @llvm.minimumnum.f16(half %input1_1, half %input1_2)
  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
  %output2 = tail call half @llvm.minimumnum.f16(half %input2_1, half %input2_2)
  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
  %output3 = tail call half @llvm.minimumnum.f16(half %input3_1, half %input3_2)
  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
  %output4 = tail call half @llvm.minimumnum.f16(half %input4_1, half %input4_2)
  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
  %output5 = tail call half @llvm.minimumnum.f16(half %input5_1, half %input5_2)
  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
  %output6 = tail call half @llvm.minimumnum.f16(half %input6_1, half %input6_2)
  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
  %output7 = tail call half @llvm.minimumnum.f16(half %input7_1, half %input7_2)
  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
  %output8 = tail call half @llvm.minimumnum.f16(half %input8_1, half %input8_2)
  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
  ret void
}
declare half @llvm.minimumnum.f16(half, half)
; Straight-line, fully unrolled elementwise max over 9 consecutive halves:
;   output_f16[i] = llvm.maximumnum(input1_f16[i], input2_f16[i]) for i = 0..8.
; Mirrors @fmin16 with maximumnum substituted. The CHECK block below is
; autogenerated (update_test_checks.py) and currently shows the scalar calls
; surviving unchanged — the vectorizer does not yet support this intrinsic
; (see the FIXME at the top of the file). Regenerate with UTC once it does.
define void @fmax16() {
; CHECK-LABEL: define void @fmax16() {
; CHECK-NEXT:  [[ENTRY:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
; CHECK-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
; CHECK-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
; CHECK-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
; CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
; CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
; CHECK-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
; CHECK-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
; CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
; CHECK-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
; CHECK-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
; CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
; CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
; CHECK-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
; CHECK-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
; CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
; CHECK-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
; CHECK-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
; CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
; CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
; CHECK-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
; CHECK-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
; CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
; CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
; CHECK-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
; CHECK-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
; CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
; CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
; CHECK-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
; CHECK-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
; CHECK-NEXT:    ret void
;
entry:
  ; Per lane: load both f16 inputs, take maximumnum, store the result.
  ; Byte offsets step by 2 (sizeof(half)); alignments cycle 2/4/8/16,
  ; consistent with 16-byte-aligned globals — the global definitions are
  ; outside this window, so confirm there if it matters.
  %input0_0 = load half, ptr @input1_f16, align 16
  %input0_1 = load half, ptr @input2_f16, align 16
  %output0 = tail call half @llvm.maximumnum.f16(half %input0_0, half %input0_1)
  store half %output0, ptr @output_f16, align 16
  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
  %output1 = tail call half @llvm.maximumnum.f16(half %input1_1, half %input1_2)
  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
  %output2 = tail call half @llvm.maximumnum.f16(half %input2_1, half %input2_2)
  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
  %output3 = tail call half @llvm.maximumnum.f16(half %input3_1, half %input3_2)
  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
  %output4 = tail call half @llvm.maximumnum.f16(half %input4_1, half %input4_2)
  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
  %output5 = tail call half @llvm.maximumnum.f16(half %input5_1, half %input5_2)
  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
  %output6 = tail call half @llvm.maximumnum.f16(half %input6_1, half %input6_2)
  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
  %output7 = tail call half @llvm.maximumnum.f16(half %input7_1, half %input7_2)
  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
  %output8 = tail call half @llvm.maximumnum.f16(half %input8_1, half %input8_2)
  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
  ret void
}
declare half @llvm.maximumnum.f16(half, half)