diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 0747eda3addb4..c44c9cf61e8b1 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1607,6 +1607,13 @@ def SYCLDevice : InheritableAttr {
   let Documentation = [SYCLDeviceDocs];
 }
 
+def SYCLDeviceOnly : InheritableAttr {
+  let Spellings = [Clang<"sycl_device_only">];
+  let Subjects = SubjectList<[Function]>;
+  let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
+  let Documentation = [SYCLDeviceOnlyDocs];
+}
+
 def SYCLGlobalVar : InheritableAttr {
   let Spellings = [GNU<"sycl_global_var">];
   let Subjects = SubjectList<[GlobalStorageNonLocalVar], ErrorDiag>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 5234c7ee02fff..000a0e522e8cc 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -4518,6 +4518,20 @@ implicitly inherit this attribute.
   }];
 }
 
+def SYCLDeviceOnlyDocs : Documentation {
+  let Category = DocCatFunction;
+  let Heading = "sycl_device_only";
+  let Content = [{
+This attribute can only be applied to functions and indicates that the function
+is only available for the device. It allows functions marked with it to
+overload existing functions without the attribute, in which case the overload
+with the attribute will be used on the device side and the overload without
+will be used on the host side. Note: as opposed to ``sycl_device`` this does
+not mark the function as being exported, both attributes can be used together
+if needed.
+  }];
+}
+
 def RISCVInterruptDocs : Documentation {
   let Category = DocCatFunction;
   let Heading = "interrupt (RISC-V)";
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index fae15742b52ab..5a5cf72091a88 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3729,6 +3729,13 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const {
       !(BuiltinID == Builtin::BIprintf || BuiltinID == Builtin::BImalloc))
     return 0;
 
+  // SYCL doesn't have a device-side standard library. SYCLDeviceOnlyAttr may
+  // be used to provide device-side definitions of standard functions, so
+  // anything with that attribute shouldn't be treated as a builtin.
+  if (Context.getLangOpts().isSYCL() && hasAttr<SYCLDeviceOnlyAttr>()) {
+    return 0;
+  }
+
   // As AMDGCN implementation of OpenMP does not have a device-side standard
   // library, none of the predefined library functions except printf and malloc
   // should be treated as a builtin i.e. 0 should be returned for them.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c7db24b8d95ff..57522a049bb00 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2782,10 +2782,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       GenerateIntrinsics =
           ConstWithoutErrnoOrExceptions && ErrnoOverridenToFalseWithOpt;
   }
-  bool IsSYCLDeviceWithoutIntrinsics =
-      getLangOpts().SYCLIsDevice &&
-      (getTarget().getTriple().isNVPTX() || getTarget().getTriple().isAMDGCN());
-  if (GenerateIntrinsics && !IsSYCLDeviceWithoutIntrinsics) {
+  if (GenerateIntrinsics) {
     switch (BuiltinIDIfNoAsmLabel) {
     case Builtin::BIacos:
     case Builtin::BIacosf:
@@ -3885,7 +3882,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_modf:
   case Builtin::BI__builtin_modff:
   case Builtin::BI__builtin_modfl:
-    if (Builder.getIsFPConstrained() || IsSYCLDeviceWithoutIntrinsics)
+    if (Builder.getIsFPConstrained())
       break; // TODO: Emit constrained modf intrinsic once one exists.
     return RValue::get(emitModfBuiltin(*this, E, Intrinsic::modf));
   case Builtin::BI__builtin_isgreater:
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index b2b1b72454f80..4a077f3fbf5de 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4323,6 +4323,12 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
     }
   }
 
+  // Don't emit 'sycl_device_only' function in SYCL host compilation.
+  if (LangOpts.SYCLIsHost && isa<FunctionDecl>(Global) &&
+      Global->hasAttr<SYCLDeviceOnlyAttr>()) {
+    return;
+  }
+
   if (LangOpts.OpenMP) {
     // If this is OpenMP, check if it is legal to emit this global normally.
     if (OpenMPRuntime && OpenMPRuntime->emitTargetGlobal(GD))
@@ -4412,6 +4418,46 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
     }
   }
 
+  // When using SYCLDeviceOnlyAttr, there can be two functions with the same
+  // mangling, the host function and the device overload. So when compiling for
+  // device we need to make sure we're selecting the SYCLDeviceOnlyAttr
+  // overload and dropping the host overload.
+  if (LangOpts.SYCLIsDevice) {
+    StringRef MangledName = getMangledName(GD);
+    auto DDI = DeferredDecls.find(MangledName);
+    // If we have an existing declaration with the same mangling for this
+    // symbol it may be a SYCLDeviceOnlyAttr case.
+    if (DDI != DeferredDecls.end()) {
+      auto *PreviousGlobal = cast<ValueDecl>(DDI->second.getDecl());
+
+      // If the host declaration was already processed and the device only
+      // declaration is also a sycl external declaration, remove the host
+      // variant and skip. The device only variant will be generated later
+      // as it's marked sycl external.
+      if (!PreviousGlobal->hasAttr<SYCLDeviceOnlyAttr>() &&
+          Global->hasAttr<SYCLDeviceOnlyAttr>() &&
+          Global->hasAttr<SYCLDeviceAttr>()) {
+        DeferredDecls.erase(DDI);
+        return;
+      }
+
+      // If the host declaration was already processed, replace it with the
+      // device only declaration.
+      if (!PreviousGlobal->hasAttr<SYCLDeviceOnlyAttr>() &&
+          Global->hasAttr<SYCLDeviceOnlyAttr>()) {
+        DeferredDecls[MangledName] = GD;
+        return;
+      }
+
+      // If the device only declaration was already processed, skip the
+      // host declaration.
+      if (PreviousGlobal->hasAttr<SYCLDeviceOnlyAttr>() &&
+          !Global->hasAttr<SYCLDeviceOnlyAttr>()) {
+        return;
+      }
+    }
+  }
+
   // clang::ParseAST ensures that we emit the SYCL devices at the end, so
   // anything that is a device (or indirectly called) will be handled later.
   if (LangOpts.SYCLIsDevice && MustBeEmitted(Global)) {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 4bdab4b65d375..32c800c247728 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -7354,6 +7354,10 @@ static bool isIncompleteDeclExternC(Sema &S, const T *D) {
     if (S.getLangOpts().CUDA && (D->template hasAttr<CUDADeviceAttr>() ||
                                  D->template hasAttr<CUDAHostAttr>()))
       return false;
+
+    // So does SYCL's device_only attribute.
+    if (S.getLangOpts().isSYCL() && D->template hasAttr<SYCLDeviceOnlyAttr>())
+      return false;
   }
   return D->isExternC();
 }
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index e6f6c547113be..b076cb60db269 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7224,6 +7224,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_SYCLDevice:
     S.SYCL().handleSYCLDeviceAttr(D, AL);
     break;
+  case ParsedAttr::AT_SYCLDeviceOnly:
+    handleSimpleAttribute<SYCLDeviceOnlyAttr>(S, D, AL);
+    break;
   case ParsedAttr::AT_SYCLScope:
     S.SYCL().handleSYCLScopeAttr(D, AL);
     break;
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 15294a11d4ecd..91c9106be81c1 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -1629,6 +1629,14 @@ static bool IsOverloadOrOverrideImpl(Sema &SemaRef, FunctionDecl *New,
     }
   }
 
+  // Allow overloads with SYCLDeviceOnlyAttr
+  if (SemaRef.getLangOpts().isSYCL()) {
+    if (Old->hasAttr<SYCLDeviceOnlyAttr>() !=
+        New->hasAttr<SYCLDeviceOnlyAttr>()) {
+      return true;
+    }
+  }
+
   // The signatures match; this is not an overload.
   return false;
 }
@@ -11020,6 +11028,15 @@ bool clang::isBetterOverloadCandidate(
            S.CUDA().IdentifyPreference(Caller, Cand2.Function);
   }
 
+  // In SYCL device compilation mode prefer the overload with the
+  // SYCLDeviceOnly attribute.
+  if (S.getLangOpts().SYCLIsDevice && Cand1.Function && Cand2.Function) {
+    if (Cand1.Function->hasAttr<SYCLDeviceOnlyAttr>() !=
+        Cand2.Function->hasAttr<SYCLDeviceOnlyAttr>()) {
+      return Cand1.Function->hasAttr<SYCLDeviceOnlyAttr>();
+    }
+  }
+
   // General member function overloading is handled above, so this only handles
   // constructors with address spaces.
   // This only handles address spaces since C++ has no other
@@ -11374,6 +11391,15 @@ OverloadingResult OverloadCandidateSet::BestViableFunctionImpl(
   if (S.getLangOpts().CUDA)
     CudaExcludeWrongSideCandidates(S, Candidates);
 
+  // In SYCL host compilation remove candidates marked SYCLDeviceOnly.
+  if (S.getLangOpts().SYCLIsHost) {
+    auto IsDeviceCand = [&](const OverloadCandidate *Cand) {
+      return Cand->Viable && Cand->Function &&
+             Cand->Function->hasAttr<SYCLDeviceOnlyAttr>();
+    };
+    llvm::erase_if(Candidates, IsDeviceCand);
+  }
+
   Best = end();
   for (auto *Cand : Candidates) {
     Cand->Best = false;
diff --git a/clang/test/CodeGenSYCL/sycl-device-only.cpp b/clang/test/CodeGenSYCL/sycl-device-only.cpp
new file mode 100644
index 0000000000000..6196f5b6744bb
--- /dev/null
+++ b/clang/test/CodeGenSYCL/sycl-device-only.cpp
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECKD
+// RUN: %clang_cc1 -fsycl-is-host -triple spir64-unknown-unknown -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECKH
+// Test code generation for sycl_device_only attribute.
+
+// Verify that the device overload is used on device.
+//
+// CHECK-LABEL: _Z3fooi
+// CHECKH: %add = add nsw i32 %0, 10
+// CHECKD: %add = add nsw i32 %0, 20
+int foo(int a) { return a + 10; }
+__attribute__((sycl_device_only)) int foo(int a) { return a + 20; }
+
+// Use a `sycl_device` function as entry point
+__attribute__((sycl_device)) int bar(int b) { return foo(b); }
+
+// Verify that the order of declaration doesn't change the behavior.
+//
+// CHECK-LABEL: _Z3fooswapi
+// CHECKH: %add = add nsw i32 %0, 10
+// CHECKD: %add = add nsw i32 %0, 20
+__attribute__((sycl_device_only)) int fooswap(int a) { return a + 20; }
+int fooswap(int a) { return a + 10; }
+
+// Use a `sycl_device` function as entry point.
+__attribute__((sycl_device)) int barswap(int b) { return fooswap(b); }
+
+// Verify that in extern C the attribute enables mangling.
+extern "C" {
+// CHECK-LABEL: _Z3fooci
+// CHECKH: %add = add nsw i32 %0, 10
+// CHECKD: %add = add nsw i32 %0, 20
+int fooc(int a) { return a + 10; }
+__attribute__((sycl_device_only)) int fooc(int a) { return a + 20; }
+
+// Use a `sycl_device` function as entry point.
+__attribute__((sycl_device)) int barc(int b) { return fooc(b); }
+}
+
+// Verify that both attributes can work together.
+//
+// CHECK-LABEL: _Z3fooai
+// CHECKH: %add = add nsw i32 %0, 10
+// CHECKD: %add = add nsw i32 %0, 20
+int fooa(int a) { return a + 10; }
+__attribute__((sycl_device_only, sycl_device)) int fooa(int a) {
+  return a + 20;
+}
+
+// Use a `sycl_device` function as entry point.
+__attribute__((sycl_device)) int bara(int b) { return fooa(b); }
+
+// Verify that the order of declaration doesn't change the behavior when using
+// both attributes.
+//
+// CHECK-LABEL: _Z3fooaswapi
+// CHECKH: %add = add nsw i32 %0, 10
+// CHECKD: %add = add nsw i32 %0, 20
+__attribute__((sycl_device_only, sycl_device)) int fooaswap(int a) {
+  return a + 20;
+}
+int fooaswap(int a) { return a + 10; }
+
+// Use a `sycl_device` function as entry point.
+__attribute__((sycl_device)) int baraswap(int b) { return fooaswap(b); }
diff --git a/clang/test/CodeGenSYCL/sycl-libdevice-cmath.cpp b/clang/test/CodeGenSYCL/sycl-libdevice-cmath.cpp
deleted file mode 100644
index 5c282449dc851..0000000000000
--- a/clang/test/CodeGenSYCL/sycl-libdevice-cmath.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-// SYCL compilation uses libdevice in order to implement platform specific
-// versions of funcs like cosf, logf, etc. In order for the libdevice funcs
-// to be used, we need to make sure that llvm intrinsics such as llvm.cos.f32
-// are not emitted since many backends do not have lowerings for such
-// intrinsics. This allows the driver to link in the libdevice definitions for
-// cosf etc. later in the driver flow.
-
-// RUN: %clang_cc1 %s -fsycl-is-device -triple nvptx64-nvidia-cuda -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -fsycl-is-device -triple nvptx64-nvidia-cuda -ffast-math -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -fsycl-is-device -triple amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -fsycl-is-device -triple amdgcn-amd-amdhsa -ffast-math -emit-llvm -o - | FileCheck %s
-
-#include "Inputs/sycl.hpp"
-
-extern "C" {
-float scalbnf(float x, int n);
-float logf(float x);
-float expf(float x);
-float frexpf(float x, int *exp);
-float ldexpf(float x, int exp);
-float log10f(float x);
-float modff(float x, float *intpart);
-float exp2f(float x);
-float expm1f(float x);
-int ilogbf(float x);
-float log1pf(float x);
-float log2f(float x);
-float logbf(float x);
-float sqrtf(float x);
-float cbrtf(float x);
-float hypotf(float x, float y);
-float erff(float x);
-float erfcf(float x);
-float tgammaf(float x);
-float lgammaf(float x);
-float fmodf(float x, float y);
-float remainderf(float x, float y);
-float remquof(float x, float y, int *q);
-float nextafterf(float x, float y);
-float fdimf(float x, float y);
-float fmaf(float x, float y, float z);
-float sinf(float x);
-float cosf(float x);
-float tanf(float x);
-float powf(float x, float y);
-float acosf(float x);
-float asinf(float x);
-float atanf(float x);
-float atan2f(float x, float y);
-float coshf(float x);
-float sinhf(float x);
-float tanhf(float x);
-float acoshf(float x);
-float asinhf(float x);
-float atanhf(float x);
-};
-
-// CHECK-NOT: llvm.abs.
-// CHECK-NOT: llvm.scalbnf.
-// CHECK-NOT: llvm.log.
-// CHECK-NOT: llvm.exp.
-// CHECK-NOT: llvm.frexp.
-// CHECK-NOT: llvm.ldexp.
-// CHECK-NOT: llvm.log10.
-// CHECK-NOT: llvm.mod.
-// CHECK-NOT: llvm.exp2.
-// CHECK-NOT: llvm.expm1.
-// CHECK-NOT: llvm.ilogb.
-// CHECK-NOT: llvm.log1p.
-// CHECK-NOT: llvm.log2.
-// CHECK-NOT: llvm.logb.
-// CHECK-NOT: llvm.sqrt.
-// CHECK-NOT: llvm.cbrt.
-// CHECK-NOT: llvm.hypot.
-// CHECK-NOT: llvm.erf.
-// CHECK-NOT: llvm.erfc.
-// CHECK-NOT: llvm.tgamma.
-// CHECK-NOT: llvm.lgamma.
-// CHECK-NOT: llvm.fmod.
-// CHECK-NOT: llvm.remainder.
-// CHECK-NOT: llvm.remquo.
-// CHECK-NOT: llvm.nextafter.
-// CHECK-NOT: llvm.fdim.
-// CHECK-NOT: llvm.fma.
-// CHECK-NOT: llvm.sin.
-// CHECK-NOT: llvm.cos.
-// CHECK-NOT: llvm.tan.
-// CHECK-NOT: llvm.pow.
-// CHECK-NOT: llvm.acos.
-// CHECK-NOT: llvm.asin.
-// CHECK-NOT: llvm.atan.
-// CHECK-NOT: llvm.atan2.
-// CHECK-NOT: llvm.cosh.
-// CHECK-NOT: llvm.sinh.
-// CHECK-NOT: llvm.tanh.
-// CHECK-NOT: llvm.acosh.
-// CHECK-NOT: llvm.asinh.
-// CHECK-NOT: llvm.atanh.
-void sycl_kernel(float *a, int *b) {
-  sycl::queue{}.submit([&](sycl::handler &cgh) {
-    cgh.single_task<class kernel>([=]() {
-      a[0] = scalbnf(a[0], b[0]);
-      a[0] = logf(a[0]);
-      a[0] = expf(a[0]);
-      a[0] = frexpf(a[0], b);
-      a[0] = ldexpf(a[0], b[0]);
-      a[0] = log10f(a[0]);
-      a[0] = modff(a[0], a);
-      a[0] = exp2f(a[0]);
-      a[0] = expm1f(a[0]);
-      a[0] = ilogbf(a[0]);
-      a[0] = log1pf(a[0]);
-      a[0] = log2f(a[0]);
-      a[0] = logbf(a[0]);
-      a[0] = sqrtf(a[0]);
-      a[0] = cbrtf(a[0]);
-      a[0] = hypotf(a[0], a[0]);
-      a[0] = erff(a[0]);
-      a[0] = erfcf(a[0]);
-      a[0] = tgammaf(a[0]);
-      a[0] = lgammaf(a[0]);
-      a[0] = fmodf(a[0], a[0]);
-      a[0] = remainderf(a[0], a[0]);
-      a[0] = remquof(a[0], a[0], b);
-      a[0] = nextafterf(a[0], a[0]);
-      a[0] = fdimf(a[0], a[0]);
-      a[0] = fmaf(a[0], a[0], a[0]);
-      a[0] = sinf(a[0]);
-      a[0] = cosf(a[0]);
-      a[0] = tanf(a[0]);
-      a[0] = powf(a[0], a[0]);
-      a[0] = acosf(a[0]);
-      a[0] = asinf(a[0]);
-      a[0] = atanf(a[0]);
-      a[0] = atan2f(a[0], a[0]);
-      a[0] = coshf(a[0]);
-      a[0] = sinhf(a[0]);
-      a[0] = tanhf(a[0]);
-      a[0] = acoshf(a[0]);
-      a[0] = asinhf(a[0]);
-      a[0] = atanhf(a[0]);
-    });
-  });
-}
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index d04cf791e88bf..c23d9acb98826 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -190,6 +190,7 @@
 // CHECK-NEXT: SYCLDeviceGlobal (SubjectMatchRule_record)
 // CHECK-NEXT: SYCLDeviceHas (SubjectMatchRule_function)
 // CHECK-NEXT: SYCLDeviceIndirectlyCallable (SubjectMatchRule_function)
+// CHECK-NEXT: SYCLDeviceOnly (SubjectMatchRule_function)
 // CHECK-NEXT: SYCLGlobalVariableAllowed (SubjectMatchRule_record)
 // CHECK-NEXT: SYCLIntelDisableLoopPipelining (SubjectMatchRule_function)
 // CHECK-NEXT: SYCLIntelInitiationInterval (SubjectMatchRule_function)
diff --git a/libdevice/cmath_wrapper.cpp b/libdevice/cmath_wrapper.cpp
index d59395b2d0994..a084a86883767 100644
--- a/libdevice/cmath_wrapper.cpp
+++ b/libdevice/cmath_wrapper.cpp
@@ -8,8 +8,7 @@
 
 #include "device_math.h"
 
-#if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) ||           \
-    defined(__AMDGCN__)
+#if defined(__SPIR__) || defined(__SPIRV__)
 
 DEVICE_EXTERN_C_INLINE
 int abs(int x) { return __devicelib_abs(x); }
@@ -190,25 +189,7 @@ float asinhf(float x) { return __devicelib_asinhf(x); }
 DEVICE_EXTERN_C_INLINE
 float atanhf(float x) { return __devicelib_atanhf(x); }
 
-#ifdef __NVPTX__
-extern "C" SYCL_EXTERNAL float __nv_nearbyintf(float);
-DEVICE_EXTERN_C_INLINE
-float nearbyintf(float x) { return __nv_nearbyintf(x); }
-
-extern "C" SYCL_EXTERNAL float __nv_rintf(float);
-DEVICE_EXTERN_C_INLINE
-float rintf(float x) { return __nv_rintf(x); }
-#elif defined(__AMDGCN__)
-extern "C" SYCL_EXTERNAL float __ocml_nearbyint_f32(float);
-DEVICE_EXTERN_C_INLINE
-float nearbyintf(float x) { return __ocml_nearbyint_f32(x); }
-
-extern "C" SYCL_EXTERNAL float __ocml_rint_f32(float);
-DEVICE_EXTERN_C_INLINE
-float rintf(float x) { return __ocml_rint_f32(x); }
-#else
 DEVICE_EXTERN_C_INLINE
 float rintf(float x) { return __spirv_ocl_rint(x); }
-#endif
 
-#endif // __SPIR__ || __SPIRV__ || __NVPTX__ || __AMDGCN__
+#endif // __SPIR__ || __SPIRV__
diff --git a/libdevice/cmath_wrapper_fp64.cpp b/libdevice/cmath_wrapper_fp64.cpp
index 720982799ea71..855317bcf3f4b 100644
--- a/libdevice/cmath_wrapper_fp64.cpp
+++ b/libdevice/cmath_wrapper_fp64.cpp
@@ -9,8 +9,7 @@
 
 #include "device_math.h"
 
-#if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) ||           \
-    defined(__AMDGCN__)
+#if defined(__SPIR__) || defined(__SPIRV__)
 
 // All exported functions in math and complex device libraries are weak
 // reference. If users provide their own math or complex functions(with
@@ -180,26 +179,8 @@ double atanh(double x) { return __devicelib_atanh(x); }
 DEVICE_EXTERN_C_INLINE
 double scalbn(double x, int exp) { return __devicelib_scalbn(x, exp); }
 
-#ifdef __NVPTX__
-extern "C" SYCL_EXTERNAL double __nv_nearbyint(double);
-DEVICE_EXTERN_C_INLINE
-double nearbyint(double x) { return __nv_nearbyint(x); }
-
-extern "C" SYCL_EXTERNAL double __nv_rint(double);
-DEVICE_EXTERN_C_INLINE
-double rint(double x) { return __nv_rint(x); }
-#elif defined(__AMDGCN__)
-extern "C" SYCL_EXTERNAL double __ocml_nearbyint_f64(double);
-DEVICE_EXTERN_C_INLINE
-double nearbyint(double x) { return __ocml_nearbyint_f64(x); }
-
-extern "C" SYCL_EXTERNAL double __ocml_rint_f64(double);
-DEVICE_EXTERN_C_INLINE
-double rint(double x) { return __ocml_rint_f64(x); }
-#else
 DEVICE_EXTERN_C_INLINE
 double rint(double x) { return __spirv_ocl_rint(x); }
-#endif
 
 #if defined(_MSC_VER)
 #include <math.h>
@@ -508,4 +489,4 @@ double _Sinh(double x, double y) { // compute y * sinh(x), |y| <= 1
   }
 }
 #endif // defined(_WIN32)
-#endif // __SPIR__ || __SPIRV__ || __NVPTX__ || __AMDGCN__
+#endif // __SPIR__ || __SPIRV__
diff --git a/libdevice/fallback-cmath.cpp b/libdevice/fallback-cmath.cpp
index 97cb4cf67b4c7..d930ea014ac24 100644
--- a/libdevice/fallback-cmath.cpp
+++ b/libdevice/fallback-cmath.cpp
@@ -8,8 +8,7 @@
 
 #include "device_math.h"
 
-#if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) ||           \
-    defined(__AMDGCN__)
+#if defined(__SPIR__) || defined(__SPIRV__)
 
 // To support fallback device libraries on-demand loading, please update the
 // DeviceLibFuncMap in llvm/tools/sycl-post-link/sycl-post-link.cpp if you add
@@ -215,4 +214,4 @@ float __devicelib_asinhf(float x) { return __spirv_ocl_asinh(x); }
 DEVICE_EXTERN_C_INLINE
 float __devicelib_atanhf(float x) { return __spirv_ocl_atanh(x); }
 
-#endif // __SPIR__ || __SPIRV__ || __NVPTX__ || __AMDGCN__
+#endif // __SPIR__ || __SPIRV__
diff --git a/sycl/include/sycl/stl_wrappers/__sycl_cmath_wrapper_impl.hpp b/sycl/include/sycl/stl_wrappers/__sycl_cmath_wrapper_impl.hpp
new file mode 100644
index 0000000000000..06741b2a1ea55
--- /dev/null
+++ b/sycl/include/sycl/stl_wrappers/__sycl_cmath_wrapper_impl.hpp
@@ -0,0 +1,541 @@
+//==------------- __sycl_cmath_wrapper_impl.hpp ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CMATH_FALLBACK_H__
+#define __CMATH_FALLBACK_H__
+
+// This header defines device-side overloads of <cmath> functions based on
+// their equivalent __spirv_ built-ins.
+
+#ifdef __SYCL_DEVICE_ONLY__
+
+// The 'sycl_device_only' attribute enables device-side overloading.
+#define __DPCPP_SYCL_DEVICE __attribute__((sycl_device_only, always_inline))
+#define __DPCPP_SYCL_DEVICE_C                                                  \
+  extern "C" __attribute__((sycl_device_only, always_inline))
+
+// Promotion templates: the C++ standard library provides overloads that allow
+// arguments of math functions to be promoted. Any floating-point argument is
+// allowed to accept any integer type, which should then be promoted to double.
+// When multiple floating point arguments are available passing arguments with
+// different precision should promote to the larger type. The template helpers
+// below provide the machinery to define these promoting overloads.
+template <typename T, bool = (std::is_integral<T>::value ||
+                              std::is_floating_point<T>::value)>
+struct __dpcpp_promote {
+private:
+  // Integer types are promoted to double.
+  template <typename U>
+  static typename std::enable_if<std::is_integral<U>::value, double>::type
+  test();
+
+  // Floating point types are used as-is.
+  template <typename U>
+  static typename std::enable_if<std::is_floating_point<U>::value, U>::type
+  test();
+
+public:
+  // We rely on dummy templated methods and decltype to select the right type
+  // based on the input T.
+  typedef decltype(test<T>()) type;
+};
+
+// Variant without ::type to allow SFINAE for non-promotable types.
+template <typename T> struct __dpcpp_promote<T, false> {};
+
+// With a single paramter we only need to promote integers.
+template <typename T>
+using __dpcpp_promote_1 = std::enable_if<std::is_integral<T>::value, double>;
+
+// With two or three parameters we need to promote integers and possibly
+// floating point types. We rely on operator+ with decltype to deduce the
+// overall promotion type. This is only needed if at least one of the parameter
+// is an integer, or if there's multiple different floating point types.
+template <typename T, typename U>
+using __dpcpp_promote_2 =
+    std::enable_if<!std::is_same<T, U>::value || std::is_integral<T>::value ||
+                       std::is_integral<U>::value,
+                   decltype(typename __dpcpp_promote<T>::type(0) +
+                            typename __dpcpp_promote<U>::type(0))>;
+
+template <typename T, typename U, typename V>
+using __dpcpp_promote_3 =
+    std::enable_if<!(std::is_same<T, U>::value && std::is_same<U, V>::value) ||
+                       std::is_integral<T>::value ||
+                       std::is_integral<U>::value || std::is_integral<V>::value,
+                   decltype(typename __dpcpp_promote<T>::type(0) +
+                            typename __dpcpp_promote<U>::type(0) +
+                            typename __dpcpp_promote<V>::type(0))>;
+
+// For each math built-in we need to define float and double overloads, an
+// extern "C" float variant with the 'f' suffix, and a version that promotes
+// integers or mixed precision floating-point parameters.
+//
+// TODO: Consider targets that don't have double support.
+// TODO: Enable long double support where possible.
+//
+// The following two macros provide an easy way to define these overloads for
+// basic built-ins with one or two floating-point parameters.
+#define __DPCPP_SPIRV_MAP_UNARY(NAME)                                          \
+  __DPCPP_SYCL_DEVICE_C float NAME##f(float x) {                               \
+    return __spirv_ocl_##NAME(x);                                              \
+  }                                                                            \
+  __DPCPP_SYCL_DEVICE float NAME(float x) { return __spirv_ocl_##NAME(x); }    \
+  __DPCPP_SYCL_DEVICE double NAME(double x) { return __spirv_ocl_##NAME(x); }  \
+  template <typename T>                                                        \
+  __DPCPP_SYCL_DEVICE typename __dpcpp_promote_1<T>::type NAME(T x) {          \
+    return __spirv_ocl_##NAME((double)x);                                      \
+  }
+
+#define __DPCPP_SPIRV_MAP_BINARY(NAME)                                         \
+  __DPCPP_SYCL_DEVICE_C float NAME##f(float x, float y) {                      \
+    return __spirv_ocl_##NAME(x, y);                                           \
+  }                                                                            \
+  __DPCPP_SYCL_DEVICE float NAME(float x, float y) {                           \
+    return __spirv_ocl_##NAME(x, y);                                           \
+  }                                                                            \
+  __DPCPP_SYCL_DEVICE double NAME(double x, double y) {                        \
+    return __spirv_ocl_##NAME(x, y);                                           \
+  }                                                                            \
+  template <typename T, typename U>                                            \
+  __DPCPP_SYCL_DEVICE __dpcpp_promote_2<T, U>::type NAME(T x, U y) {           \
+    typedef typename __dpcpp_promote_2<T, U>::type type;                       \
+    return __spirv_ocl_##NAME((type)x, (type)y);                               \
+  }
+
+/// <cstdlib>
+// FIXME: Move this to a cstdlib fallback header.
+
+__DPCPP_SYCL_DEVICE div_t div(int x, int y) { return {x / y, x % y}; }
+__DPCPP_SYCL_DEVICE ldiv_t ldiv(long x, long y) { return {x / y, x % y}; }
+__DPCPP_SYCL_DEVICE lldiv_t ldiv(long long x, long long y) {
+  return {x / y, x % y};
+}
+
+__DPCPP_SYCL_DEVICE long long abs(long long n) { return n < 0 ? -n : n; }
+__DPCPP_SYCL_DEVICE_C long long llabs(long long n) { return n < 0 ? -n : n; }
+__DPCPP_SYCL_DEVICE long abs(long n) { return n < 0 ? -n : n; }
+__DPCPP_SYCL_DEVICE int abs(int n) { return n < 0 ? -n : n; }
+__DPCPP_SYCL_DEVICE_C long labs(long n) { return n < 0 ? -n : n; }
+
+/// Basic operations
+//
+
+__DPCPP_SYCL_DEVICE float abs(float x) { return x < 0 ? -x : x; }
+__DPCPP_SYCL_DEVICE double abs(double x) { return x < 0 ? -x : x; }
+__DPCPP_SYCL_DEVICE float fabs(float x) { return x < 0 ? -x : x; }
+__DPCPP_SYCL_DEVICE_C float fabsf(float x) { return x < 0 ? -x : x; }
+__DPCPP_SYCL_DEVICE double fabs(double x) { return x < 0 ? -x : x; }
+template <typename T>
+__DPCPP_SYCL_DEVICE typename __dpcpp_promote_1<T>::type fabs(T x) {
+  return x < 0 ? -x : x;
+}
+
+__DPCPP_SPIRV_MAP_BINARY(fmod);
+__DPCPP_SPIRV_MAP_BINARY(remainder);
+
+__DPCPP_SYCL_DEVICE_C float remquof(float x, float y, int *q) {
+  return __spirv_ocl_remquo(x, y, q);
+}
+__DPCPP_SYCL_DEVICE float remquo(float x, float y, int *q) {
+  return __spirv_ocl_remquo(x, y, q);
+}
+__DPCPP_SYCL_DEVICE double remquo(double x, double y, int *q) {
+  return __spirv_ocl_remquo(x, y, q);
+}
+template <typename T, typename U>
+__DPCPP_SYCL_DEVICE typename __dpcpp_promote_2<T, U>::type remquo(T x, U y,
+                                                                  int *q) {
+  typedef typename __dpcpp_promote_2<T, U>::type type;
+  return __spirv_ocl_remquo((type)x, (type)y, q);
+}
+
+__DPCPP_SYCL_DEVICE_C float fmaf(float x, float y, float z) {
+  return __spirv_ocl_fma(x, y, z);
+}
+__DPCPP_SYCL_DEVICE float fma(float x, float y, float z) {
+  return __spirv_ocl_fma(x, y, z);
+}
+__DPCPP_SYCL_DEVICE double fma(double x, double y, double z) {
+  return __spirv_ocl_fma(x, y, z);
+}
+template <typename T, typename U, typename V>
+__DPCPP_SYCL_DEVICE typename __dpcpp_promote_3<T, U, V>::type fma(T x, U y,
+                                                                  V z) {
+  typedef typename __dpcpp_promote_3<T, U, V>::type type;
+  return __spirv_ocl_fma((type)x, (type)y, (type)z);
+}
+
+__DPCPP_SPIRV_MAP_BINARY(fmax);
+__DPCPP_SPIRV_MAP_BINARY(fmin);
+__DPCPP_SPIRV_MAP_BINARY(fdim);
+// unsupported: nan
+
+/// Exponential functions
+//
+
+__DPCPP_SPIRV_MAP_UNARY(exp);
+__DPCPP_SPIRV_MAP_UNARY(exp2);
+__DPCPP_SPIRV_MAP_UNARY(expm1);
+__DPCPP_SPIRV_MAP_UNARY(log);
+__DPCPP_SPIRV_MAP_UNARY(log10);
+__DPCPP_SPIRV_MAP_UNARY(log2);
+__DPCPP_SPIRV_MAP_UNARY(log1p);
+
+/// Power functions
+//
+
+__DPCPP_SPIRV_MAP_BINARY(pow);
+__DPCPP_SPIRV_MAP_UNARY(sqrt);
+__DPCPP_SPIRV_MAP_UNARY(cbrt);
+__DPCPP_SPIRV_MAP_BINARY(hypot);
+
+/// Trigonometric functions
+//
+
+__DPCPP_SPIRV_MAP_UNARY(sin);
+__DPCPP_SPIRV_MAP_UNARY(cos);
+__DPCPP_SPIRV_MAP_UNARY(tan);
+__DPCPP_SPIRV_MAP_UNARY(asin);
+__DPCPP_SPIRV_MAP_UNARY(acos);
+__DPCPP_SPIRV_MAP_UNARY(atan);
+__DPCPP_SPIRV_MAP_BINARY(atan2);
+
+/// Hyperbolic functions
+//
+
+__DPCPP_SPIRV_MAP_UNARY(sinh);
+__DPCPP_SPIRV_MAP_UNARY(cosh);
+__DPCPP_SPIRV_MAP_UNARY(tanh);
+__DPCPP_SPIRV_MAP_UNARY(asinh);
+__DPCPP_SPIRV_MAP_UNARY(acosh);
+__DPCPP_SPIRV_MAP_UNARY(atanh);
+
+/// Error and gamma functions
+//
+
+__DPCPP_SPIRV_MAP_UNARY(erf);
+__DPCPP_SPIRV_MAP_UNARY(erfc);
+__DPCPP_SPIRV_MAP_UNARY(tgamma);
+__DPCPP_SPIRV_MAP_UNARY(lgamma);
+
+/// Nearest integer floating-point operations
+//
+
+__DPCPP_SPIRV_MAP_UNARY(ceil);
+__DPCPP_SPIRV_MAP_UNARY(floor);
+__DPCPP_SPIRV_MAP_UNARY(trunc);
+__DPCPP_SPIRV_MAP_UNARY(round);
+// unsupported: lround, llround (no spirv mapping)
+__DPCPP_SPIRV_MAP_UNARY(rint);
+// unsupported: lrint, llrint (no spirv mapping)
+
+// unsupported (partially, no spirv mapping): nearbyint
+#if defined(__NVPTX__)
+extern "C" SYCL_EXTERNAL float __nv_nearbyintf(float);
+extern "C" SYCL_EXTERNAL double __nv_nearbyint(double);
+__DPCPP_SYCL_DEVICE_C float nearbyintf(float x) { return __nv_nearbyintf(x); }
+__DPCPP_SYCL_DEVICE float nearbyint(float x) { return __nv_nearbyintf(x); }
+__DPCPP_SYCL_DEVICE double nearbyint(double x) { return __nv_nearbyintf(x); }
+#elif defined(__AMDGCN__)
+extern "C" SYCL_EXTERNAL float __ocml_nearbyint_f32(float);
+extern "C" SYCL_EXTERNAL double __ocml_nearbyint_f64(double);
+__DPCPP_SYCL_DEVICE_C float nearbyintf(float x) {
+  return __ocml_nearbyint_f32(x);
+}
+__DPCPP_SYCL_DEVICE float nearbyint(float x) { return __ocml_nearbyint_f32(x); }
+__DPCPP_SYCL_DEVICE double nearbyint(double x) {
+  return __ocml_nearbyint_f64(x);
+}
+#endif
+
+/// Floating-point manipulation functions
+//
+
+__DPCPP_SYCL_DEVICE_C float frexpf(float x, int *exp) {
+  return __spirv_ocl_frexp(x, exp);
+}
+__DPCPP_SYCL_DEVICE float frexp(float x, int *exp) {
+  return __spirv_ocl_frexp(x, exp);
+}
+__DPCPP_SYCL_DEVICE double frexp(double x, int *exp) {
+  return __spirv_ocl_frexp(x, exp);
+}
+template <typename T>
+__DPCPP_SYCL_DEVICE typename __dpcpp_promote_1<T>::type frexp(T x, int *exp) {
+  return __spirv_ocl_frexp((double)x, exp);
+}
+
+__DPCPP_SYCL_DEVICE_C float ldexpf(float x, int exp) {
+  return __spirv_ocl_ldexp(x, exp);
+}
+__DPCPP_SYCL_DEVICE float ldexp(float x, int exp) {
+  return __spirv_ocl_ldexp(x, exp);
+}
+__DPCPP_SYCL_DEVICE double ldexp(double x, int exp) {
+  return __spirv_ocl_ldexp(x, exp);
+}
+template <typename T>
+__DPCPP_SYCL_DEVICE typename __dpcpp_promote_1<T>::type ldexp(T x, int exp) {
+  return __spirv_ocl_ldexp((double)x, exp);
+}
+
+__DPCPP_SYCL_DEVICE_C float modff(float x, float *intpart) {
+  return __spirv_ocl_modf(x, intpart);
+}
+__DPCPP_SYCL_DEVICE float modf(float x, float *intpart) {
+  return __spirv_ocl_modf(x, intpart);
+}
+__DPCPP_SYCL_DEVICE double modf(double x, double *intpart) {
+  return __spirv_ocl_modf(x, intpart);
+}
+// modf only supports integer x when the intpart is double.
+template <typename T>
+__DPCPP_SYCL_DEVICE typename __dpcpp_promote_1<T>::type modf(T x,
+                                                             double *intpart) {
+  return __spirv_ocl_modf((double)x, intpart);
+}
+
+__DPCPP_SYCL_DEVICE_C float scalbnf(float x, int exp) {
+  return __spirv_ocl_ldexp(x, exp);
+}
+__DPCPP_SYCL_DEVICE float scalbn(float x, int exp) {
+  return __spirv_ocl_ldexp(x, exp);
+}
+__DPCPP_SYCL_DEVICE double scalbn(double x, int exp) {
+  return __spirv_ocl_ldexp(x, exp);
+}
+template <typename T>
+__DPCPP_SYCL_DEVICE typename __dpcpp_promote_1<T>::type scalbn(T x, int exp) {
+  return __spirv_ocl_ldexp((double)x, exp);
+}
+
+__DPCPP_SYCL_DEVICE_C float scalblnf(float x, long exp) {
+  return __spirv_ocl_ldexp(x, (int)exp);
+}
+__DPCPP_SYCL_DEVICE float scalbln(float x, long exp) {
+  return __spirv_ocl_ldexp(x, (int)exp);
+}
+__DPCPP_SYCL_DEVICE double scalbln(double x, long exp) {
+  return __spirv_ocl_ldexp(x, (int)exp);
+}
+template <typename T>
+__DPCPP_SYCL_DEVICE typename __dpcpp_promote_1<T>::type scalbln(T x, long exp) {
+  return __spirv_ocl_ldexp((double)x, (int)exp);
+}
+
+__DPCPP_SYCL_DEVICE_C int ilogbf(float x) { return __spirv_ocl_ilogb(x); }
+__DPCPP_SYCL_DEVICE int ilogb(float x) { return __spirv_ocl_ilogb(x); }
+__DPCPP_SYCL_DEVICE int ilogb(double x) { return __spirv_ocl_ilogb(x); }
+// ilogb needs a special template since its signature doesn't include the
+// promoted type anywhere, so it needs to be specialized differently.
+template <typename T, typename std::enable_if<std::is_integral<T>::value,
+                                              bool>::type = true>
+__DPCPP_SYCL_DEVICE int ilogb(T x) {
+  return __spirv_ocl_ilogb((double)x);
+}
+
+__DPCPP_SPIRV_MAP_UNARY(logb);
+__DPCPP_SPIRV_MAP_BINARY(nextafter);
+// unsupported: nextforward
+__DPCPP_SPIRV_MAP_BINARY(copysign);
+
+/// Classification and comparison
+//
+
+// unsupported: fpclassify
+// unsupported: isfinite
+// unsupported: isinf
+// unsupported: isnan
+// unsupported: isnormal
+// unsupported: signbit
+// unsupported: isgreater
+// unsupported: isgreaterequal
+// unsupported: isless
+// unsupported: islessequal
+// unsupported: islessgreated
+// unsupported: isunordered
+
+// Now drag all of the overloads we've just defined in the std namespace. For
+// the overloads to work properly we need to ensure our namespace matches
+// exactly the one of the system C++ library.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+// <cstdlib>
+using ::div;
+using ::labs;
+using ::ldiv;
+using ::llabs;
+using ::lldiv;
+
+// Basic operations
+using ::abs;
+using ::fabs;
+using ::fabsf;
+using ::fdim;
+using ::fdimf;
+using ::fma;
+using ::fmaf;
+using ::fmax;
+using ::fmaxf;
+using ::fmin;
+using ::fminf;
+using ::fmod;
+using ::fmodf;
+using ::remainder;
+using ::remainderf;
+using ::remquo;
+using ::remquof;
+// using ::nan;
+// using ::nanf;
+
+// Exponential functions
+using ::exp;
+using ::exp2;
+using ::exp2f;
+using ::expf;
+using ::expm1;
+using ::expm1f;
+using ::log;
+using ::log10;
+using ::log10f;
+using ::log1p;
+using ::log1pf;
+using ::log2;
+using ::log2f;
+using ::logf;
+
+// Power functions
+using ::cbrt;
+using ::cbrtf;
+using ::hypot;
+using ::hypotf;
+using ::pow;
+using ::powf;
+using ::sqrt;
+using ::sqrtf;
+
+// Trigonometric functions
+using ::acos;
+using ::acosf;
+using ::asin;
+using ::asinf;
+using ::atan;
+using ::atan2;
+using ::atan2f;
+using ::atanf;
+using ::cos;
+using ::cosf;
+using ::sin;
+using ::sinf;
+using ::tan;
+using ::tanf;
+
+// Hyperbloic functions
+using ::acosh;
+using ::acoshf;
+using ::asinh;
+using ::asinhf;
+using ::atanh;
+using ::atanhf;
+using ::cosh;
+using ::coshf;
+using ::sinh;
+using ::sinhf;
+using ::tanh;
+using ::tanhf;
+
+// Error and gamma functions
+using ::erf;
+using ::erfc;
+using ::erfcf;
+using ::erff;
+using ::lgamma;
+using ::lgammaf;
+using ::tgamma;
+using ::tgammaf;
+
+// Nearest integer floating-point operations
+using ::ceil;
+using ::ceilf;
+using ::floor;
+using ::floorf;
+using ::round;
+using ::roundf;
+using ::trunc;
+using ::truncf;
+// using ::lround;
+// using ::llround;
+using ::rint;
+using ::rintf;
+// using ::lrint;
+// using ::llrint;
+
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+using ::nearbyint;
+using ::nearbyintf;
+#endif
+
+// Floating-point manipulation functions
+using ::frexp;
+using ::frexpf;
+using ::ilogb;
+using ::ilogbf;
+using ::ldexp;
+using ::ldexpf;
+using ::logb;
+using ::logbf;
+using ::modf;
+using ::modff;
+using ::nextafter;
+using ::nextafterf;
+using ::scalbln;
+using ::scalblnf;
+using ::scalbn;
+using ::scalbnf;
+// using ::nextforward
+// using ::nextforwardf
+using ::copysign;
+using ::copysignf;
+
+// Classification and comparison
+// using ::fpclassify;
+// using ::isfinite;
+// using ::isgreater;
+// using ::isgreaterequal;
+// using ::isinf;
+// using ::isless;
+// using ::islessequal;
+// using ::islessgreater;
+// using ::isnan;
+// using ::isnormal;
+// using ::isunordered;
+// using ::signbit;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#undef __DPCPP_SPIRV_MAP_BINARY
+#undef __DPCPP_SPIRV_MAP_UNARY
+#undef __DPCPP_SYCL_DEVICE_C
+#undef __DPCPP_SYCL_DEVICE
+#endif
+#endif
diff --git a/sycl/include/sycl/stl_wrappers/cmath b/sycl/include/sycl/stl_wrappers/cmath
index c25eadf6394a1..14e64c29c9c0a 100644
--- a/sycl/include/sycl/stl_wrappers/cmath
+++ b/sycl/include/sycl/stl_wrappers/cmath
@@ -1,3 +1,11 @@
+//==------------------------ cmath -----------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
 // Include real STL <cmath> header - the next one from the include search
@@ -16,6 +24,10 @@
 
 // *** <sycl/builtins.hpp> ***
 
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+#include "__sycl_cmath_wrapper_impl.hpp"
+#endif
+
 #include <sycl/detail/defines_elementary.hpp>
 
 #ifdef __SYCL_DEVICE_ONLY__
@@ -105,6 +117,8 @@ extern __DPCPP_SYCL_EXTERNAL double atanh(double x);
 extern __DPCPP_SYCL_EXTERNAL double frexp(double x, int *exp);
 extern __DPCPP_SYCL_EXTERNAL double ldexp(double x, int exp);
 extern __DPCPP_SYCL_EXTERNAL double hypot(double x, double y);
+extern __DPCPP_SYCL_EXTERNAL float rintf(float x);
+extern __DPCPP_SYCL_EXTERNAL double rint(double x);
 
 }
 #ifdef __GLIBC__
diff --git a/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp b/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp
index bda9ce2ff1ced..833b3aea6203a 100644
--- a/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp
+++ b/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp
@@ -21,13 +21,13 @@ namespace s = sycl;
 constexpr s::access::mode sycl_read = s::access::mode::read;
 constexpr s::access::mode sycl_write = s::access::mode::write;
 
-#define TEST_NUM 74
+#define TEST_NUM 75
 
 double ref[TEST_NUM] = {
-    6, 100, 0.5, 1.0, 0, 0, -2, 1, 2, 1,   1,   1,   0,   1, 1, 0, 0, 0, 0,
-    0, 1,   1,   0.5, 0, 2, 0,  0, 1, 0,   2,   0,   0,   0, 0, 0, 1, 0, 1,
-    2, 0,   1,   2,   5, 0, 0,  0, 0, 0.5, 0.5, NAN, NAN, 2, 0, 0, 0, 0, 0,
-    0, 0,   0,   0,   0, 0, 0,  0, 0, 0,   0,   0,   0,   0, 0, 0, 0};
+    1.0, 6, 100, 0.5, 1.0, 0, 0, -2, 1, 2, 1,   1,   1,   0,   1, 1, 0, 0, 0,
+    0,   0, 1,   1,   0.5, 0, 2, 0,  0, 1, 0,   2,   0,   0,   0, 0, 0, 1, 0,
+    1,   2, 0,   1,   2,   5, 0, 0,  0, 0, 0.5, 0.5, NAN, NAN, 2, 0, 0, 0, 0,
+    0,   0, 0,   0,   0,   0, 0, 0,  0, 0, 0,   0,   0,   0,   0, 0, 0, 0};
 
 double refIptr = 1;
 
@@ -62,6 +62,8 @@ template <class T> void device_cmath_test(s::queue &deviceQueue) {
         T minus_infinity = -INFINITY;
         double subnormal;
         *((uint64_t *)&subnormal) = 0xFFFFFFFFFFFFFULL;
+
+        res_access[i++] = std::rint(0.9);
         res_access[i++] = std::scalbln(1.5, 2);
         res_access[i++] = sycl::exp10(2.0);
         res_access[i++] = sycl::rsqrt(4.0);
diff --git a/sycl/test-e2e/DeviceLib/cmath_test.cpp b/sycl/test-e2e/DeviceLib/cmath_test.cpp
index 6a453e56f704f..c5c58f09023d4 100644
--- a/sycl/test-e2e/DeviceLib/cmath_test.cpp
+++ b/sycl/test-e2e/DeviceLib/cmath_test.cpp
@@ -24,13 +24,13 @@ namespace s = sycl;
 constexpr s::access::mode sycl_read = s::access::mode::read;
 constexpr s::access::mode sycl_write = s::access::mode::write;
 
-#define TEST_NUM 70
+#define TEST_NUM 71
 
-float ref[TEST_NUM] = {100, 0.5, 1.0, 0,   0,   -2, 1,   2, 1, 1, 0, 1, 1, 0,
-                       0,   0,   0,   0,   1,   1,  0.5, 0, 0, 1, 0, 2, 0, 0,
-                       0,   0,   0,   1,   0,   1,  2,   0, 1, 2, 5, 0, 0, 0,
-                       0,   0.5, 0.5, NAN, NAN, 2,  0,   0, 0, 0, 0, 0, 0, 0,
-                       0,   0,   0,   0,   0,   0,  0,   0, 0, 0, 0, 0, 0, 0};
+float ref[TEST_NUM] = {1.0f, 100, 0.5, 1.0, 0, 0, -2,  1, 2, 1, 1, 0, 1, 1, 0,
+                       0,    0,   0,   0,   1, 1, 0.5, 0, 0, 1, 0, 2, 0, 0, 0,
+                       0,    0,   1,   0,   1, 2, 0,   1, 2, 5, 0, 0, 0, 0, 0.5,
+                       0.5,  NAN, NAN, 2,   0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
+                       0,    0,   0,   0,   0, 0, 0,   0, 0, 0, 0};
 
 float refIptr = 1;
 
@@ -60,6 +60,7 @@ template <class T> void device_cmath_test_1(s::queue &deviceQueue) {
         float subnormal;
         *((uint32_t *)&subnormal) = 0x7FFFFF;
 
+        res_access[i++] = std::rint(0.9f);
         res_access[i++] = sycl::exp10(2.0f);
         res_access[i++] = sycl::rsqrt(4.0f);
         res_access[i++] = std::trunc(1.2f);
diff --git a/sycl/test-e2e/DeviceLib/math_fp64_test.cpp b/sycl/test-e2e/DeviceLib/math_fp64_test.cpp
index 86dad1b3c0d3f..5f160b9f2a7b1 100644
--- a/sycl/test-e2e/DeviceLib/math_fp64_test.cpp
+++ b/sycl/test-e2e/DeviceLib/math_fp64_test.cpp
@@ -20,12 +20,12 @@ namespace s = sycl;
 constexpr s::access::mode sycl_read = s::access::mode::read;
 constexpr s::access::mode sycl_write = s::access::mode::write;
 
-#define TEST_NUM 61
+#define TEST_NUM 62
 
 double ref_val[TEST_NUM] = {
-    1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5, 0, 2, 0, 0,   1,   0,   2,   0, 0, 0, 0,
-    0, 1, 0, 1, 2, 0, 1, 2, 5, 0,   0, 0, 0, 0.5, 0.5, NAN, NAN, 2, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,   0,   0,   0,   0, 0};
+    1.0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5, 0, 2, 0, 0,   1,   0,   2,   0, 0, 0,
+    0,   0, 1, 0, 1, 2, 0, 1, 2, 5, 0,   0, 0, 0, 0.5, 0.5, NAN, NAN, 2, 0, 0,
+    0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,   0,   0,   0,   0, 0};
 
 double refIptr = 1;
 
@@ -61,6 +61,7 @@ void device_math_test(s::queue &deviceQueue) {
         double subnormal;
         *((uint64_t *)&subnormal) = 0xFFFFFFFFFFFFFULL;
 
+        res_access[i++] = rint(0.9);
         res_access[i++] = cos(0.0);
         res_access[i++] = sin(0.0);
         res_access[i++] = log(1.0);
diff --git a/sycl/test-e2e/DeviceLib/math_test.cpp b/sycl/test-e2e/DeviceLib/math_test.cpp
index 029409b617473..c9a98f468225d 100644
--- a/sycl/test-e2e/DeviceLib/math_test.cpp
+++ b/sycl/test-e2e/DeviceLib/math_test.cpp
@@ -18,12 +18,12 @@ namespace s = sycl;
 constexpr s::access::mode sycl_read = s::access::mode::read;
 constexpr s::access::mode sycl_write = s::access::mode::write;
 
-#define TEST_NUM 59
+#define TEST_NUM 60
 
 float ref_val[TEST_NUM] = {
-    1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5, 0, 0, 1,   0,   2,   0,   0, 0, 0, 0,
-    1, 0, 1, 2, 0, 1, 2, 5, 0, 0,   0, 0, 0.5, 0.5, NAN, NAN, 2, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0,   0,   0,   0,   0, 0, 0};
+    1.0f, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5, 0, 0, 1,   0,   2,   0,   0, 0, 0,
+    0,    1, 0, 1, 2, 0, 1, 2, 5, 0, 0,   0, 0, 0.5, 0.5, NAN, NAN, 2, 0, 0,
+    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0,   0,   0,   0,   0, 0, 0};
 
 float refIptr = 1;
 
@@ -53,6 +53,7 @@ void device_math_test(s::queue &deviceQueue) {
         float subnormal;
         *((uint32_t *)&subnormal) = 0x7FFFFF;
 
+        res_access[i++] = rintf(0.9);
         res_access[i++] = cosf(0.0f);
         res_access[i++] = sinf(0.0f);
         res_access[i++] = logf(1.0f);
diff --git a/sycl/test/check_device_code/math-builtins/cmath-fallback.cpp b/sycl/test/check_device_code/math-builtins/cmath-fallback.cpp
new file mode 100644
index 0000000000000..1da771bfc1861
--- /dev/null
+++ b/sycl/test/check_device_code/math-builtins/cmath-fallback.cpp
@@ -0,0 +1,481 @@
+// REQUIRES: cuda
+// Note: This isn't really target specific and should be switched to spir when
+// it's enabled for it.
+
+// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm -fsycl-device-only %s -o - | FileCheck %s
+
+#include <cmath>
+
+// CHECK-LABEL: entry
+__attribute__((sycl_device)) void entry(float *fp, double *dp, int *ip,
+                                        long *lp, long long *llp, float *rf,
+                                        double *rd, int *ri) {
+  // Use an incrementing index to prevent the compiler from optimizing some
+  // calls that would store to the same address.
+  int idx = 0;
+
+  // For each supported standard math built-in, we test that the following
+  // overloads are properly mapped to __spirv_ built-ins:
+  //
+  // * Float only.
+  // * Float only with 'f' suffix.
+  // * Double only.
+  // * Integer promotion.
+  // * Mixed floating point promotion (when applicable).
+  //
+  // CHECK: __spirv_ocl_fmodff
+  rf[idx++] = std::fmod(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_fmodff
+  rf[idx++] = std::fmodf(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_fmoddd
+  rd[idx++] = std::fmod(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_fmoddd
+  rd[idx++] = std::fmod(fp[0], ip[1]);
+  // CHECK: __spirv_ocl_fmoddd
+  rd[idx++] = std::fmod(fp[0], dp[1]);
+
+  // CHECK: __spirv_ocl_remainderff
+  rf[idx++] = std::remainder(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_remainderff
+  rf[idx++] = std::remainderf(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_remainderdd
+  rd[idx++] = std::remainder(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_remainderdd
+  rd[idx++] = std::remainder(fp[0], ip[1]);
+  // CHECK: __spirv_ocl_remainderdd
+  rd[idx++] = std::remainder(fp[0], dp[1]);
+
+  // CHECK: __spirv_ocl_remquoff
+  rf[idx++] = std::remquo(fp[0], fp[1], ip);
+  // CHECK: __spirv_ocl_remquoff
+  rf[idx++] = std::remquof(fp[2], fp[1], ip);
+  // CHECK: __spirv_ocl_remquodd
+  rd[idx++] = std::remquo(dp[0], dp[1], ip);
+  // CHECK: __spirv_ocl_remquodd
+  rd[idx++] = std::remquo(fp[0], ip[1], ip);
+  // CHECK: __spirv_ocl_remquodd
+  rd[idx++] = std::remquo(fp[0], dp[1], ip);
+
+  // CHECK: __spirv_ocl_fmaff
+  rf[idx++] = std::fma(fp[0], fp[1], fp[2]);
+  // CHECK: __spirv_ocl_fmaff
+  rf[idx++] = std::fmaf(fp[3], fp[1], fp[2]);
+  // CHECK: __spirv_ocl_fmadd
+  rd[idx++] = std::fma(dp[0], dp[1], dp[2]);
+  // CHECK: __spirv_ocl_fmadd
+  rd[idx++] = std::fma(fp[0], ip[1], fp[2]);
+  // CHECK: __spirv_ocl_fmadd
+  rd[idx++] = std::fma(fp[0], dp[1], fp[2]);
+
+  // CHECK: __spirv_ocl_fmaxff
+  rf[idx++] = std::fmax(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_fmaxff
+  rf[idx++] = std::fmaxf(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_fmaxdd
+  rd[idx++] = std::fmax(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_fmaxdd
+  rd[idx++] = std::fmax(fp[0], ip[1]);
+  // CHECK: __spirv_ocl_fmaxdd
+  rd[idx++] = std::fmax(fp[0], dp[1]);
+
+  // CHECK: __spirv_ocl_fminff
+  rf[idx++] = std::fmin(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_fminff
+  rf[idx++] = std::fminf(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_fmindd
+  rd[idx++] = std::fmin(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_fmindd
+  rd[idx++] = std::fmin(fp[0], ip[1]);
+  // CHECK: __spirv_ocl_fmindd
+  rd[idx++] = std::fmin(fp[0], dp[1]);
+
+  // CHECK: __spirv_ocl_fdimff
+  rf[idx++] = std::fdim(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_fdimff
+  rf[idx++] = std::fdimf(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_fdimdd
+  rd[idx++] = std::fdim(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_fdimdd
+  rd[idx++] = std::fdim(fp[0], ip[1]);
+  // CHECK: __spirv_ocl_fdimdd
+  rd[idx++] = std::fdim(fp[0], dp[1]);
+
+  // CHECK: __spirv_ocl_expf
+  rf[idx++] = std::exp(fp[0]);
+  // CHECK: __spirv_ocl_expf
+  rf[idx++] = std::expf(fp[1]);
+  // CHECK: __spirv_ocl_expd
+  rd[idx++] = std::exp(dp[0]);
+  // CHECK: __spirv_ocl_expd
+  rd[idx++] = std::exp(ip[0]);
+
+  // CHECK: __spirv_ocl_exp2f
+  rf[idx++] = std::exp2(fp[0]);
+  // CHECK: __spirv_ocl_exp2f
+  rf[idx++] = std::exp2f(fp[1]);
+  // CHECK: __spirv_ocl_exp2d
+  rd[idx++] = std::exp2(dp[0]);
+  // CHECK: __spirv_ocl_exp2d
+  rd[idx++] = std::exp2(ip[0]);
+
+  // CHECK: __spirv_ocl_expm1f
+  rf[idx++] = std::expm1(fp[0]);
+  // CHECK: __spirv_ocl_expm1f
+  rf[idx++] = std::expm1f(fp[1]);
+  // CHECK: __spirv_ocl_expm1d
+  rd[idx++] = std::expm1(dp[0]);
+  // CHECK: __spirv_ocl_expm1d
+  rd[idx++] = std::expm1(ip[0]);
+
+  // CHECK: __spirv_ocl_logf
+  rf[idx++] = std::log(fp[0]);
+  // CHECK: __spirv_ocl_logf
+  rf[idx++] = std::logf(fp[1]);
+  // CHECK: __spirv_ocl_logd
+  rd[idx++] = std::log(dp[0]);
+  // CHECK: __spirv_ocl_logd
+  rd[idx++] = std::log(ip[0]);
+
+  // CHECK: __spirv_ocl_log10f
+  rf[idx++] = std::log10(fp[0]);
+  // CHECK: __spirv_ocl_log10f
+  rf[idx++] = std::log10f(fp[1]);
+  // CHECK: __spirv_ocl_log10d
+  rd[idx++] = std::log10(dp[0]);
+  // CHECK: __spirv_ocl_log10d
+  rd[idx++] = std::log10(ip[0]);
+
+  // CHECK: __spirv_ocl_log2f
+  rf[idx++] = std::log2(fp[0]);
+  // CHECK: __spirv_ocl_log2f
+  rf[idx++] = std::log2f(fp[1]);
+  // CHECK: __spirv_ocl_log2d
+  rd[idx++] = std::log2(dp[0]);
+  // CHECK: __spirv_ocl_log2d
+  rd[idx++] = std::log2(ip[0]);
+
+  // CHECK: __spirv_ocl_log1pf
+  rf[idx++] = std::log1p(fp[0]);
+  // CHECK: __spirv_ocl_log1pf
+  rf[idx++] = std::log1pf(fp[1]);
+  // CHECK: __spirv_ocl_log1pd
+  rd[idx++] = std::log1p(dp[0]);
+  // CHECK: __spirv_ocl_log1pd
+  rd[idx++] = std::log1p(ip[0]);
+
+  // CHECK: __spirv_ocl_powff
+  rf[idx++] = std::pow(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_powff
+  rf[idx++] = std::powf(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_powdd
+  rd[idx++] = std::pow(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_powdd
+  rd[idx++] = std::pow(ip[0], fp[1]);
+  // CHECK: __spirv_ocl_powdd
+  rd[idx++] = std::pow(dp[0], fp[1]);
+
+  // CHECK: __spirv_ocl_sqrtf
+  rf[idx++] = std::sqrt(fp[0]);
+  // CHECK: __spirv_ocl_sqrtf
+  rf[idx++] = std::sqrtf(fp[1]);
+  // CHECK: __spirv_ocl_sqrtd
+  rd[idx++] = std::sqrt(dp[0]);
+  // CHECK: __spirv_ocl_sqrtd
+  rd[idx++] = std::sqrt(ip[0]);
+
+  // CHECK: __spirv_ocl_cbrtf
+  rf[idx++] = std::cbrt(fp[0]);
+  // CHECK: __spirv_ocl_cbrtf
+  rf[idx++] = std::cbrtf(fp[1]);
+  // CHECK: __spirv_ocl_cbrtd
+  rd[idx++] = std::cbrt(dp[0]);
+  // CHECK: __spirv_ocl_cbrtd
+  rd[idx++] = std::cbrt(ip[0]);
+
+  // CHECK: __spirv_ocl_hypotff
+  rf[idx++] = std::hypot(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_hypotff
+  rf[idx++] = std::hypotf(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_hypotdd
+  rd[idx++] = std::hypot(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_hypotdd
+  rd[idx++] = std::hypot(ip[0], fp[1]);
+  // CHECK: __spirv_ocl_hypotdd
+  rd[idx++] = std::hypot(dp[0], fp[1]);
+
+  // CHECK: __spirv_ocl_sinf
+  rf[idx++] = std::sin(fp[0]);
+  // CHECK: __spirv_ocl_sinf
+  rf[idx++] = std::sinf(fp[1]);
+  // CHECK: __spirv_ocl_sind
+  rd[idx++] = std::sin(dp[0]);
+  // CHECK: __spirv_ocl_sind
+  rd[idx++] = std::sin(ip[0]);
+
+  // CHECK: __spirv_ocl_cosf
+  rf[idx++] = std::cos(fp[0]);
+  // CHECK: __spirv_ocl_cosf
+  rf[idx++] = std::cosf(fp[1]);
+  // CHECK: __spirv_ocl_cosd
+  rd[idx++] = std::cos(dp[0]);
+  // CHECK: __spirv_ocl_cosd
+  rd[idx++] = std::cos(ip[0]);
+
+  // CHECK: __spirv_ocl_tanf
+  rf[idx++] = std::tan(fp[0]);
+  // CHECK: __spirv_ocl_tanf
+  rf[idx++] = std::tanf(fp[1]);
+  // CHECK: __spirv_ocl_tand
+  rd[idx++] = std::tan(dp[0]);
+  // CHECK: __spirv_ocl_tand
+  rd[idx++] = std::tan(ip[0]);
+
+  // CHECK: __spirv_ocl_asinf
+  rf[idx++] = std::asin(fp[0]);
+  // CHECK: __spirv_ocl_asinf
+  rf[idx++] = std::asinf(fp[1]);
+  // CHECK: __spirv_ocl_asind
+  rd[idx++] = std::asin(dp[0]);
+  // CHECK: __spirv_ocl_asind
+  rd[idx++] = std::asin(ip[0]);
+
+  // CHECK: __spirv_ocl_acosf
+  rf[idx++] = std::acos(fp[0]);
+  // CHECK: __spirv_ocl_acosf
+  rf[idx++] = std::acosf(fp[1]);
+  // CHECK: __spirv_ocl_acosd
+  rd[idx++] = std::acos(dp[0]);
+  // CHECK: __spirv_ocl_acosd
+  rd[idx++] = std::acos(ip[0]);
+
+  // CHECK: __spirv_ocl_atanf
+  rf[idx++] = std::atan(fp[0]);
+  // CHECK: __spirv_ocl_atanf
+  rf[idx++] = std::atanf(fp[1]);
+  // CHECK: __spirv_ocl_atand
+  rd[idx++] = std::atan(dp[0]);
+  // CHECK: __spirv_ocl_atand
+  rd[idx++] = std::atan(ip[0]);
+
+  // CHECK: __spirv_ocl_atan2ff
+  rf[idx++] = std::atan2(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_atan2ff
+  rf[idx++] = std::atan2f(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_atan2dd
+  rd[idx++] = std::atan2(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_atan2dd
+  rd[idx++] = std::atan2(ip[0], fp[1]);
+  // CHECK: __spirv_ocl_atan2dd
+  rd[idx++] = std::atan2(dp[0], fp[1]);
+
+  // CHECK: __spirv_ocl_sinhf
+  rf[idx++] = std::sinh(fp[0]);
+  // CHECK: __spirv_ocl_sinhf
+  rf[idx++] = std::sinhf(fp[1]);
+  // CHECK: __spirv_ocl_sinhd
+  rd[idx++] = std::sinh(dp[0]);
+  // CHECK: __spirv_ocl_sinhd
+  rd[idx++] = std::sinh(ip[0]);
+
+  // CHECK: __spirv_ocl_coshf
+  rf[idx++] = std::cosh(fp[0]);
+  // CHECK: __spirv_ocl_coshf
+  rf[idx++] = std::coshf(fp[1]);
+  // CHECK: __spirv_ocl_coshd
+  rd[idx++] = std::cosh(dp[0]);
+  // CHECK: __spirv_ocl_coshd
+  rd[idx++] = std::cosh(ip[0]);
+
+  // CHECK: __spirv_ocl_tanhf
+  rf[idx++] = std::tanh(fp[0]);
+  // CHECK: __spirv_ocl_tanhf
+  rf[idx++] = std::tanhf(fp[1]);
+  // CHECK: __spirv_ocl_tanhd
+  rd[idx++] = std::tanh(dp[0]);
+  // CHECK: __spirv_ocl_tanhd
+  rd[idx++] = std::tanh(ip[0]);
+
+  // CHECK: __spirv_ocl_asinhf
+  rf[idx++] = std::asinh(fp[0]);
+  // CHECK: __spirv_ocl_asinhf
+  rf[idx++] = std::asinhf(fp[1]);
+  // CHECK: __spirv_ocl_asinhd
+  rd[idx++] = std::asinh(dp[0]);
+  // CHECK: __spirv_ocl_asinhd
+  rd[idx++] = std::asinh(ip[0]);
+
+  // CHECK: __spirv_ocl_acoshf
+  rf[idx++] = std::acosh(fp[0]);
+  // CHECK: __spirv_ocl_acoshf
+  rf[idx++] = std::acoshf(fp[1]);
+  // CHECK: __spirv_ocl_acoshd
+  rd[idx++] = std::acosh(dp[0]);
+  // CHECK: __spirv_ocl_acoshd
+  rd[idx++] = std::acosh(ip[0]);
+
+  // CHECK: __spirv_ocl_atanhf
+  rf[idx++] = std::atanh(fp[0]);
+  // CHECK: __spirv_ocl_atanhf
+  rf[idx++] = std::atanhf(fp[1]);
+  // CHECK: __spirv_ocl_atanhd
+  rd[idx++] = std::atanh(dp[0]);
+  // CHECK: __spirv_ocl_atanhd
+  rd[idx++] = std::atanh(ip[0]);
+
+  // CHECK: __spirv_ocl_erff
+  rf[idx++] = std::erf(fp[0]);
+  // CHECK: __spirv_ocl_erff
+  rf[idx++] = std::erff(fp[1]);
+  // CHECK: __spirv_ocl_erfd
+  rd[idx++] = std::erf(dp[0]);
+  // CHECK: __spirv_ocl_erfd
+  rd[idx++] = std::erf(ip[0]);
+
+  // CHECK: __spirv_ocl_erfcf
+  rf[idx++] = std::erfc(fp[0]);
+  // CHECK: __spirv_ocl_erfcf
+  rf[idx++] = std::erfcf(fp[1]);
+  // CHECK: __spirv_ocl_erfcd
+  rd[idx++] = std::erfc(dp[0]);
+  // CHECK: __spirv_ocl_erfcd
+  rd[idx++] = std::erfc(ip[0]);
+
+  // CHECK: __spirv_ocl_tgammaf
+  rf[idx++] = std::tgamma(fp[0]);
+  // CHECK: __spirv_ocl_tgammaf
+  rf[idx++] = std::tgammaf(fp[1]);
+  // CHECK: __spirv_ocl_tgammad
+  rd[idx++] = std::tgamma(dp[0]);
+  // CHECK: __spirv_ocl_tgammad
+  rd[idx++] = std::tgamma(ip[0]);
+
+  // CHECK: __spirv_ocl_lgammaf
+  rf[idx++] = std::lgamma(fp[0]);
+  // CHECK: __spirv_ocl_lgammaf
+  rf[idx++] = std::lgammaf(fp[1]);
+  // CHECK: __spirv_ocl_lgammad
+  rd[idx++] = std::lgamma(dp[0]);
+  // CHECK: __spirv_ocl_lgammad
+  rd[idx++] = std::lgamma(ip[0]);
+
+  // CHECK: __spirv_ocl_ceilf
+  rf[idx++] = std::ceil(fp[0]);
+  // CHECK: __spirv_ocl_ceilf
+  rf[idx++] = std::ceilf(fp[1]);
+  // CHECK: __spirv_ocl_ceild
+  rd[idx++] = std::ceil(dp[0]);
+  // CHECK: __spirv_ocl_ceild
+  rd[idx++] = std::ceil(ip[0]);
+
+  // CHECK: __spirv_ocl_floorf
+  rf[idx++] = std::floor(fp[0]);
+  // CHECK: __spirv_ocl_floorf
+  rf[idx++] = std::floorf(fp[1]);
+  // CHECK: __spirv_ocl_floord
+  rd[idx++] = std::floor(dp[0]);
+  // CHECK: __spirv_ocl_floord
+  rd[idx++] = std::floor(ip[0]);
+
+  // CHECK: __spirv_ocl_truncf
+  rf[idx++] = std::trunc(fp[0]);
+  // CHECK: __spirv_ocl_truncf
+  rf[idx++] = std::truncf(fp[1]);
+  // CHECK: __spirv_ocl_truncd
+  rd[idx++] = std::trunc(dp[0]);
+  // CHECK: __spirv_ocl_truncd
+  rd[idx++] = std::trunc(ip[0]);
+
+  // CHECK: __spirv_ocl_roundf
+  rf[idx++] = std::round(fp[0]);
+  // CHECK: __spirv_ocl_roundf
+  rf[idx++] = std::roundf(fp[1]);
+  // CHECK: __spirv_ocl_roundd
+  rd[idx++] = std::round(dp[0]);
+  // CHECK: __spirv_ocl_roundd
+  rd[idx++] = std::round(ip[0]);
+
+  // CHECK: __spirv_ocl_rintf
+  rf[idx++] = std::rint(fp[0]);
+  // CHECK: __spirv_ocl_rintf
+  rf[idx++] = std::rintf(fp[1]);
+  // CHECK: __spirv_ocl_rintd
+  rd[idx++] = std::rint(dp[0]);
+  // CHECK: __spirv_ocl_rintd
+  rd[idx++] = std::rint(ip[0]);
+
+  // CHECK: __spirv_ocl_frexpf
+  rf[idx++] = std::frexp(fp[0], ip);
+  // CHECK: __spirv_ocl_frexpf
+  rf[idx++] = std::frexpf(fp[1], ip);
+  // CHECK: __spirv_ocl_frexpd
+  rd[idx++] = std::frexp(dp[0], ip);
+  // CHECK: __spirv_ocl_frexpd
+  rd[idx++] = std::frexp(ip[0], ip);
+
+  // CHECK: __spirv_ocl_ldexpf
+  rf[idx++] = std::ldexp(fp[0], ip[0]);
+  // CHECK: __spirv_ocl_ldexpf
+  rf[idx++] = std::ldexpf(fp[1], ip[0]);
+  // CHECK: __spirv_ocl_ldexpd
+  rd[idx++] = std::ldexp(dp[0], ip[0]);
+  // CHECK: __spirv_ocl_ldexpd
+  rd[idx++] = std::ldexp(ip[0], ip[0]);
+
+  // CHECK: __spirv_ocl_modff
+  rf[idx++] = std::modf(fp[0], fp);
+  // CHECK: __spirv_ocl_modff
+  rf[idx++] = std::modff(fp[1], fp);
+  // CHECK: __spirv_ocl_modfd
+  rd[idx++] = std::modf(dp[0], dp);
+  // CHECK: __spirv_ocl_modfd
+  rd[idx++] = std::modf(ip[0], dp);
+
+  // CHECK: __spirv_ocl_ldexpf
+  rf[idx++] = std::scalbn(fp[0], ip[0]);
+  // CHECK: __spirv_ocl_ldexpf
+  rf[idx++] = std::scalbnf(fp[1], ip[0]);
+  // CHECK: __spirv_ocl_ldexpd
+  rd[idx++] = std::scalbn(dp[0], ip[0]);
+  // CHECK: __spirv_ocl_ldexpd
+  rd[idx++] = std::scalbn(ip[0], ip[0]);
+
+  // CHECK: __spirv_ocl_ilogbf
+  ri[idx++] = std::ilogb(fp[0]);
+  // CHECK: __spirv_ocl_ilogbf
+  ri[idx++] = std::ilogbf(fp[1]);
+  // CHECK: __spirv_ocl_ilogbd
+  ri[idx++] = std::ilogb(dp[0]);
+  // CHECK: __spirv_ocl_ilogbd
+  ri[idx++] = std::ilogb(ip[0]);
+
+  // CHECK: __spirv_ocl_logbf
+  rf[idx++] = std::logb(fp[0]);
+  // CHECK: __spirv_ocl_logbf
+  rf[idx++] = std::logbf(fp[1]);
+  // CHECK: __spirv_ocl_logbd
+  rd[idx++] = std::logb(dp[0]);
+  // CHECK: __spirv_ocl_logbd
+  rd[idx++] = std::logb(ip[0]);
+
+  // CHECK: __spirv_ocl_nextafterf
+  rf[idx++] = std::nextafter(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_nextafterf
+  rf[idx++] = std::nextafterf(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_nextafterd
+  rd[idx++] = std::nextafter(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_nextafterd
+  rd[idx++] = std::nextafter(ip[0], fp[1]);
+  // CHECK: __spirv_ocl_nextafterd
+  rd[idx++] = std::nextafter(dp[0], fp[1]);
+
+  // CHECK: __spirv_ocl_copysignf
+  rf[idx++] = std::copysign(fp[0], fp[1]);
+  // CHECK: __spirv_ocl_copysignf
+  rf[idx++] = std::copysignf(fp[2], fp[1]);
+  // CHECK: __spirv_ocl_copysignd
+  rd[idx++] = std::copysign(dp[0], dp[1]);
+  // CHECK: __spirv_ocl_copysignd
+  rd[idx++] = std::copysign(ip[0], fp[1]);
+  // CHECK: __spirv_ocl_copysignd
+  rd[idx++] = std::copysign(dp[0], fp[1]);
+}