From 40ce085598dc2f9966ed1e6a75544fd452738d2a Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sat, 19 Oct 2024 18:17:54 +0200
Subject: [PATCH 1/2] Warper: improve (a bit) performance of multi-band bicubic
 warping

On extracted2.tif dataset from
https://github.com/OSGeo/gdal/issues/11042#issuecomment-2420556574 (12500 x 10000 pixels, 3 bands, Byte)
``gdalwarp -s_srs EPSG:8353 -t_srs EPSG:3857 -r cubic extracted2.tif tmp.tif -overwrite``
goes from 30.4 seconds to 26.0.
---
 alg/gdalwarpkernel.cpp | 307 +++++++++++++++++++++++------------------
 1 file changed, 169 insertions(+), 138 deletions(-)
diff --git a/alg/gdalwarpkernel.cpp b/alg/gdalwarpkernel.cpp
index 565a6657a189..9053cb24cddf 100644
--- a/alg/gdalwarpkernel.cpp
+++ b/alg/gdalwarpkernel.cpp
@@ -4100,14 +4100,76 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
     return true;
 }
 
+/************************************************************************/
+/*                        GWKComputeWeights()                           */
+/************************************************************************/
+
+static void GWKComputeWeights(GDALResampleAlg eResample, int iMin, int iMax,
+                              double dfDeltaX, double dfXScale, int jMin,
+                              int jMax, double dfDeltaY, double dfYScale,
+                              double *padfWeightsHorizontal,
+                              double *padfWeightsVertical, double &dfInvWeights)
+{
+
+    const FilterFuncType pfnGetWeight = apfGWKFilter[eResample];
+    CPLAssert(pfnGetWeight);
+    const FilterFunc4ValuesType pfnGetWeight4Values =
+        apfGWKFilter4Values[eResample];
+    CPLAssert(pfnGetWeight4Values);
+
+    int i = iMin;  // Used after for.
+    int iC = 0;    // Used after for.
+    double dfAccumulatorWeightHorizontal = 0.0;
+    for (; i + 2 < iMax; i += 4, iC += 4)
+    {
+        padfWeightsHorizontal[iC] = (i - dfDeltaX) * dfXScale;
+        padfWeightsHorizontal[iC + 1] = padfWeightsHorizontal[iC] + dfXScale;
+        padfWeightsHorizontal[iC + 2] =
+            padfWeightsHorizontal[iC + 1] + dfXScale;
+        padfWeightsHorizontal[iC + 3] =
+            padfWeightsHorizontal[iC + 2] + dfXScale;
+        dfAccumulatorWeightHorizontal +=
+            pfnGetWeight4Values(padfWeightsHorizontal + iC);
+    }
+    for (; i <= iMax; ++i, ++iC)
+    {
+        const double dfWeight = pfnGetWeight((i - dfDeltaX) * dfXScale);
+        padfWeightsHorizontal[iC] = dfWeight;
+        dfAccumulatorWeightHorizontal += dfWeight;
+    }
+
+    int j = jMin;  // Used after for.
+    int jC = 0;    // Used after for.
+    double dfAccumulatorWeightVertical = 0.0;
+    for (; j + 2 < jMax; j += 4, jC += 4)
+    {
+        padfWeightsVertical[jC] = (j - dfDeltaY) * dfYScale;
+        padfWeightsVertical[jC + 1] = padfWeightsVertical[jC] + dfYScale;
+        padfWeightsVertical[jC + 2] = padfWeightsVertical[jC + 1] + dfYScale;
+        padfWeightsVertical[jC + 3] = padfWeightsVertical[jC + 2] + dfYScale;
+        dfAccumulatorWeightVertical +=
+            pfnGetWeight4Values(padfWeightsVertical + jC);
+    }
+    for (; j <= jMax; ++j, ++jC)
+    {
+        const double dfWeight = pfnGetWeight((j - dfDeltaY) * dfYScale);
+        padfWeightsVertical[jC] = dfWeight;
+        dfAccumulatorWeightVertical += dfWeight;
+    }
+
+    dfInvWeights =
+        1. / (dfAccumulatorWeightHorizontal * dfAccumulatorWeightVertical);
+}
+
 /************************************************************************/
 /*                        GWKResampleNoMasksT()                         */
 /************************************************************************/
 
 template <class T>
-static bool GWKResampleNoMasksT(const GDALWarpKernel *poWK, int iBand,
-                                double dfSrcX, double dfSrcY, T *pValue,
-                                double *padfWeight)
+static bool
+GWKResampleNoMasksT(const GDALWarpKernel *poWK, int iBand, double dfSrcX,
+                    double dfSrcY, T *pValue, double *padfWeightsHorizontal,
+                    double *padfWeightsVertical, double &dfInvWeights)
 
 {
     // Commonly used; save locally.
@@ -4132,52 +4194,33 @@ static bool GWKResampleNoMasksT(const GDALWarpKernel *poWK, int iBand,
     const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
     const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
 
-    const FilterFuncType pfnGetWeight = apfGWKFilter[poWK->eResample];
-    CPLAssert(pfnGetWeight);
-    const FilterFunc4ValuesType pfnGetWeight4Values =
-        apfGWKFilter4Values[poWK->eResample];
-    CPLAssert(pfnGetWeight4Values);
-
     const double dfXScale = std::min(poWK->dfXScale, 1.0);
     const double dfYScale = std::min(poWK->dfYScale, 1.0);
 
-    // Loop over all rows in the kernel.
-    double dfAccumulatorWeightHorizontal = 0.0;
-    double dfAccumulatorWeightVertical = 0.0;
-
     int iMin = 1 - nXRadius;
     if (iSrcX + iMin < 0)
         iMin = -iSrcX;
     int iMax = nXRadius;
     if (iSrcX + iMax >= nSrcXSize - 1)
         iMax = nSrcXSize - 1 - iSrcX;
-    int i = iMin;  // Used after for.
-    int iC = 0;    // Used after for.
-    for (; i + 2 < iMax; i += 4, iC += 4)
-    {
-        padfWeight[iC] = (i - dfDeltaX) * dfXScale;
-        padfWeight[iC + 1] = padfWeight[iC] + dfXScale;
-        padfWeight[iC + 2] = padfWeight[iC + 1] + dfXScale;
-        padfWeight[iC + 3] = padfWeight[iC + 2] + dfXScale;
-        dfAccumulatorWeightHorizontal += pfnGetWeight4Values(padfWeight + iC);
-    }
-    for (; i <= iMax; ++i, ++iC)
-    {
-        const double dfWeight = pfnGetWeight((i - dfDeltaX) * dfXScale);
-        padfWeight[iC] = dfWeight;
-        dfAccumulatorWeightHorizontal += dfWeight;
-    }
 
-    int j = 1 - nYRadius;
-    if (iSrcY + j < 0)
-        j = -iSrcY;
+    int jMin = 1 - nYRadius;
+    if (iSrcY + jMin < 0)
+        jMin = -iSrcY;
     int jMax = nYRadius;
     if (iSrcY + jMax >= nSrcYSize - 1)
         jMax = nSrcYSize - 1 - iSrcY;
 
-    double dfAccumulator = 0.0;
+    if (iBand == 0)
+    {
+        GWKComputeWeights(poWK->eResample, iMin, iMax, dfDeltaX, dfXScale, jMin,
+                          jMax, dfDeltaY, dfYScale, padfWeightsHorizontal,
+                          padfWeightsVertical, dfInvWeights);
+    }
 
-    for (; j <= jMax; ++j)
+    // Loop over all rows in the kernel.
+    double dfAccumulator = 0.0;
+    for (int jC = 0, j = jMin; j <= jMax; ++j, ++jC)
     {
         const GPtrDiff_t iSampJ =
             iSrcOffset + static_cast<GPtrDiff_t>(j) * nSrcXSize;
@@ -4185,42 +4228,41 @@ static bool GWKResampleNoMasksT(const GDALWarpKernel *poWK, int iBand,
         // Loop over all pixels in the row.
         double dfAccumulatorLocal = 0.0;
         double dfAccumulatorLocal2 = 0.0;
-        iC = 0;
-        i = iMin;
+        int iC = 0;
+        int i = iMin;
         // Process by chunk of 4 cols.
         for (; i + 2 < iMax; i += 4, iC += 4)
         {
             // Retrieve the pixel & accumulate.
-            dfAccumulatorLocal += pSrcBand[i + iSampJ] * padfWeight[iC];
-            dfAccumulatorLocal += pSrcBand[i + 1 + iSampJ] * padfWeight[iC + 1];
+            dfAccumulatorLocal +=
+                pSrcBand[i + iSampJ] * padfWeightsHorizontal[iC];
+            dfAccumulatorLocal +=
+                pSrcBand[i + 1 + iSampJ] * padfWeightsHorizontal[iC + 1];
             dfAccumulatorLocal2 +=
-                pSrcBand[i + 2 + iSampJ] * padfWeight[iC + 2];
+                pSrcBand[i + 2 + iSampJ] * padfWeightsHorizontal[iC + 2];
             dfAccumulatorLocal2 +=
-                pSrcBand[i + 3 + iSampJ] * padfWeight[iC + 3];
+                pSrcBand[i + 3 + iSampJ] * padfWeightsHorizontal[iC + 3];
         }
         dfAccumulatorLocal += dfAccumulatorLocal2;
         if (i < iMax)
         {
-            dfAccumulatorLocal += pSrcBand[i + iSampJ] * padfWeight[iC];
-            dfAccumulatorLocal += pSrcBand[i + 1 + iSampJ] * padfWeight[iC + 1];
+            dfAccumulatorLocal +=
+                pSrcBand[i + iSampJ] * padfWeightsHorizontal[iC];
+            dfAccumulatorLocal +=
+                pSrcBand[i + 1 + iSampJ] * padfWeightsHorizontal[iC + 1];
             i += 2;
             iC += 2;
         }
         if (i == iMax)
         {
-            dfAccumulatorLocal += pSrcBand[i + iSampJ] * padfWeight[iC];
+            dfAccumulatorLocal +=
+                pSrcBand[i + iSampJ] * padfWeightsHorizontal[iC];
         }
 
-        // Calculate the Y weight.
-        const double dfWeight = pfnGetWeight((j - dfDeltaY) * dfYScale);
-        dfAccumulator += dfWeight * dfAccumulatorLocal;
-        dfAccumulatorWeightVertical += dfWeight;
+        dfAccumulator += padfWeightsVertical[jC] * dfAccumulatorLocal;
     }
 
-    const double dfAccumulatorWeight =
-        dfAccumulatorWeightHorizontal * dfAccumulatorWeightVertical;
-
-    *pValue = GWKClampValueT<T>(dfAccumulator / dfAccumulatorWeight);
+    *pValue = GWKClampValueT<T>(dfAccumulator * dfInvWeights);
 
     return true;
 }
@@ -4236,7 +4278,9 @@ static bool GWKResampleNoMasksT(const GDALWarpKernel *poWK, int iBand,
 template <class T>
 static bool GWKResampleNoMasks_SSE2_T(const GDALWarpKernel *poWK, int iBand,
                                       double dfSrcX, double dfSrcY, T *pValue,
-                                      double *padfWeight)
+                                      double *padfWeightsHorizontal,
+                                      double *padfWeightsVertical,
+                                      double &dfInvWeights)
 {
     // Commonly used; save locally.
     const int nSrcXSize = poWK->nSrcXSize;
@@ -4258,60 +4302,42 @@ static bool GWKResampleNoMasks_SSE2_T(const GDALWarpKernel *poWK, int iBand,
     const T *pSrcBand =
         reinterpret_cast<const T *>(poWK->papabySrcImage[iBand]);
 
-    const FilterFuncType pfnGetWeight = apfGWKFilter[poWK->eResample];
-    CPLAssert(pfnGetWeight);
-    const FilterFunc4ValuesType pfnGetWeight4Values =
-        apfGWKFilter4Values[poWK->eResample];
-    CPLAssert(pfnGetWeight4Values);
-
     const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
     const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
     const double dfXScale = std::min(poWK->dfXScale, 1.0);
     const double dfYScale = std::min(poWK->dfYScale, 1.0);
 
-    // Loop over all rows in the kernel.
-    double dfAccumulatorWeightHorizontal = 0.0;
-    double dfAccumulatorWeightVertical = 0.0;
-    double dfAccumulator = 0.0;
-
     int iMin = 1 - nXRadius;
     if (iSrcX + iMin < 0)
         iMin = -iSrcX;
     int iMax = nXRadius;
     if (iSrcX + iMax >= nSrcXSize - 1)
         iMax = nSrcXSize - 1 - iSrcX;
-    int i, iC;
-    for (iC = 0, i = iMin; i + 2 < iMax; i += 4, iC += 4)
-    {
-        padfWeight[iC] = (i - dfDeltaX) * dfXScale;
-        padfWeight[iC + 1] = padfWeight[iC] + dfXScale;
-        padfWeight[iC + 2] = padfWeight[iC + 1] + dfXScale;
-        padfWeight[iC + 3] = padfWeight[iC + 2] + dfXScale;
-        dfAccumulatorWeightHorizontal += pfnGetWeight4Values(padfWeight + iC);
-    }
-    for (; i <= iMax; ++i, ++iC)
-    {
-        double dfWeight = pfnGetWeight((i - dfDeltaX) * dfXScale);
-        padfWeight[iC] = dfWeight;
-        dfAccumulatorWeightHorizontal += dfWeight;
-    }
 
-    int j = 1 - nYRadius;
-    if (iSrcY + j < 0)
-        j = -iSrcY;
+    int jMin = 1 - nYRadius;
+    if (iSrcY + jMin < 0)
+        jMin = -iSrcY;
     int jMax = nYRadius;
     if (iSrcY + jMax >= nSrcYSize - 1)
         jMax = nSrcYSize - 1 - iSrcY;
 
-    // Process by chunk of 4 rows.
-    for (; j + 2 < jMax; j += 4)
+    if (iBand == 0)
     {
-        const GPtrDiff_t iSampJ =
-            iSrcOffset + static_cast<GPtrDiff_t>(j) * nSrcXSize;
+        GWKComputeWeights(poWK->eResample, iMin, iMax, dfDeltaX, dfXScale, jMin,
+                          jMax, dfDeltaY, dfYScale, padfWeightsHorizontal,
+                          padfWeightsVertical, dfInvWeights);
+    }
 
+    GPtrDiff_t iSampJ = iSrcOffset + static_cast<GPtrDiff_t>(jMin) * nSrcXSize;
+    // Process by chunk of 4 rows.
+    int jC = 0;
+    int j = jMin;
+    double dfAccumulator = 0.0;
+    for (; j + 2 < jMax; j += 4, iSampJ += 4 * nSrcXSize, jC += 4)
+    {
         // Loop over all pixels in the row.
-        iC = 0;
-        i = iMin;
+        int iC = 0;
+        int i = iMin;
         // Process by chunk of 4 cols.
         XMMReg4Double v_acc_1 = XMMReg4Double::Zero();
         XMMReg4Double v_acc_2 = XMMReg4Double::Zero();
@@ -4330,7 +4356,7 @@ static bool GWKResampleNoMasks_SSE2_T(const GDALWarpKernel *poWK, int iBand,
                 XMMReg4Double::Load4Val(pSrcBand + i + iSampJ + 3 * nSrcXSize);
 
             XMMReg4Double v_padfWeight =
-                XMMReg4Double::Load4Val(padfWeight + iC);
+                XMMReg4Double::Load4Val(padfWeightsHorizontal + iC);
 
             v_acc_1 += v_pixels_1 * v_padfWeight;
             v_acc_2 += v_pixels_2 * v_padfWeight;
@@ -4350,7 +4376,7 @@ static bool GWKResampleNoMasks_SSE2_T(const GDALWarpKernel *poWK, int iBand,
                 XMMReg2Double::Load2Val(pSrcBand + i + iSampJ + 3 * nSrcXSize);
 
             XMMReg2Double v_padfWeight =
-                XMMReg2Double::Load2Val(padfWeight + iC);
+                XMMReg2Double::Load2Val(padfWeightsHorizontal + iC);
 
             v_acc_1.AddToLow(v_pixels_1 * v_padfWeight);
             v_acc_2.AddToLow(v_pixels_2 * v_padfWeight);
@@ -4368,40 +4394,29 @@ static bool GWKResampleNoMasks_SSE2_T(const GDALWarpKernel *poWK, int iBand,
 
         if (i == iMax)
         {
-            dfAccumulatorLocal_1 +=
-                static_cast<double>(pSrcBand[i + iSampJ]) * padfWeight[iC];
+            dfAccumulatorLocal_1 += static_cast<double>(pSrcBand[i + iSampJ]) *
+                                    padfWeightsHorizontal[iC];
             dfAccumulatorLocal_2 +=
                 static_cast<double>(pSrcBand[i + iSampJ + nSrcXSize]) *
-                padfWeight[iC];
+                padfWeightsHorizontal[iC];
             dfAccumulatorLocal_3 +=
                 static_cast<double>(pSrcBand[i + iSampJ + 2 * nSrcXSize]) *
-                padfWeight[iC];
+                padfWeightsHorizontal[iC];
             dfAccumulatorLocal_4 +=
                 static_cast<double>(pSrcBand[i + iSampJ + 3 * nSrcXSize]) *
-                padfWeight[iC];
+                padfWeightsHorizontal[iC];
         }
 
-        // Calculate the Y weight.
-        const double dfWeight0 = (j - dfDeltaY) * dfYScale;
-        const double dfWeight1 = dfWeight0 + dfYScale;
-        const double dfWeight2 = dfWeight1 + dfYScale;
-        const double dfWeight3 = dfWeight2 + dfYScale;
-        double adfWeight[4] = {dfWeight0, dfWeight1, dfWeight2, dfWeight3};
-
-        dfAccumulatorWeightVertical += pfnGetWeight4Values(adfWeight);
-        dfAccumulator += adfWeight[0] * dfAccumulatorLocal_1;
-        dfAccumulator += adfWeight[1] * dfAccumulatorLocal_2;
-        dfAccumulator += adfWeight[2] * dfAccumulatorLocal_3;
-        dfAccumulator += adfWeight[3] * dfAccumulatorLocal_4;
+        dfAccumulator += padfWeightsVertical[jC] * dfAccumulatorLocal_1;
+        dfAccumulator += padfWeightsVertical[jC + 1] * dfAccumulatorLocal_2;
+        dfAccumulator += padfWeightsVertical[jC + 2] * dfAccumulatorLocal_3;
+        dfAccumulator += padfWeightsVertical[jC + 3] * dfAccumulatorLocal_4;
     }
-    for (; j <= jMax; ++j)
+    for (; j <= jMax; ++j, iSampJ += nSrcXSize, ++jC)
     {
-        const GPtrDiff_t iSampJ =
-            iSrcOffset + static_cast<GPtrDiff_t>(j) * nSrcXSize;
-
         // Loop over all pixels in the row.
-        iC = 0;
-        i = iMin;
+        int iC = 0;
+        int i = iMin;
         // Process by chunk of 4 cols.
         XMMReg4Double v_acc = XMMReg4Double::Zero();
         for (; i + 2 < iMax; i += 4, iC += 4)
@@ -4410,7 +4425,7 @@ static bool GWKResampleNoMasks_SSE2_T(const GDALWarpKernel *poWK, int iBand,
             XMMReg4Double v_pixels =
                 XMMReg4Double::Load4Val(pSrcBand + i + iSampJ);
             XMMReg4Double v_padfWeight =
-                XMMReg4Double::Load4Val(padfWeight + iC);
+                XMMReg4Double::Load4Val(padfWeightsHorizontal + iC);
 
             v_acc += v_pixels * v_padfWeight;
         }
@@ -4419,27 +4434,23 @@ static bool GWKResampleNoMasks_SSE2_T(const GDALWarpKernel *poWK, int iBand,
 
         if (i < iMax)
         {
-            dfAccumulatorLocal += pSrcBand[i + iSampJ] * padfWeight[iC];
-            dfAccumulatorLocal += pSrcBand[i + 1 + iSampJ] * padfWeight[iC + 1];
+            dfAccumulatorLocal +=
+                pSrcBand[i + iSampJ] * padfWeightsHorizontal[iC];
+            dfAccumulatorLocal +=
+                pSrcBand[i + 1 + iSampJ] * padfWeightsHorizontal[iC + 1];
             i += 2;
             iC += 2;
         }
         if (i == iMax)
         {
-            dfAccumulatorLocal +=
-                static_cast<double>(pSrcBand[i + iSampJ]) * padfWeight[iC];
+            dfAccumulatorLocal += static_cast<double>(pSrcBand[i + iSampJ]) *
+                                  padfWeightsHorizontal[iC];
         }
 
-        // Calculate the Y weight.
-        double dfWeight = pfnGetWeight((j - dfDeltaY) * dfYScale);
-        dfAccumulator += dfWeight * dfAccumulatorLocal;
-        dfAccumulatorWeightVertical += dfWeight;
+        dfAccumulator += padfWeightsVertical[jC] * dfAccumulatorLocal;
     }
 
-    const double dfAccumulatorWeight =
-        dfAccumulatorWeightHorizontal * dfAccumulatorWeightVertical;
-
-    *pValue = GWKClampValueT<T>(dfAccumulator / dfAccumulatorWeight);
+    *pValue = GWKClampValueT<T>(dfAccumulator * dfInvWeights);
 
     return true;
 }
@@ -4451,10 +4462,13 @@ static bool GWKResampleNoMasks_SSE2_T(const GDALWarpKernel *poWK, int iBand,
 template <>
 bool GWKResampleNoMasksT<GByte>(const GDALWarpKernel *poWK, int iBand,
                                 double dfSrcX, double dfSrcY, GByte *pValue,
-                                double *padfWeight)
+                                double *padfWeightsHorizontal,
+                                double *padfWeightsVertical,
+                                double &dfInvWeights)
 {
     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY, pValue,
-                                     padfWeight);
+                                     padfWeightsHorizontal, padfWeightsVertical,
+                                     dfInvWeights);
 }
 
 /************************************************************************/
@@ -4464,10 +4478,13 @@ bool GWKResampleNoMasksT<GByte>(const GDALWarpKernel *poWK, int iBand,
 template <>
 bool GWKResampleNoMasksT<GInt16>(const GDALWarpKernel *poWK, int iBand,
                                  double dfSrcX, double dfSrcY, GInt16 *pValue,
-                                 double *padfWeight)
+                                 double *padfWeightsHorizontal,
+                                 double *padfWeightsVertical,
+                                 double &dfInvWeights)
 {
     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY, pValue,
-                                     padfWeight);
+                                     padfWeightsHorizontal, padfWeightsVertical,
+                                     dfInvWeights);
 }
 
 /************************************************************************/
@@ -4477,10 +4494,13 @@ bool GWKResampleNoMasksT<GInt16>(const GDALWarpKernel *poWK, int iBand,
 template <>
 bool GWKResampleNoMasksT<GUInt16>(const GDALWarpKernel *poWK, int iBand,
                                   double dfSrcX, double dfSrcY, GUInt16 *pValue,
-                                  double *padfWeight)
+                                  double *padfWeightsHorizontal,
+                                  double *padfWeightsVertical,
+                                  double &dfInvWeights)
 {
     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY, pValue,
-                                     padfWeight);
+                                     padfWeightsHorizontal, padfWeightsVertical,
+                                     dfInvWeights);
 }
 
 /************************************************************************/
@@ -4490,10 +4510,13 @@ bool GWKResampleNoMasksT<GUInt16>(const GDALWarpKernel *poWK, int iBand,
 template <>
 bool GWKResampleNoMasksT<float>(const GDALWarpKernel *poWK, int iBand,
                                 double dfSrcX, double dfSrcY, float *pValue,
-                                double *padfWeight)
+                                double *padfWeightsHorizontal,
+                                double *padfWeightsVertical,
+                                double &dfInvWeights)
 {
     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY, pValue,
-                                     padfWeight);
+                                     padfWeightsHorizontal, padfWeightsVertical,
+                                     dfInvWeights);
 }
 
 #ifdef INSTANTIATE_FLOAT64_SSE2_IMPL
@@ -4505,10 +4528,13 @@ bool GWKResampleNoMasksT<float>(const GDALWarpKernel *poWK, int iBand,
 template <>
 bool GWKResampleNoMasksT<double>(const GDALWarpKernel *poWK, int iBand,
                                  double dfSrcX, double dfSrcY, double *pValue,
-                                 double *padfWeight)
+                                 double *padfWeightsHorizontal,
+                                 double *padfWeightsVertical,
+                                 double &dfInvWeights)
 {
     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY, pValue,
-                                     padfWeight);
+                                     padfWeightsHorizontal, padfWeightsVertical,
+                                     dfInvWeights);
 }
 
 #endif /* INSTANTIATE_FLOAT64_SSE2_IMPL */
@@ -5865,8 +5891,10 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
     int *pabSuccess = static_cast<int *>(CPLMalloc(sizeof(int) * nDstXSize));
 
     const int nXRadius = poWK->nXRadius;
-    double *padfWeight =
+    double *padfWeightsX =
         static_cast<double *>(CPLCalloc(1 + nXRadius * 2, sizeof(double)));
+    double *padfWeightsY = static_cast<double *>(
+        CPLCalloc(1 + poWK->nYRadius * 2, sizeof(double)));
     const double dfSrcCoordPrecision = CPLAtof(CSLFetchNameValueDef(
         poWK->papszWarpOptions, "SRC_COORD_PRECISION", "0"));
     const double dfErrorThreshold = CPLAtof(
@@ -5929,6 +5957,7 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
             const GPtrDiff_t iDstOffset =
                 iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;
 
+            [[maybe_unused]] double dfInvWeights = 0;
             for (int iBand = 0; iBand < poWK->nBands; iBand++)
             {
                 T value = 0;
@@ -5952,7 +5981,8 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
                 {
                     GWKResampleNoMasksT(
                         poWK, iBand, padfX[iDstX] - poWK->nSrcXOff,
-                        padfY[iDstX] - poWK->nSrcYOff, &value, padfWeight);
+                        padfY[iDstX] - poWK->nSrcYOff, &value, padfWeightsX,
+                        padfWeightsY, dfInvWeights);
                 }
 
                 if (poWK->bApplyVerticalShift)
@@ -5990,7 +6020,8 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
     CPLFree(padfY);
     CPLFree(padfZ);
     CPLFree(pabSuccess);
-    CPLFree(padfWeight);
+    CPLFree(padfWeightsX);
+    CPLFree(padfWeightsY);
 }
 
 template <class T, GDALResampleAlg eResample>

From 461e1ad7c85ebf6837cf4b4af0f833dc2b831f7d Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sat, 19 Oct 2024 23:38:43 +0200
Subject: [PATCH 2/2] Warper: improve performance of Byte/UInt16 multi-band
 bicubic warping with XSCALE=1 and YSCALE=1 on SSE2 / x86_64

On extracted2.tif dataset from
https://github.com/OSGeo/gdal/issues/11042#issuecomment-2420556574 (12500 x 10000 pixels, 3 bands, Byte) ``gdalwarp -s_srs EPSG:8353 -t_srs EPSG:3857 -r cubic extracted2.tif tmp.tif -overwrite -wo XSCALE=1 -wo YSCALE=1`` goes from 19.0 seconds to 15.7.
---
 alg/gdalwarpkernel.cpp                  | 202 +++++++++++++++++++++---
 autotest/utilities/test_gdalwarp_lib.py |  77 +++++++++
 2 files changed, 254 insertions(+), 25 deletions(-)

diff --git a/alg/gdalwarpkernel.cpp b/alg/gdalwarpkernel.cpp
index 9053cb24cddf..f6c10a08df84 100644
--- a/alg/gdalwarpkernel.cpp
+++ b/alg/gdalwarpkernel.cpp
@@ -2849,11 +2849,14 @@ static bool GWKBilinearResampleNoMasks4SampleT(const GDALWarpKernel *poWK,
 // or http://en.wikipedia.org/wiki/Cubic_Hermite_spline : CINTx(p_1,p0,p1,p2)
 // http://en.wikipedia.org/wiki/Bicubic_interpolation: matrix notation
 
-// TODO(schwehr): Use an inline function.
-#define CubicConvolution(distance1, distance2, distance3, f0, f1, f2, f3)      \
-    (f1 + 0.5 * (distance1 * (f2 - f0) +                                       \
-                 distance2 * (2.0 * f0 - 5.0 * f1 + 4.0 * f2 - f3) +           \
-                 distance3 * (3.0 * (f1 - f2) + f3 - f0)))
+template <typename T>
+static inline T CubicConvolution(T distance1, T distance2, T distance3, T f0,
+                                 T f1, T f2, T f3)
+{
+    return (f1 + T(0.5) * (distance1 * (f2 - f0) +
+                           distance2 * (2 * f0 - 5 * f1 + 4 * f2 - f3) +
+                           distance3 * (3 * (f1 - f2) + f3 - f0)));
+}
 
 /************************************************************************/
 /*                       GWKCubicComputeWeights()                       */
@@ -2861,19 +2864,18 @@ static bool GWKBilinearResampleNoMasks4SampleT(const GDALWarpKernel *poWK,
 
 // adfCoeffs[2] = 1.0 - (adfCoeffs[0] + adfCoeffs[1] - adfCoeffs[3]);
 
-// TODO(schwehr): Use an inline function.
-#define GWKCubicComputeWeights(dfX_, adfCoeffs)                                \
-    {                                                                          \
-        const double dfX = dfX_;                                               \
-        const double dfHalfX = 0.5 * dfX;                                      \
-        const double dfThreeX = 3.0 * dfX;                                     \
-        const double dfHalfX2 = dfHalfX * dfX;                                 \
-                                                                               \
-        adfCoeffs[0] = dfHalfX * (-1 + dfX * (2 - dfX));                       \
-        adfCoeffs[1] = 1 + dfHalfX2 * (-5 + dfThreeX);                         \
-        adfCoeffs[2] = dfHalfX * (1 + dfX * (4 - dfThreeX));                   \
-        adfCoeffs[3] = dfHalfX2 * (-1 + dfX);                                  \
-    }
+template <typename T>
+static inline void GWKCubicComputeWeights(T x, T coeffs[4])
+{
+    const T halfX = T(0.5) * x;
+    const T threeX = T(3.0) * x;
+    const T halfX2 = halfX * x;
+
+    coeffs[0] = halfX * (-1 + x * (2 - x));
+    coeffs[1] = 1 + halfX2 * (-5 + threeX);
+    coeffs[2] = halfX * (1 + x * (4 - threeX));
+    coeffs[3] = halfX2 * (-1 + x);
+}
 
 // TODO(schwehr): Use an inline function.
 #define CONVOL4(v1, v2)                                                        \
@@ -2969,10 +2971,7 @@ static bool GWKCubicResample4Sample(const GDALWarpKernel *poWK, int iBand,
     return true;
 }
 
-// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero
-// perf benefit.
-
-#if defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
+#if defined(__x86_64) || defined(_M_X64)
 
 /************************************************************************/
 /*                           XMMLoad4Values()                           */
@@ -2985,7 +2984,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
 {
     unsigned int i;
     memcpy(&i, ptr, 4);
-    __m128i xmm_i = _mm_cvtsi32_si128(s);
+    __m128i xmm_i = _mm_cvtsi32_si128(i);
     // Zero extend 4 packed unsigned 8-bit integers in a to packed
     // 32-bit integers.
 #if __SSE4_1__
@@ -3001,7 +3000,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
 {
     GUInt64 i;
     memcpy(&i, ptr, 8);
-    __m128i xmm_i = _mm_cvtsi64_si128(s);
+    __m128i xmm_i = _mm_cvtsi64_si128(i);
     // Zero extend 4 packed unsigned 16-bit integers in a to packed
     // 32-bit integers.
 #if __SSE4_1__
@@ -3038,7 +3037,7 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v)
 }
 #endif
 
-#endif  // defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
+#endif  // (defined(__x86_64) || defined(_M_X64))
 
 /************************************************************************/
 /*            GWKCubicResampleSrcMaskIsDensity4SampleRealT()            */
@@ -3046,6 +3045,8 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v)
 
 // Note: if USE_SSE_CUBIC_IMPL, only instantiate that for Byte and UInt16,
 // because there are a few assumptions above those types.
+// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero
+// perf benefit.
 
 template <class T>
 static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
@@ -5853,6 +5854,141 @@ static CPLErr GWKRealCase(GDALWarpKernel *poWK)
     return GWKRun(poWK, "GWKRealCase", GWKRealCaseThread);
 }
 
+/************************************************************************/
+/*                 GWKCubicResampleNoMasks4MultiBandT()                 */
+/************************************************************************/
+
+/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
+/* and enough SSE registries */
+#if defined(__x86_64) || defined(_M_X64)
+
+static inline float Convolute4x4(const __m128 row0, const __m128 row1,
+                                 const __m128 row2, const __m128 row3,
+                                 const __m128 weightsXY0,
+                                 const __m128 weightsXY1,
+                                 const __m128 weightsXY2,
+                                 const __m128 weightsXY3)
+{
+    return XMMHorizontalAdd(_mm_add_ps(
+        _mm_add_ps(_mm_mul_ps(row0, weightsXY0), _mm_mul_ps(row1, weightsXY1)),
+        _mm_add_ps(_mm_mul_ps(row2, weightsXY2),
+                   _mm_mul_ps(row3, weightsXY3))));
+}
+
+template <class T>
+static void GWKCubicResampleNoMasks4MultiBandT(const GDALWarpKernel *poWK,
+                                               double dfSrcX, double dfSrcY,
+                                               const GPtrDiff_t iDstOffset)
+{
+    const double dfSrcXShifted = dfSrcX - 0.5;
+    const int iSrcX = static_cast<int>(dfSrcXShifted);
+    const double dfSrcYShifted = dfSrcY - 0.5;
+    const int iSrcY = static_cast<int>(dfSrcYShifted);
+    const GPtrDiff_t iSrcOffset =
+        iSrcX + static_cast<GPtrDiff_t>(iSrcY) * poWK->nSrcXSize;
+
+    // Get the bilinear interpolation at the image borders.
+    if (iSrcX - 1 < 0 || iSrcX + 2 >= poWK->nSrcXSize || iSrcY - 1 < 0 ||
+        iSrcY + 2 >= poWK->nSrcYSize)
+    {
+        for (int iBand = 0; iBand < poWK->nBands; iBand++)
+        {
+            T value;
+            GWKBilinearResampleNoMasks4SampleT(poWK, iBand, dfSrcX, dfSrcY,
+                                               &value);
+            reinterpret_cast<T *>(poWK->papabyDstImage[iBand])[iDstOffset] =
+                value;
+        }
+    }
+    else
+    {
+        const float fDeltaX = static_cast<float>(dfSrcXShifted) - iSrcX;
+        const float fDeltaY = static_cast<float>(dfSrcYShifted) - iSrcY;
+
+        float afCoeffsX[4];
+        float afCoeffsY[4];
+        GWKCubicComputeWeights(fDeltaX, afCoeffsX);
+        GWKCubicComputeWeights(fDeltaY, afCoeffsY);
+        const auto weightsX = _mm_loadu_ps(afCoeffsX);
+        const auto weightsXY0 =
+            _mm_mul_ps(_mm_load1_ps(&afCoeffsY[0]), weightsX);
+        const auto weightsXY1 =
+            _mm_mul_ps(_mm_load1_ps(&afCoeffsY[1]), weightsX);
+        const auto weightsXY2 =
+            _mm_mul_ps(_mm_load1_ps(&afCoeffsY[2]), weightsX);
+        const auto weightsXY3 =
+            _mm_mul_ps(_mm_load1_ps(&afCoeffsY[3]), weightsX);
+
+        const GPtrDiff_t iOffset = iSrcOffset - poWK->nSrcXSize - 1;
+
+        int iBand = 0;
+        // Process 2 bands at a time
+        for (; iBand + 1 < poWK->nBands; iBand += 2)
+        {
+            const T *CPL_RESTRICT pBand0 =
+                reinterpret_cast<const T *>(poWK->papabySrcImage[iBand]);
+            const auto row0_0 = XMMLoad4Values(pBand0 + iOffset);
+            const auto row1_0 =
+                XMMLoad4Values(pBand0 + iOffset + poWK->nSrcXSize);
+            const auto row2_0 =
+                XMMLoad4Values(pBand0 + iOffset + 2 * poWK->nSrcXSize);
+            const auto row3_0 =
+                XMMLoad4Values(pBand0 + iOffset + 3 * poWK->nSrcXSize);
+
+            const T *CPL_RESTRICT pBand1 =
+                reinterpret_cast<const T *>(poWK->papabySrcImage[iBand + 1]);
+            const auto row0_1 = XMMLoad4Values(pBand1 + iOffset);
+            const auto row1_1 =
+                XMMLoad4Values(pBand1 + iOffset + poWK->nSrcXSize);
+            const auto row2_1 =
+                XMMLoad4Values(pBand1 + iOffset + 2 * poWK->nSrcXSize);
+            const auto row3_1 =
+                XMMLoad4Values(pBand1 + iOffset + 3 * poWK->nSrcXSize);
+
+            const float fValue_0 =
+                Convolute4x4(row0_0, row1_0, row2_0, row3_0, weightsXY0,
+                             weightsXY1, weightsXY2, weightsXY3);
+
+            const float fValue_1 =
+                Convolute4x4(row0_1, row1_1, row2_1, row3_1, weightsXY0,
+                             weightsXY1, weightsXY2, weightsXY3);
+
+            T *CPL_RESTRICT pDstBand0 =
+                reinterpret_cast<T *>(poWK->papabyDstImage[iBand]);
+            pDstBand0[iDstOffset] = GWKClampValueT<T>(fValue_0);
+
+            T *CPL_RESTRICT pDstBand1 =
+                reinterpret_cast<T *>(poWK->papabyDstImage[iBand + 1]);
+            pDstBand1[iDstOffset] = GWKClampValueT<T>(fValue_1);
+        }
+        if (iBand < poWK->nBands)
+        {
+            const T *CPL_RESTRICT pBand0 =
+                reinterpret_cast<const T *>(poWK->papabySrcImage[iBand]);
+            const auto row0 = XMMLoad4Values(pBand0 + iOffset);
+            const auto row1 =
+                XMMLoad4Values(pBand0 + iOffset + poWK->nSrcXSize);
+            const auto row2 =
+                XMMLoad4Values(pBand0 + iOffset + 2 * poWK->nSrcXSize);
+            const auto row3 =
+                XMMLoad4Values(pBand0 + iOffset + 3 * poWK->nSrcXSize);
+
+            const float fValue =
+                Convolute4x4(row0, row1, row2, row3, weightsXY0, weightsXY1,
+                             weightsXY2, weightsXY3);
+
+            T *CPL_RESTRICT pDstBand =
+                reinterpret_cast<T *>(poWK->papabyDstImage[iBand]);
+            pDstBand[iDstOffset] = GWKClampValueT<T>(fValue);
+        }
+    }
+
+    if (poWK->pafDstDensity)
+        poWK->pafDstDensity[iDstOffset] = 1.0f;
+}
+
+#endif  // defined(__x86_64) || defined(_M_X64)
+
 /************************************************************************/
 /*                GWKResampleNoMasksOrDstDensityOnlyThreadInternal()    */
 /************************************************************************/
@@ -5957,6 +6093,22 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
             const GPtrDiff_t iDstOffset =
                 iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;
 
+#if defined(__x86_64) || defined(_M_X64)
+            if constexpr (bUse4SamplesFormula && eResample == GRA_Cubic &&
+                          (std::is_same<T, GByte>::value ||
+                           std::is_same<T, GUInt16>::value))
+            {
+                if (poWK->nBands > 1 && !poWK->bApplyVerticalShift)
+                {
+                    GWKCubicResampleNoMasks4MultiBandT<T>(
+                        poWK, padfX[iDstX] - poWK->nSrcXOff,
+                        padfY[iDstX] - poWK->nSrcYOff, iDstOffset);
+
+                    continue;
+                }
+            }
+#endif  // defined(__x86_64) || defined(_M_X64)
+
             [[maybe_unused]] double dfInvWeights = 0;
             for (int iBand = 0; iBand < poWK->nBands; iBand++)
             {
diff --git a/autotest/utilities/test_gdalwarp_lib.py b/autotest/utilities/test_gdalwarp_lib.py
index 3cb97c04aee6..d89a24d80dcd 100755
--- a/autotest/utilities/test_gdalwarp_lib.py
+++ b/autotest/utilities/test_gdalwarp_lib.py
@@ -4330,3 +4330,80 @@ def test_gdalwarp_lib_src_is_geog_arc_second():
     assert out_ds.RasterXSize == 5464
     assert out_ds.RasterYSize == 5464
     assert out_ds.GetRasterBand(1).Checksum() == 31856
+
+
+###############################################################################
+# Test GWKCubicResampleNoMasks4MultiBandT<Byte>()
+
+
+def test_gdalwarp_lib_cubic_multiband_byte_4sample_optim():
+
+    src_ds = gdal.Open("../gdrivers/data/small_world.tif")
+
+    # RGB only
+    out_ds = gdal.Warp(
+        "",
+        src_ds,
+        options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic",
+    )
+    assert out_ds.RasterXSize == 21
+    assert out_ds.RasterYSize == 21
+    assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [
+        4785,
+        4689,
+        5007,
+    ]
+
+    # With dest alpha
+    out_ds = gdal.Warp(
+        "",
+        src_ds,
+        options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic -dstalpha",
+    )
+    assert out_ds.RasterXSize == 21
+    assert out_ds.RasterYSize == 21
+    assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [
+        4785,
+        4689,
+        5007,
+    ]
+    assert out_ds.GetRasterBand(4).ComputeRasterMinMax() == (255, 255)
+
+    # Test edge effects
+    # (slightly change the target resolution so that the nearest approximation
+    # doesn't kick in)
+    out_ds = gdal.Warp(
+        "",
+        src_ds,
+        options="-f MEM -r cubic -tr 0.9000001 0.9000001 -wo XSCALE=1 -wo YSCALE=1",
+    )
+    assert out_ds.RasterXSize == 400
+    assert out_ds.RasterYSize == 200
+    assert out_ds.ReadRaster() == src_ds.ReadRaster()
+
+
+###############################################################################
+# Test GWKCubicResampleNoMasks4MultiBandT<GUInt16>()
+
+
+def test_gdalwarp_lib_cubic_multiband_uint16_4sample_optim():
+
+    src_ds = gdal.Open("../gdrivers/data/small_world.tif")
+    src_ds = gdal.Translate(
+        "", src_ds, options="-f MEM -ot UInt16 -scale 0 255 0 65535"
+    )
+
+    # RGB only
+    out_ds = gdal.Warp(
+        "",
+        src_ds,
+        options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic",
+    )
+    out_ds = gdal.Translate("", out_ds, options="-f MEM -ot Byte -scale 0 65535 0 255")
+    assert out_ds.RasterXSize == 21
+    assert out_ds.RasterYSize == 21
+    assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [
+        4785,
+        4689,
+        5007,
+    ]