diff --git a/NOTICE.md b/NOTICE.md index 62341fc..31b571e 100644 --- a/NOTICE.md +++ b/NOTICE.md @@ -4,62 +4,26 @@ This repository incorporates material as listed below or described in the code. #### Component. -GauXC +torch scatter in src/skala/utils/scatter.py #### Open Source License/Copyright Notice. -GauXC Copyright (c) 2020, The Regents of the University of California, -through Lawrence Berkeley National Laboratory (subject to receipt of -any required approvals from the U.S. Dept. of Energy). All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -(1) Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -(2) Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -(3) Neither the name of the University of California, Lawrence Berkeley -National Laboratory, U.S. Dept. of Energy nor the names of its contributors -may be used to endorse or promote products derived from this software -without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -You are under no obligation whatsoever to provide any bug fixes, patches, -or upgrades to the features, functionality or performance of the source -code ("Enhancements") to anyone; however, if you choose to make your -Enhancements available either publicly, or directly to Lawrence Berkeley -National Laboratory, without imposing a separate written license agreement -for such Enhancements, then you hereby grant the following license: a -non-exclusive, royalty-free perpetual license to install, use, modify, -prepare derivative works, incorporate into other computer software, -distribute, and sublicense such enhancements or derivative works thereof, -in binary and source code form. - -#### Additional Attribution. 
- -Primary Developer and Maintainer: David Williams--Young - LBNL (dbwy at lbl dot gov) - -GauXC has received major contributions from the following developers (in no particular order): - -Thom Popovici (LBNL) - Optimized sn-K kernels for CPU and GPU architectures -Teri Lambros (UW) - Unrestricted (UKS) and Generalized (GKS) DFT -Daniel Mejia-Rodriguez (PNNL) - Meta-GGA DFT -We have also receieved significant support from industry collaborators: - -David Clark (NVIDIA) - Optimization of critical kernels for NVIDIA architectures -Damon McDougall (AMD) - Optimization of critical kernels for AMDGPU architectures +Copyright (c) 2020 Matthias Fey + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md index 24c4c2c..24375f4 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,10 @@ Learn more about Skala in our [ArXiv paper](https://arxiv.org/abs/2506.14665). This repository contains three main components: 1. The Python package `microsoft-skala`, which is also distributed [on PyPI](https://pypi.org/project/microsoft-skala/) and contains a Pytorch implementation of the Skala model, its hookups to quantum chemistry packages [PySCF](https://pyscf.org/) and [ASE](https://ase-lib.org/), and an independent client library for the Skala model served [in Azure AI Foundry](https://ai.azure.com/catalog/models/Skala). -2. A development version of the CPU/GPU C++ library for XC functionals [GauXC](https://github.com/wavefunction91/GauXC) with an add-on supporting Pytorch-based functionals like Skala. GauXC is part of the stack that serves Skala in Azure AI Foundry and can be used to integrate Skala into other third-party DFT codes. +2. A development version of the CPU/GPU C++ library for XC functionals [GauXC](https://github.com/wavefunction91/GauXC/tree/skala) with an add-on supporting Pytorch-based functionals like Skala. GauXC is part of the stack that serves Skala in Azure AI Foundry and can be used to integrate Skala into other third-party DFT codes. 3. An example of using Skala in C++ CPU applications through LibTorch, see [`examples/cpp/cpp_integration`](examples/cpp/cpp_integration). -All information below relates to the Python package, the development version of GauXC including its license and other information can be found in [`third_party/gauxc`](https://github.com/microsoft/skala/tree/main/third_party/gauxc). 
+All information below relates to the Python package; the development version of GauXC, including its license and other information, can be found in the [`skala` branch of the GauXC repository](https://github.com/wavefunction91/GauXC/tree/skala). ## Getting started diff --git a/docs/index.rst b/docs/index.rst index 4cfc352..950591d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -34,4 +34,4 @@ Please stay tuned for updates and new releases. :hidden: Skala preprint - Breaking bonds, breaking ground \ No newline at end of file + Breaking bonds, breaking ground diff --git a/src/skala/foundry/__init__.py b/src/skala/foundry/__init__.py index f43375c..4d4691f 100644 --- a/src/skala/foundry/__init__.py +++ b/src/skala/foundry/__init__.py @@ -1,2 +1,4 @@ +# SPDX-License-Identifier: MIT + from skala.foundry.client import SkalaFoundryClient # noqa: F401 from skala.foundry.schemas import SkalaConfig, SkalaInput, SkalaOutput # noqa: F401 diff --git a/src/skala/foundry/client.py b/src/skala/foundry/client.py index 63f7a55..0df7d8a 100644 --- a/src/skala/foundry/client.py +++ b/src/skala/foundry/client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: MIT + import json import logging import time diff --git a/src/skala/foundry/schemas.py b/src/skala/foundry/schemas.py index 8225f0a..d5414b3 100644 --- a/src/skala/foundry/schemas.py +++ b/src/skala/foundry/schemas.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: MIT + from typing import Literal, TypeAlias import numpy as np diff --git a/src/skala/functional/model.py b/src/skala/functional/model.py index c089137..72eb356 100644 --- a/src/skala/functional/model.py +++ b/src/skala/functional/model.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: MIT + import math import torch diff --git a/src/skala/pyscf/backend.py b/src/skala/pyscf/backend.py index 0cbc9d1..23987ef 100644 --- a/src/skala/pyscf/backend.py +++ b/src/skala/pyscf/backend.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: MIT + from typing import ( TYPE_CHECKING, TypeAlias, diff --git a/src/skala/utils/scatter.py b/src/skala/utils/scatter.py index b36fc08..b24b1b0 100644 --- a/src/skala/utils/scatter.py +++ b/src/skala/utils/scatter.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: MIT # A copy of useful code from torch scatter # https://github.com/rusty1s/pytorch_scatter/blob/96aa2e3587123ba4ef31820899d5e62141e9a4c2/torch_scatter/scatter.py diff --git a/third_party/gauxc/.github/workflows/build_and_test_compiler_zoo.yml b/third_party/gauxc/.github/workflows/build_and_test_compiler_zoo.yml deleted file mode 100644 index e28b353..0000000 --- a/third_party/gauxc/.github/workflows/build_and_test_compiler_zoo.yml +++ /dev/null @@ -1,273 +0,0 @@ -name: Build and Test - Compiler Zoo - -on: [pull_request, workflow_dispatch] - -env: - GH_ACTIONS_TOOLCHAIN: .github/workflows/toolchains/gh-actions.cmake - ENV_PREFIX_PATH: "/home/software/install/blis-lp64-sequential;/usr/local/libxc/exchcxx-patch" - -jobs: - release_build: - name: Release Build and Test - runs-on: ubuntu-latest - container: - image: dbwy/chemistry - strategy: - matrix: - compiler: - - {suite: gnu, version: 12} - - {suite: llvm, version: 19} - mpi_flag: [ON, OFF] - openmp_flag: [ON, OFF] - exclude: - - compiler: {suite: llvm, version: 19} - openmp_flag: ON - - steps: - - uses: actions/checkout@v4 - - - name: Install LLVM toolchain - if: ${{ matrix.compiler.suite == 'llvm' }} - shell: bash - run: | - set -euo pipefail - apt-get update - apt-get install -y wget gnupg lsb-release software-properties-common - wget https://apt.llvm.org/llvm.sh - chmod +x 
llvm.sh - ./llvm.sh ${{ matrix.compiler.version }} -y - - - name: Setup Compiler - shell: bash - run: $GITHUB_WORKSPACE/.github/workflows/scripts/compiler_setup.sh - ${{matrix.compiler.suite}} ${{matrix.compiler.version}} - - - name: Enable or Disable MPI - shell: bash - run: echo "set(GAUXC_ENABLE_MPI ${{matrix.mpi_flag}} CACHE BOOL \"\" FORCE)" >> - ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Enable or Disable OpenMP - shell: bash - run: echo "set(GAUXC_ENABLE_OPENMP ${{matrix.openmp_flag}} CACHE BOOL \"\" FORCE)" >> - ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Setup Build Type - shell: bash - run: echo "set(CMAKE_BUILD_TYPE Release CACHE BOOL \"\" FORCE)" >> - ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Configure CMake - shell: bash - run: cmake -S $GITHUB_WORKSPACE -B ${{runner.workspace}}/build - -DCMAKE_INSTALL_PREFIX=${{runner.workspace}}/install - -DCMAKE_PREFIX_PATH=${ENV_PREFIX_PATH} - -DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Build - shell: bash - run: cmake --build ${{runner.workspace}}/build -j2 - - - name: Test - shell: bash - run: cmake --build ${{runner.workspace}}/build --target test - - nvidia_build: - name: NVIDIA Build (No Test) - runs-on: ubuntu-latest - #needs: release_build - container: - image: dbwy/chemistry-gpu - #options: --gpus all - strategy: - matrix: - flags: [ {magma: OFF, cutlass: OFF}, {magma: ON, cutlass: OFF}, {magma: OFF, cutlass: ON} ] - - steps: - - uses: actions/checkout@v4 - - - name: Setup Build Type - shell: bash - run: echo "set(CMAKE_BUILD_TYPE Release CACHE BOOL \"\" FORCE)" >> ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Setup Enable CUDA - shell: bash - run: | - echo "set(GAUXC_ENABLE_CUDA ON CACHE BOOL \"\" FORCE)" >> ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - echo "set(CMAKE_CUDA_ARCHITECTURES 80 CACHE STRING \"\" FORCE)" >> ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - echo "set(GAUXC_ENABLE_MAGMA ${{matrix.flags.magma}} CACHE BOOL \"\" FORCE)" >> ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - echo "set(GAUXC_ENABLE_CUTLASS ${{matrix.flags.cutlass}} CACHE BOOL \"\" FORCE)" >> ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - echo "set(GAUXC_ENABLE_MPI OFF CACHE BOOL \"\" FORCE)" >> ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - echo "set(MAGMA_ROOT_DIR \"/usr/local/magma/2.6.2/install\" CACHE PATH \"\" FORCE)" >> ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Configure CMake - shell: bash - run: cmake -S $GITHUB_WORKSPACE -B ${{runner.workspace}}/build - -DCMAKE_INSTALL_PREFIX=${{runner.workspace}}/install - -DCMAKE_PREFIX_PATH=${ENV_PREFIX_PATH} - -DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Build - shell: bash - run: cmake --build ${{runner.workspace}}/build -j2 - - #- name: Test - # shell: bash - # run: cmake --build ${{runner.workspace}}/build --target test - - debug_build: - name: Debug Build and Test - runs-on: ubuntu-latest - container: - image: dbwy/chemistry - - steps: - - uses: actions/checkout@v4 - - - name: Setup Compiler - shell: bash - run: $GITHUB_WORKSPACE/.github/workflows/scripts/compiler_setup.sh gnu 12 - - - name: Setup Build Type - shell: bash - run: echo "set(CMAKE_BUILD_TYPE Debug CACHE BOOL \"\" FORCE)" >> ${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Configure CMake - shell: bash - run: cmake -S $GITHUB_WORKSPACE -B ${{runner.workspace}}/build - -DCMAKE_INSTALL_PREFIX=${{runner.workspace}}/install - -DCMAKE_PREFIX_PATH=${ENV_PREFIX_PATH} - 
-DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Build - shell: bash - run: cmake --build ${{runner.workspace}}/build -j2 - - - name: Test - shell: bash - run: cmake --build ${{runner.workspace}}/build --target test - - subproject_build: - name: Build as Subproject - needs: release_build - runs-on: ubuntu-latest - container: - image: dbwy/chemistry - - steps: - - uses: actions/checkout@v4 - - - name: Setup Compiler - shell: bash - run: $GITHUB_WORKSPACE/.github/workflows/scripts/compiler_setup.sh gnu 12 - - - name: CMake Subproject Configure - shell: bash - run: cmake -S $GITHUB_WORKSPACE/tests/cmake/subproject - -B ${{runner.workspace}}/cmake_subproject_build - -DGITHUB_REPOSITORY=$GITHUB_ACTOR/GauXC - -DGIT_REVISION=$GITHUB_HEAD_REF - -DFETCHCONTENT_SOURCE_DIR_GAUXC=$GITHUB_WORKSPACE - -DCMAKE_PREFIX_PATH=${ENV_PREFIX_PATH} - -DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: CMake Subproject Build - shell: bash - run: cmake --build ${{runner.workspace}}/cmake_subproject_build -j2 - - cmake_discovery: - name: CMake Discovery - needs: release_build - runs-on: ubuntu-latest - container: - image: dbwy/chemistry - - steps: - - uses: actions/checkout@v4 - - - name: Setup Compiler - shell: bash - run: $GITHUB_WORKSPACE/.github/workflows/scripts/compiler_setup.sh gnu 12 - - - name: Configure CMake - shell: bash - run: cmake -S $GITHUB_WORKSPACE -B ${{runner.workspace}}/build - -DCMAKE_INSTALL_PREFIX=${{runner.workspace}}/install - -DCMAKE_PREFIX_PATH=${ENV_PREFIX_PATH} - -DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: Build - shell: bash - run: cmake --build ${{runner.workspace}}/build -j2 - - - name: Install - shell: bash - run: cmake --build ${{runner.workspace}}/build --target install - - - name: CMake Discovery Configure - shell: bash - run: cmake -S $GITHUB_WORKSPACE/tests/cmake/discovery -B ${{runner.workspace}}/cmake_discovery_build - -DCMAKE_PREFIX_PATH="${{runner.workspace}}/install;${ENV_PREFIX_PATH}" - -DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/${GH_ACTIONS_TOOLCHAIN} - - - name: CMake Discovery Build - shell: bash - run: cmake --build ${{runner.workspace}}/cmake_discovery_build -j2 - - macos_build: - name: macOS Build and Test - runs-on: macos-14 - - steps: - - uses: actions/checkout@v4 - - - uses: mamba-org/setup-micromamba@v2 - with: - environment-name: gauxc - create-args: >- - python=3.11 - c-compiler - cxx-compiler - fortran-compiler - mpich - cmake - hdf5 - openblas - ccache - init-shell: bash - cache-environment: true - - - name: Setup ccache - shell: micromamba-shell {0} - run: | - ccache --set-config=max_size=2G - ccache --set-config=compression=true - echo "CMAKE_C_COMPILER_LAUNCHER=ccache" >> $GITHUB_ENV - echo "CMAKE_CXX_COMPILER_LAUNCHER=ccache" >> $GITHUB_ENV - - - name: Restore ccache - uses: actions/cache@v4 - with: - path: ~/.ccache - key: ccache-macos-${{ github.sha }} - restore-keys: | - ccache-macos- - - - name: Build - shell: micromamba-shell {0} - run: | - cmake -S . 
-B build \ - -DCMAKE_BUILD_TYPE=Release \ - -DGAUXC_ENABLE_MPI=ON \ - -DGAUXC_ENABLE_TESTS=ON \ - -DBUILD_TESTING=ON \ - -DCMAKE_POLICY_VERSION_MINIMUM=3.5 - cmake --build build -j3 - - - name: ccache statistics - shell: micromamba-shell {0} - run: ccache --show-stats - - - name: Test - shell: micromamba-shell {0} - run: ctest --test-dir build --output-on-failure diff --git a/third_party/gauxc/.github/workflows/scripts/compiler_setup.sh b/third_party/gauxc/.github/workflows/scripts/compiler_setup.sh deleted file mode 100755 index 467ed0f..0000000 --- a/third_party/gauxc/.github/workflows/scripts/compiler_setup.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -export CSUITE=$1 -export CVER=$2 - -if [[ "${CSUITE}" == "llvm" ]] -then - # register the specific clang version as an alternative (needed once per version) - update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${CVER} 50 - update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${CVER} 50 - update-alternatives --set clang /usr/bin/clang-${CVER} - update-alternatives --set clang++ /usr/bin/clang++-${CVER} - update-alternatives --install /usr/bin/cc cc /usr/bin/clang 30 - update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang++ 30 -elif [[ "${CSUITE}" == "gnu" ]] -then - update-alternatives --set gcc /usr/bin/gcc-${CVER} - update-alternatives --set g++ /usr/bin/g++-${CVER} - update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30 - update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30 -else - echo "Compiler Suite Not Recognized!" - exit 125 -fi - -echo "Selected compilers:" -echo " cc -> $(command -v cc) | $(cc --version | head -1)" -echo " c++ -> $(command -v c++) | $(c++ --version | head -1)" diff --git a/third_party/gauxc/.github/workflows/toolchains/gh-actions.cmake b/third_party/gauxc/.github/workflows/toolchains/gh-actions.cmake deleted file mode 100644 index 68607d8..0000000 --- a/third_party/gauxc/.github/workflows/toolchains/gh-actions.cmake +++ /dev/null @@ -1,5 +0,0 @@ -set( CMAKE_C_COMPILER cc ) -set( CMAKE_CXX_COMPILER c++ ) - -set(CMAKE_CXX_FLAGS_INIT "-march=native") -set(CMAKE_C_FLAGS_INIT "-march=native") diff --git a/third_party/gauxc/.gitignore b/third_party/gauxc/.gitignore deleted file mode 100644 index be531ab..0000000 --- a/third_party/gauxc/.gitignore +++ /dev/null @@ -1,14 +0,0 @@ -*pycache** -src/xc_integrator/local_work_driver/host/obara_saika/src/*.o -src/xc_integrator/local_work_driver/host/obara_saika/*.a -src/xc_integrator/local_work_driver/host/obara_saika/test/*.o -src/xc_integrator/local_work_driver/host/obara_saika/test/*.x -src/xc_integrator/local_work_driver/host/obara_saika/generator/integral* -src/xc_integrator/local_work_driver/host/obara_saika/generator/obara* -src/xc_integrator/local_work_driver/host/obara_saika/generator/*.x -*.swp - -# Build directories -build/ -_build/ -cmake-build-*/ diff --git a/third_party/gauxc/CMakeLists.txt b/third_party/gauxc/CMakeLists.txt deleted file mode 100644 index 0c148a9..0000000 --- a/third_party/gauxc/CMakeLists.txt +++ /dev/null @@ -1,132 +0,0 @@ -cmake_minimum_required( VERSION 3.20 FATAL_ERROR ) - -include(FetchContent) -set( FETCHCONTENT_UPDATES_DISCONNECTED ON CACHE BOOL "Disable FC Updates" ) - -project( GauXC VERSION 1.0.0 LANGUAGES C CXX ) - -# Place local modules in the path -list( PREPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) -list( PREPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules ) -include( gauxc-linalg-modules ) - -# Guard some options settings to only default when not a 
subproject -if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) - # Populate BUILD_TESTING prior to dependencies to avoid clash - include(CTest) - - # Default the built type - if( NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES ) - set( CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "Choose the type of build" FORCE ) - # Set the possible values of build type for cmake-gui - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS - "Debug" "Release" "MinSizeRel" "RelWithDebInfo") - endif() -endif() - - - -# GauXC Options -option( GAUXC_ENABLE_HOST "Enable Host Integrator" ON ) -option( GAUXC_ENABLE_CUDA "Enable CUDA Bindings" OFF ) -option( GAUXC_ENABLE_HIP "Enable HIP Bindings" OFF ) -option( GAUXC_ENABLE_MPI "Enable MPI Bindings" ON ) -option( GAUXC_ENABLE_OPENMP "Enable OpenMP Compilation" ON ) -option( GAUXC_ENABLE_TESTS "Enable Unit Tests" ON ) -option( GAUXC_ENABLE_GAU2GRID "Enable Gau2Grid Collocation" ON ) -option( GAUXC_ENABLE_HDF5 "Enable HDF5 Bindings" ON ) -option( GAUXC_ENABLE_ONEDFT "Enable ONEDFT Functional" ON ) -option( GAUXC_USE_FAST_RSQRT "Enable Fast RSQRT" OFF ) -option( GAUXC_BLAS_PREFER_ILP64 "Prefer ILP64 for host BLAS" OFF ) -option( GAUXC_LINK_CUDA_STATIC "Link GauXC with static CUDA libs" OFF ) - -include(CMakeDependentOption) -cmake_dependent_option( GAUXC_ENABLE_MAGMA - "Enable MAGMA Linear Algebra" ON - "GAUXC_ENABLE_CUDA OR GAUXC_ENABLE_HIP" OFF -) -cmake_dependent_option( GAUXC_ENABLE_NCCL - "Enable NCCL Collectives" OFF - "GAUXC_ENABLE_CUDA" OFF -) -cmake_dependent_option( GAUXC_ENABLE_CUTLASS - "Enable CUTLASS Linear Algebra" OFF - "GAUXC_ENABLE_CUDA" OFF -) - -# Default the feature variables -set( GAUXC_HAS_HOST FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_CUDA FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_HIP FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_MPI FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_OPENMP FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_GAU2GRID FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_HDF5 FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_MAGMA FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_NCCL FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_CUTLASS FALSE CACHE BOOL "" FORCE ) -set( GAUXC_HAS_ONEDFT FALSE CACHE BOOL "" FORCE ) -set( GAUXC_BLAS_IS_LP64 FALSE CACHE BOOL "" FORCE ) - -mark_as_advanced( FORCE - GAUXC_HAS_HOST - GAUXC_HAS_CUDA - GAUXC_HAS_HIP - GAUXC_HAS_MPI - GAUXC_HAS_OPENMP - GAUXC_HAS_GAU2GRID - GAUXC_HAS_HDF5 - GAUXC_HAS_MAGMA - GAUXC_HAS_NCCL - GAUXC_HAS_CUTLASS - GAUXC_HAS_ONEDFT - GAUXC_BLAS_IS_LP64 -) - - -if( NOT GAUXC_ENABLE_GAU2GRID ) - message( FATAL_ERROR "Gau2Grid is currently a required dependency which - will be made optional in a future release of GauXC [WIP]" ) -endif() - - -if( GAUXC_ENABLE_HOST ) - set(GAUXC_HAS_HOST TRUE CACHE BOOL "GauXC has Host Bindings" FORCE) -endif() - -if( GAUXC_ENABLE_CUDA ) - enable_language( CUDA ) - set( GAUXC_HAS_CUDA TRUE CACHE BOOL "GauXC has CUDA and will build CUDA bindings" FORCE ) -endif() - -if( GAUXC_ENABLE_HIP ) - enable_language( HIP ) - set( GAUXC_HAS_HIP TRUE CACHE BOOL "GauXC has HIP and will build HIP bindings" FORCE ) -endif() - -# Decided if we're compiling device bindings -if( GAUXC_HAS_CUDA OR GAUXC_HAS_HIP ) - set( GAUXC_HAS_DEVICE TRUE CACHE BOOL "Enable Device Code" ) -else() - set( GAUXC_HAS_DEVICE FALSE CACHE BOOL "Enable Device Code" ) -endif() - - - -if( NOT (${GAUXC_HAS_HOST} OR ${GAUXC_HAS_DEVICE}) ) - message( FATAL_ERROR "Neither Host nor Device Integrators have been enabled!" 
) -endif() - - -add_subdirectory( src ) - -if( CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND GAUXC_ENABLE_TESTS AND BUILD_TESTING ) - add_subdirectory( tests ) -endif() - -list(REMOVE_AT CMAKE_MODULE_PATH 0) -list(REMOVE_AT CMAKE_MODULE_PATH 0) - -if( linalg-cmake-modules_POPULATED ) - list(REMOVE_AT CMAKE_MODULE_PATH 0) -endif() diff --git a/third_party/gauxc/CODE_OF_CONDUCT.md b/third_party/gauxc/CODE_OF_CONDUCT.md deleted file mode 100644 index 686e5e7..0000000 --- a/third_party/gauxc/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,10 +0,0 @@ -# Microsoft Open Source Code of Conduct - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). - -Resources: - -- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) -- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) -- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns -- Employees can reach out at [aka.ms/opensource/moderation-support](https://aka.ms/opensource/moderation-support) diff --git a/third_party/gauxc/CONTRIBUTING.md b/third_party/gauxc/CONTRIBUTING.md deleted file mode 100644 index ebf23ac..0000000 --- a/third_party/gauxc/CONTRIBUTING.md +++ /dev/null @@ -1,14 +0,0 @@ -# Contributing - -This project welcomes contributions and suggestions. Most contributions require you to -agree to a Contributor License Agreement (CLA) declaring that you have the right to, -and actually do, grant us the rights to use your contribution. For details, visit -https://cla.microsoft.com. - -When you submit a pull request, a CLA-bot will automatically determine whether you need -to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the -instructions provided by the bot. You will only need to do this once across all repositories using our CLA. - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). -For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) -or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. diff --git a/third_party/gauxc/CONTRIBUTORS.md b/third_party/gauxc/CONTRIBUTORS.md deleted file mode 100644 index 689d4e6..0000000 --- a/third_party/gauxc/CONTRIBUTORS.md +++ /dev/null @@ -1,17 +0,0 @@ -# This is the list of GauXC's significant contributors. -# -# This does not necessarily list everyone who has contributed code. -# To see the full list of contributors, see the revision history in -# source control. - -Primary Developer and Maintainer: David Williams--Young - Microsoft (davidwillia at microsoft dot com) - -* Thom Popovici (LBNL) -* Teri Lambros (UW) -* Mikael Kovtun (UW) -* Daniel Mejia-Rodriguez (PNNL) - -* Yingrong Chen (Microsoft) -* Jiashu Liang (Microsoft) -* David Clark (NVIDIA) -* Damon McDougall (AMD) diff --git a/third_party/gauxc/LICENSE.txt b/third_party/gauxc/LICENSE.txt deleted file mode 100644 index f2904da..0000000 --- a/third_party/gauxc/LICENSE.txt +++ /dev/null @@ -1,46 +0,0 @@ -GauXC Copyright (c) 2020-2024, The Regents of the University of California, -through Lawrence Berkeley National Laboratory (subject to receipt of -any required approvals from the U.S. Dept. of Energy). - -(c) 2024-2025, Microsoft Corporation - -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -(1) Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -(2) Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -(3) Neither the name of the University of California, Lawrence Berkeley -National Laboratory, U.S. Dept. of Energy nor the names of its contributors -may be used to endorse or promote products derived from this software -without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -You are under no obligation whatsoever to provide any bug fixes, patches, -or upgrades to the features, functionality or performance of the source -code ("Enhancements") to anyone; however, if you choose to make your -Enhancements available either publicly, or directly to Lawrence Berkeley -National Laboratory, without imposing a separate written license agreement -for such Enhancements, then you hereby grant the following license: a -non-exclusive, royalty-free perpetual license to install, use, modify, -prepare derivative works, incorporate into other computer software, -distribute, and sublicense such enhancements or derivative works thereof, -in binary and source code form. diff --git a/third_party/gauxc/NOTICE.md b/third_party/gauxc/NOTICE.md deleted file mode 100644 index 4fcbf5d..0000000 --- a/third_party/gauxc/NOTICE.md +++ /dev/null @@ -1,38 +0,0 @@ -# NOTICES - -This repository incorporates material as listed below or described in the code. - -------------------------------------------------------------------------------- -gau2grid. - -BSD 3-Clause License - -Copyright (c) 2017, Daniel Smith -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- - diff --git a/third_party/gauxc/README.md b/third_party/gauxc/README.md index 082ac6c..1ebcc86 100644 --- a/third_party/gauxc/README.md +++ b/third_party/gauxc/README.md @@ -1,241 +1 @@ -# About - -GauXC - -Copyright (c) 2020-2024, The Regents of the University of California, -through Lawrence Berkeley National Laboratory (subject to receipt of -any required approvals from the U.S. Dept. of Energy). - -(c) 2024-2025, Microsoft Corporation - -All rights reserved. - - -NOTICE. This Software was developed under funding from the U.S. Department -of Energy and the U.S. Government consequently retains certain rights. As -such, the U.S. Government has been granted for itself and others acting on -its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the -Software to reproduce, distribute copies to the public, prepare derivative -works, and perform publicly and display publicly, and to permit others to do so. - -# Synopsis - -GauXC is a modern, modular C++ library for the evaluation of quantities related -to the exchange-correlation (XC) and exact-exchange (K) energy (e.g. potential, etc) in the Gaussian -basis set discretization of Kohn-Sham density function theory (KS-DFT). GauXC -provides efficient, scalable distributed memory XC and K integrators for both CPU and -accelerator-based (GPU) architectures. Currently, GPU support is provided through -the -[CUDA](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html) and -[HIP](https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP-GUIDE.html) -frameworks to target NVIDIA and AMD GPUs, respectively. -Evaluation -of the XC functional CPU/accelerator architectures is provided by the -[ExchCXX](https://github.com/wavefunction91/ExchCXX) library. Quadratures are generated -by the [IntegratorXX](https://github.com/wavefunction91/IntegratorXX) library. - -# Design Goals - -* Provide a stable, portable and high-performance implementation of numerical -integrators optimized for the evaluation of XC and K related quantities in Gaussian -basis set KS-DFT on CPU and accelerator based architectures. -* Develop a modern, modular, extensible C++ software infrastructure which allows -for flexible and agile development in the field of KS-DFT. 
- -# Dependencies - -* CMake (3.20+) -* BLAS (for CPU integrators) -* [ExchCXX](https://github.com/wavefunction91/ExchCXX) -* [IntegratorXX](https://github.com/wavefunction91/IntegratorXX) -* [Gau2Grid](https://github.com/dgasmith/gau2grid) (pregenerated source packaged with GauXC) -* MPI (Optional) -* OpenMP (CPU parallelism, Optional) -* [Cereal](https://github.com/USCiLab/cereal) (Optional) -* [HDF5](https://www.hdfgroup.org/solutions/hdf5/) (Optional) -* [Eigen3](https://eigen.tuxfamily.org/dox/) (Testing Only) -* [CUDA](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html)/[cuBLAS](https://docs.nvidia.com/cuda/cublas/index.html) (Required only if CUDA enabled) -* [HIP](https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP-GUIDE.html)/[ROCm](https://github.com/RadeonOpenCompute/ROCm) (Required only if HIP enabled) -* [MAGMA](https://icl.utk.edu/magma/) (Optional if CUDA/HIP enabled) - -# Major Contributors - -See CONTRIBUTORS.md for a list of major contributors to GauXC. - -# Publications - -## GauXC -Please cite the following publications if GauXC was used in your publication: -``` -% Relativistic integrals -@article{kovtun2024relativistic, - author = {Kovtun, Mikael and Lambros, Eleftherios and Liu, Aodong and Tang, Diandong and Williams--Young, David B. and Li, Xiaosong}, - title = {Accelerating Relativistic Exact-Two-Component Density Functional Theory Calculations with Graphical Processing Units}, - journal = {Journal of Chemical Theory and Computation}, - volume = {20}, - number = {18}, - pages = {7694--7699}, - year = {2024}, - doi = {10.1021/acs.jctc.4c00843}, -} - -% Distributed Memory Seminumerical Exact Exchange implementation -@article{williams2023distributed, - title = {Distributed memory, GPU accelerated Fock construction for hybrid, Gaussian basis density functional theory}, - author = {Williams--Young, David B. and Asadchev, Andrey and Popovici, Doru Thom and Clark, David and Waldrop, Jonathan and - Windus, Theresa L. and Valeev, Edward F. and de Jong, Wibe A.}, - journal = {The Journal of Chemical Physics}, - volume = {158}, - number = {23}, - pages = {234104}, - year = {2023}, - doi = {10.1063/5.0151070}, - url = {https://doi.org/10.1063/5.0151070} -} - -% Performance Portability (HIP/SYCL implementations) -@article{williams2021achieving, - title={Achieving performance portability in Gaussian basis set density functional - theory on accelerator based architectures in NWChemEx}, - author={Williams--Young, David B and Bagusetty, Abhishek and de Jong, Wibe A and - Doerfler, Douglas and van Dam, Hubertus JJ and V{\'a}zquez-Mayagoitia, {\'A}lvaro and - Windus, Theresa L and Yang, Chao}, - journal={Parallel Computing}, - volume={108}, - pages={102829}, - year={2021}, - doi={10.1016/j.parco.2021.102829}, - url={https://www.sciencedirect.com/science/article/pii/S0167819121000776?via%3Dihub} -} - -% CUDA and distributed memory implementation -@article{williams20on, - author={David B. Williams--Young and Wibe A. de Jong and Hubertus J.J. van Dam and - Chao Yang}, - title={On the Efficient Evaluation of the Exchange Correlation Potential on - Graphics Processing Unit Clusters}, - journal={Frontiers in Chemistry}, - volume={8}, - pages={581058}, - year={2020}, - doi={10.3389/fchem.2020.581058}, - url={https://www.frontiersin.org/articles/10.3389/fchem.2020.581058/abstract}, - preprint={https://arxiv.org/abs/2007.03143} -} - -% Algorithm for XC potential assembly and shared-memory CPU implementation -@article{petrone18an, - author={Alessio Petrone and David B. 
Williams--Young and Shichao Sun and - Torin F. Stetina and Xiaosong Li}, - title={An Efficient Implementation of Two-Component Relativistic Density - Functional Theory with Torque-Free Auxiliary Variables}, - journal={The European Physical Journal B}, - volume={91}, - number={169}, - pages={169}, - year={2018}, - doi={10.1140/epjb/e2018-90170-1}, - url={https://link.springer.com/article/10.1140/epjb/e2018-90170-1} -} -``` - -## Density functionals - -If GauXC was used for the evaluation of exchange-correlation related -quantities in your publication, we request that you also cite -[Libxc](https://libxc.gitlab.io/) which provides the underlying -implementation of the exchange-correlation functionals used in GauXC -via the [ExchCXX](https://github.com/wavefunction91/ExchCXX) library: - -``` -% Actual Implementations of the Density Functionals -@article{lehtola2018libxc, - author = {Lehtola, Susi and Steigemann, Conrad and Oliveira, Micael J. T. and Marques, Miguel A. L.}, - journal = {SoftwareX}, - title = {Recent developments in {LIBXC}---a comprehensive library of functionals for density functional theory}, - year = {2018}, - pages = {1--5}, - volume = {7}, - doi = {10.1016/j.softx.2017.11.002}, -} -``` - -# Build Instructions - -GauXC provides a CMake build system with automatic dependency management (through [FetchContent](https://cmake.org/cmake/help/latest/module/FetchContent.html)). -As such, a simple CMake invocation will often suffice for most purposes -``` -cmake -S /path/to/gauxc -B /path/to/build [GauXC configure options] -cmake --build /path/to/build -``` - - -GauXC is linkable both as an installed library as well as a CMake subproject via `FetchContent` -``` -# GauXC Discovery -find_package( gauxc REQUIRED ) -target_link_libraries( my_target PUBLIC gauxc::gauxc ) -``` - -``` -# GauXC as CMake Subproject -include(FetchContent) - -# Set GauXC CMake options (see below) - -# Pull master branch of GauXC -FetchContent_Declare( gauxc - GIT_REPOSITORY https://github/com/wavefunction91/GauXC.git - GIT_TAG master -) -FetchContent_MakeAvailable( gauxc ) - -# Link to target -target_link_libraries( my_target PUBLIC gauxc::gauxc ) -``` - - -## Influential CMake Variables - -| Variable Name | Description | Default | -|----------------------------|-----------------------------------------------------------|----------| -| `GAUXC_ENABLE_TESTS` | Enable Testing Framework (Catch2) | `ON` | -| `GAUXC_ENABLE_HOST` | Enable HOST integrators | `ON` | -| `GAUXC_ENABLE_CUDA` | Enable CUDA integrators | `OFF` | -| `GAUXC_ENABLE_HIP` | Enable HIP integrators | `OFF` | -| `GAUXC_ENABLE_MAGMA` | Enable MAGMA for batched BLAS (No effect if no GPU) | `ON` | -| `GAUXC_ENABLE_CUTLASS` | Enable CUTLASS for batched BLAS (No effect if no CUDA) | `OFF` | -| `GAUXC_ENABLE_NCCL` | Enable NCCL bindings for topology aware GPU reductions | `OFF` | -| `GAUXC_ENABLE_MPI` | Enable MPI Bindings | `ON` | -| `GAUXC_ENABLE_OPENMP` | Enable OpenMP Bindings | `ON` | -| `CMAKE_CUDA_ARCHITECTURES` | CUDA architechtures (e.g. 70 for Volta, 80 for Ampere) | -- | -| `BLAS_LIBRARIES` | Full BLAS linker. | -- | -| `MAGMA_ROOT_DIR` | Install prefix for MAGMA. | -- | - - - - -# Example Usage - -See `test/standalone_driver.cxx` for an example end-to-end invocation of GauXC for various integrands. - - -# License - -GauXC is made freely available under the terms of a modified 3-Clause BSD license. See -LICENSE.txt for details. 
- -# Acknowledgments - -The development of GauXC was previously supported by the Exascale Computing Project -(17-SC-20-SC), a collaborative effort of the U.S. Department of Energy Office -of Science and the National Nuclear Security Administration. - -##Trademarks - -This project may contain trademarks or logos for projects, products, or -services. Authorized use of Microsoft trademarks or logos is subject to and -must follow Microsoft’s Trademark & Brand Guidelines. Use of Microsoft -trademarks or logos in modified versions of this project must not cause -confusion or imply Microsoft sponsorship. Any use of third-party trademarks or -logos are subject to those third-party’s policies. +The development version of GauXC with added support for Skala was moved to the `skala` branch of the main GauXC repo at https://github.com/wavefunction91/GauXC/tree/skala. diff --git a/third_party/gauxc/SECURITY.md b/third_party/gauxc/SECURITY.md deleted file mode 100644 index 656f791..0000000 --- a/third_party/gauxc/SECURITY.md +++ /dev/null @@ -1,14 +0,0 @@ - - -## Security - -Microsoft takes the security of our software products and services seriously, which -includes all source code repositories in our GitHub organizations. - -**Please do not report security vulnerabilities through public GitHub issues.** - -For security reporting information, locations, contact information, and policies, -please review the latest guidance for Microsoft repositories at -[https://aka.ms/SECURITY.md](https://aka.ms/SECURITY.md). - - diff --git a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/impl.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/impl.hpp deleted file mode 100644 index f297dc9..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/impl.hpp +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include - -namespace GauXC { - -template -XCIntegrator::XCIntegrator( std::unique_ptr&& pimpl ) : - pimpl_( std::move( pimpl ) ) { } - - -template -XCIntegrator::~XCIntegrator() noexcept = default; - -template -XCIntegrator::XCIntegrator(XCIntegrator&&) noexcept = default; - - -template -typename XCIntegrator::exc_vxc_type - XCIntegrator::eval_exc_vxc( const MatrixType& P ) { - if( not pimpl_ ) throw std::runtime_error("Not Initialized"); - return pimpl_->eval_exc_vxc(P); -}; - -template -const util::Timer& XCIntegrator::get_timings() const { - if( not pimpl_ ) throw std::runtime_error("Not Initialized"); - - return pimpl_->get_timings(); -} -} diff --git a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/integrator_factory.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/integrator_factory.hpp deleted file mode 100644 index 18cf57a..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/integrator_factory.hpp +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -#include - -namespace GauXC { - -template -XCIntegrator - make_default_integrator( ExecutionSpace ex, Args&&... args ) { - - using value_type = typename XCIntegrator::value_type; - - if( ex == ExecutionSpace::Host ) { - - return XCIntegrator( - std::make_unique>( - detail::make_reference_host_integrator_impl( - detail::forward_as_shared_ptr(args)... - ) - ) - ); - - } else { - - return XCIntegrator( - std::make_unique>( - detail::make_incore_device_integrator_impl( - //detail::make_shellbatched_device_integrator_impl( - detail::forward_as_shared_ptr(args)... 
- ) - ) - ); - - } - -} - -} diff --git a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/impl.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/impl.hpp deleted file mode 100644 index 51ba869..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/impl.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once - -#include - -namespace GauXC { -namespace detail { - - -template -ReplicatedXCIntegrator:: - ReplicatedXCIntegrator( std::unique_ptr&& pimpl ) : - pimpl_(std::move(pimpl)){ } - -template -ReplicatedXCIntegrator::ReplicatedXCIntegrator(): - ReplicatedXCIntegrator(nullptr){ } - -template -ReplicatedXCIntegrator::~ReplicatedXCIntegrator() noexcept = default; -template -ReplicatedXCIntegrator:: - ReplicatedXCIntegrator(ReplicatedXCIntegrator&&) noexcept = default; - -template -const util::Timer& ReplicatedXCIntegrator::get_timings_() const { - if( not pimpl_ ) throw std::runtime_error( "Not Initialized" ); - return pimpl_->get_timings(); -} - - -template -typename ReplicatedXCIntegrator::exc_vxc_type - ReplicatedXCIntegrator::eval_exc_vxc_( const MatrixType& P ) { - - matrix_type VXC( P.rows(), P.cols() ); - value_type EXC; - - if( not pimpl_ ) throw std::runtime_error( "Not Initialized" ); - pimpl_->eval_exc_vxc( P.rows(), P.cols(), P.data(), P.rows(), - VXC.data(), VXC.rows(), &EXC ); - - return std::make_tuple( EXC, VXC ); - -} - -} -} diff --git a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/incore_xc_device_integrator.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/incore_xc_device_integrator.hpp deleted file mode 100644 index bc27141..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/incore_xc_device_integrator.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#pragma once -#include -#include -#include - -namespace GauXC { -namespace detail { - -#ifdef GAUXC_ENABLE_DEVICE -template -class IncoreXCDeviceIntegrator : public ReplicatedXCIntegratorImpl { - - using base_type = ReplicatedXCIntegratorImpl; - using value_type = typename base_type::value_type; - using basis_type = typename base_type::basis_type; - - XCIntegratorState state_; - - void eval_exc_vxc_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* VXC, int64_t ldvxc, - value_type* EXC ) override; - -public: - - template - IncoreXCDeviceIntegrator( Args&&... args ) : - base_type( std::forward(args)... ) { } - - IncoreXCDeviceIntegrator( const IncoreXCDeviceIntegrator& ); - IncoreXCDeviceIntegrator( IncoreXCDeviceIntegrator&& ) noexcept; - - ~IncoreXCDeviceIntegrator() noexcept; - -}; - -extern template class IncoreXCDeviceIntegrator; -#endif - - -template -std::unique_ptr< ReplicatedXCIntegratorImpl > - make_incore_device_integrator_impl( Args&&... args ) { - -#ifdef GAUXC_ENABLE_DEVICE - return std::make_unique>( - std::forward(args)... 
- ); -#else - std::string msg = std::string(__PRETTY_FUNCTION__) + - ": GAUXC_ENABLE_DEVICE = FALSE"; - throw std::runtime_error(msg.c_str()); - return nullptr; -#endif - -} - - -} -} diff --git a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/reference_xc_host_integrator.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/reference_xc_host_integrator.hpp deleted file mode 100644 index 46dbc75..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/reference_xc_host_integrator.hpp +++ /dev/null @@ -1,57 +0,0 @@ -#pragma once -#include -#include -#include - -namespace GauXC { -namespace detail { - -#ifdef GAUXC_ENABLE_HOST -template -class ReferenceXCHostIntegrator : public ReplicatedXCIntegratorImpl { - - using base_type = ReplicatedXCIntegratorImpl; - using value_type = typename base_type::value_type; - using basis_type = typename base_type::basis_type; - - XCIntegratorState state_; - - void eval_exc_vxc_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* VXC, int64_t ldvxc, - value_type* EXC ) override; - -public: - - template - ReferenceXCHostIntegrator( Args&&... args ) : - base_type( std::forward(args)... ) { } - - ReferenceXCHostIntegrator( const ReferenceXCHostIntegrator& ); - ReferenceXCHostIntegrator( ReferenceXCHostIntegrator&& ) noexcept; - - ~ReferenceXCHostIntegrator() noexcept; - -}; - -extern template class ReferenceXCHostIntegrator; -#endif - - -template -std::unique_ptr< ReplicatedXCIntegratorImpl > - make_reference_host_integrator_impl( Args&&... args ) { - -#ifdef GAUXC_ENABLE_HOST - return std::make_unique>( - std::forward(args)... - ); -#else - throw std::runtime_error(__PRETTY_FUNCTION__ ": GAUXC_ENABLE_HOST = FALSE"); - return nullptr; -#endif - -} - - -} -} diff --git a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/replicated_xc_integrator_impl.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/replicated_xc_integrator_impl.hpp deleted file mode 100644 index 0cdae2b..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/replicated_xc_integrator_impl.hpp +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace GauXC { -namespace detail { - -template -class ReplicatedXCIntegratorImpl { - -public: - - using value_type = ValueType; - using basis_type = BasisSet< value_type >; - -protected: - -#ifdef GAUXC_ENABLE_MPI - MPI_Comm comm_; -#endif - - std::shared_ptr< functional_type > func_; - std::shared_ptr< basis_type > basis_; - - std::shared_ptr< LoadBalancer > load_balancer_; - - util::Timer timer_; - - - virtual void eval_exc_vxc_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* VXC, int64_t ldvxc, - value_type* EXC ) = 0; -public: - -#ifdef GAUXC_ENABLE_MPI - - ReplicatedXCIntegratorImpl( MPI_Comm comm, - std::shared_ptr< functional_type > func, - std::shared_ptr< basis_type > basis, - std::shared_ptr< LoadBalancer > lb ); - -#else - - ReplicatedXCIntegratorImpl( std::shared_ptr< functional_type > func, - std::shared_ptr< basis_type > basis, - std::shared_ptr< LoadBalancer > lb ); - -#endif - - virtual ~ReplicatedXCIntegratorImpl() noexcept; - - void eval_exc_vxc( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* VXC, int64_t ldvxc, - value_type* EXC ); - - inline const util::Timer& get_timings() const { return timer_; } - -}; - - -extern template class ReplicatedXCIntegratorImpl; - -} -} diff --git 
a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/shellbatched_xc_device_integrator.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/shellbatched_xc_device_integrator.hpp deleted file mode 100644 index a17f8ff..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated/shellbatched_xc_device_integrator.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#pragma once -#include -#include -#include - -namespace GauXC { -namespace detail { - -#ifdef GAUXC_ENABLE_DEVICE -template -class ShellBatchedXCDeviceIntegrator : public ReplicatedXCIntegratorImpl { - - using base_type = ReplicatedXCIntegratorImpl; - using value_type = typename base_type::value_type; - using basis_type = typename base_type::basis_type; - - XCIntegratorState state_; - - void eval_exc_vxc_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* VXC, int64_t ldvxc, - value_type* EXC ) override; - -public: - - template - ShellBatchedXCDeviceIntegrator( Args&&... args ) : - base_type( std::forward(args)... ) { } - - ShellBatchedXCDeviceIntegrator( const ShellBatchedXCDeviceIntegrator& ); - ShellBatchedXCDeviceIntegrator( ShellBatchedXCDeviceIntegrator&& ) noexcept; - - ~ShellBatchedXCDeviceIntegrator() noexcept; - -}; - -extern template class ShellBatchedXCDeviceIntegrator; -#endif - - -template -std::unique_ptr< ReplicatedXCIntegratorImpl > - make_shellbatched_device_integrator_impl( Args&&... args ) { - -#ifdef GAUXC_ENABLE_DEVICE - return std::make_unique>( - std::forward(args)... - ); -#else - std::string msg = std::string(__PRETTY_FUNCTION__) + - ": GAUXC_ENABLE_DEVICE = FALSE"; - throw std::runtime_error(msg.c_str()); - return nullptr; -#endif - -} - - -} -} diff --git a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated_xc_integrator.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated_xc_integrator.hpp deleted file mode 100644 index 548227c..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/replicated_xc_integrator.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once - -#include - -namespace GauXC { -namespace detail { - -template -class ReplicatedXCIntegratorImpl; - -template -class ReplicatedXCIntegrator : public XCIntegratorImpl { - -public: - - using matrix_type = typename XCIntegratorImpl::matrix_type; - using value_type = typename XCIntegratorImpl::value_type; - using exc_vxc_type = typename XCIntegratorImpl::exc_vxc_type; - -private: - - using pimpl_type = ReplicatedXCIntegratorImpl; - std::unique_ptr< pimpl_type > pimpl_; - - exc_vxc_type eval_exc_vxc_( const MatrixType& ) override; - const util::Timer& get_timings_() const override; - -public: - - ReplicatedXCIntegrator(); - ReplicatedXCIntegrator( std::unique_ptr&& ); - - ~ReplicatedXCIntegrator() noexcept; - - ReplicatedXCIntegrator( const ReplicatedXCIntegrator& ) = delete; - ReplicatedXCIntegrator( ReplicatedXCIntegrator&& ) noexcept; - -}; - - -} -} diff --git a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/xc_integrator_impl.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/xc_integrator_impl.hpp deleted file mode 100644 index 09af24d..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/xc_integrator_impl.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once - -#include - -namespace GauXC { -namespace detail { - -template -class XCIntegratorImpl { - -public: - - using matrix_type = MatrixType; - using value_type = typename matrix_type::value_type; - using exc_vxc_type = typename XCIntegrator::exc_vxc_type; 
- -protected: - - virtual exc_vxc_type eval_exc_vxc_( const MatrixType& ) = 0; - virtual const util::Timer& get_timings_() const = 0; - -public: - - XCIntegratorImpl() = default; - XCIntegratorImpl( const XCIntegratorImpl& ) = default; - XCIntegratorImpl( XCIntegratorImpl&& ) noexcept = default; - virtual ~XCIntegratorImpl() noexcept = default; - - - exc_vxc_type eval_exc_vxc( const MatrixType& P ) { - return eval_exc_vxc_(P); - } - - const util::Timer& get_timings() const { - return get_timings_(); - } - -}; - -} -} diff --git a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/xc_integrator_state.hpp b/third_party/gauxc/attic/include/gauxc/new_xc_integrator/xc_integrator_state.hpp deleted file mode 100644 index 4bc2113..0000000 --- a/third_party/gauxc/attic/include/gauxc/new_xc_integrator/xc_integrator_state.hpp +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -namespace GauXC { - -struct XCIntegratorState { - bool load_balancer_populated = false; - bool modified_weights_are_stored = false; -}; - -} diff --git a/third_party/gauxc/attic/include/gauxc/util/forward_as_shared_ptr.hpp b/third_party/gauxc/attic/include/gauxc/util/forward_as_shared_ptr.hpp deleted file mode 100644 index 44959df..0000000 --- a/third_party/gauxc/attic/include/gauxc/util/forward_as_shared_ptr.hpp +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once -#include -#include - -namespace GauXC { -namespace detail { - -template -std::shared_ptr> forward_as_shared_ptr( const T& t ) { - return std::make_shared>( t ); -} - -//template -//std::shared_ptr> forward_as_shared_ptr( T& t ) { -// std::cout << "Resolving Ref Copy Forward" << std::endl; -// return std::make_shared>( t ); -//} -// -//template -//std::shared_ptr> forward_as_shared_ptr( T&& t ) { -// std::cout << "Resolving Move Forward" << std::endl; -// return std::make_shared>( std::move(t) ); -//} - -template -std::shared_ptr forward_as_shared_ptr( std::shared_ptr ptr ) { - return ptr; -} - -// Disable forward for MPI_Comm -#ifdef GAUXC_ENABLE_MPI -MPI_Comm forward_as_shared_ptr( MPI_Comm comm ) { - return comm; -} -#endif - -} -} diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/impl.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/impl.hpp deleted file mode 100644 index 10b7a4f..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/impl.hpp +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include - -namespace GauXC { - -template -XCIntegrator::XCIntegrator( std::unique_ptr&& pimpl ) : - pimpl_( std::move( pimpl ) ) { } - - -template -XCIntegrator::~XCIntegrator() noexcept = default; - -template -XCIntegrator::XCIntegrator(XCIntegrator&&) noexcept = default; - - -template -typename XCIntegrator::exc_vxc_type - XCIntegrator::eval_exc_vxc( const MatrixType& P ) { - if( not pimpl_ ) throw std::runtime_error("Not Initialized"); - - return pimpl_->eval_exc_vxc(P); -}; - -template -const util::Timer& XCIntegrator::get_timings() const { - if( not pimpl_ ) throw std::runtime_error("Not Initialized"); - - return pimpl_->get_timings(); -} -} diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/incore_xc_cuda_integrator.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/incore_xc_cuda_integrator.hpp deleted file mode 100644 index 3cdfa94..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/incore_xc_cuda_integrator.hpp +++ /dev/null @@ -1,210 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -#ifdef GAUXC_ENABLE_CUDA -namespace GauXC { -namespace detail { - -using namespace 
GauXC::integrator::cuda; - - -template -class IncoreXCCudaIntegrator : public XCIntegratorImpl { - - using base_type = XCIntegratorImpl; - using matrix_type = typename base_type::matrix_type; - using value_type = typename base_type::value_type; - using basisset_type = typename base_type::basisset_type; - using exc_vxc_type = typename base_type::exc_vxc_type; - - std::shared_ptr< XCCudaData< value_type > > cuda_data_; -#ifdef GAUXC_ENABLE_NCCL - std::unique_ptr nccl_comm_; -#endif - - exc_vxc_type eval_exc_vxc_( const MatrixType& ) override; - -public: - - template - IncoreXCCudaIntegrator( Args&&... args ) : - base_type( std::forward(args)... ) { -#ifdef GAUXC_ENABLE_NCCL - nccl_comm_ = std::make_unique< util::nccl_comm >( this->comm_ ); -#endif - } - - IncoreXCCudaIntegrator( const IncoreXCCudaIntegrator& ) = default; - IncoreXCCudaIntegrator( IncoreXCCudaIntegrator&& ) noexcept = default; - - ~IncoreXCCudaIntegrator() noexcept = default; - -}; - - - - -template -typename IncoreXCCudaIntegrator::exc_vxc_type - IncoreXCCudaIntegrator::eval_exc_vxc_( const MatrixType& P ) { - - -#ifdef GAUXC_ENABLE_MAGMA - // Initialize MAGMA - { - auto ierr = magma_init(); - GAUXC_MAGMA_ERROR( "MAGMA Init Failed", ierr ); - } -#endif - -#ifdef GAUXC_ENABLE_MPI - int32_t device_count, cur_device; - cudaGetDeviceCount( &device_count ); - cudaGetDevice( &cur_device ); - - int32_t world_rank, world_size; - MPI_Comm_rank( this->comm_, &world_rank ); - MPI_Comm_size( this->comm_, &world_size ); - - -/* XXX: Does not work on Summit - MPI_Comm node_comm; - MPI_Comm_split_type(this->comm_, MPI_COMM_TYPE_SHARED, 0, - MPI_INFO_NULL, &node_comm); - - int32_t node_rank, node_size; - MPI_Comm_rank( node_comm, &node_rank ); - MPI_Comm_size( node_comm, &node_size ); - - if( node_size > device_count ) - throw std::runtime_error("GauXC + CUDA Assumes MPI <-> GPU is 1-to-1"); - - cudaSetDevice( node_rank ); -*/ -#endif - - - size_t nbf = this->basis_->nbf(); - size_t nshells = this->basis_->size(); - - //// TODO: Check that P is sane - - - auto& tasks = this->load_balancer_->get_tasks(); - - //size_t max_npts = this->load_balancer_->max_npts(); - //size_t max_nbe = this->load_balancer_->max_nbe(); - //size_t max_npts_x_nbe = this->load_balancer_->max_npts_x_nbe(); - - size_t n_deriv = this->func_->is_gga() ? 
1 : 0; - - this->timer_.time_op("XCIntegrator.CUDAAlloc", [&](){ - - // Allocate Memory - cuda_data_ = std::make_shared>( ); - - // Partition out static memory segments for incore algorithm - cuda_data_->allocate_static_data( - this->load_balancer_->molecule().size(), - n_deriv, - nbf, - nshells - ); - - }); - - // Results - matrix_type VXC( nbf, nbf ); - value_type EXC, N_EL; - - this->timer_.time_op("XCIntegrator.LocalWork", [&](){ - - // Compute Local contributions to EXC / VXC - process_batches_cuda_replicated_density_incore_p< value_type>( - n_deriv, XCWeightAlg::SSF, *this->func_, *this->basis_, - this->load_balancer_->molecule(), this->load_balancer_->molmeta(), - *cuda_data_, tasks.begin(), tasks.end(), P.data(), - VXC.data(), &EXC, &N_EL - ); - - } ); - - // If we are not using NCCL then data transfer happens before reduction -#ifndef GAUXC_ENABLE_NCCL - this->timer_.time_op("XCIntegrator.CUDADtoHTransfer", [&](){ - device_transfer(*cuda_data_, VXC.data(), &EXC, &N_EL); - } ); -#endif - -#ifdef GAUXC_ENABLE_MPI - - if( world_size > 1 ) { - - this->timer_.time_op("XCIntegrator.AllReduce", [&]() { - -#ifdef GAUXC_ENABLE_NCCL - device_allreduce< value_type>(*nccl_comm_, *cuda_data_); -#else - // Test of communicator is an inter-communicator - // XXX: Can't think of a case when this would be true, but who knows... - int inter_flag; - MPI_Comm_test_inter( this->comm_, &inter_flag ); - - // Is Intra-communicator, Allreduce can be done inplace - if( not inter_flag ) { - - MPI_Allreduce( MPI_IN_PLACE, VXC.data(), nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, &EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - // Isn't Intra-communicator (weird), Allreduce can't be done inplace - } else { - - matrix_type VXC_cpy = VXC; - value_type EXC_cpy = EXC, N_EL_cpy = N_EL; - - MPI_Allreduce( VXC_cpy.data(), VXC.data(), nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( &EXC_cpy, &EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( &N_EL_cpy, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - } -#endif - } ); - - } - -#endif - - // If we are using NCCL then data transfer happens after reduction -#ifdef GAUXC_ENABLE_NCCL - this->timer_.time_op("XCIntegrator.CUDADtoHTransfer", [&](){ - device_transfer(*cuda_data_, VXC.data(), &EXC, &N_EL); - } ); -#endif - - - this->timer_.time_op("XCIntegrator.CUDAFree", [&](){ - cuda_data_.reset(); // Free up CUDA memory - } ); - -#ifdef GAUXC_ENABLE_MAGMA - // Finalize MAGMA - { - auto ierr = magma_finalize(); - GAUXC_MAGMA_ERROR( "MAGMA Finalize Failed", ierr ); - } -#endif - - return exc_vxc_type{EXC, std::move(VXC)}; - -} - -} -} -#endif diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/integrator_defaults.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/integrator_defaults.hpp deleted file mode 100644 index c6fcb76..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/integrator_defaults.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace GauXC { -namespace detail { - -template -using DefaultXCHostIntegrator = ReferenceXCHostIntegrator; - -template -//using DefaultXCCudaIntegrator = IncoreXCCudaIntegrator; -using DefaultXCCudaIntegrator = ShellBatchedXCCudaIntegrator; - - -#ifdef GAUXC_ENABLE_HOST -template -std::unique_ptr> - make_default_host_integrator( Args&&... args ) { - return std::make_unique>( - std::forward(args)... 
- ); -} -#endif - -#ifdef GAUXC_ENABLE_CUDA -template -std::unique_ptr> - make_default_cuda_integrator( Args&&... args ) { - return std::make_unique>( - std::forward(args)... - ); -} -#endif - -} -} diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/integrator_factory.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/integrator_factory.hpp deleted file mode 100644 index 890d766..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/integrator_factory.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include -#include - -namespace GauXC { -namespace detail { - - -template -std::unique_ptr> - default_integrator_factory( ExecutionSpace ex, Args&&... args ) { - - if( ex == ExecutionSpace::Host ) { - -#ifdef GAUXC_ENABLE_HOST - return make_default_host_integrator( - forward_as_shared_ptr(args)... - ); -#else - throw std::runtime_error("GAUXC_ENABLE_HOST is FALSE"); - return nullptr; -#endif - - } else { - -#ifdef GAUXC_ENABLE_CUDA - return make_default_cuda_integrator( forward_as_shared_ptr(args)... ); -#else - throw std::runtime_error("GAUXC_ENABLE_DEVICE is FALSE"); - return nullptr; -#endif - - } -} - -} - - -template -XCIntegrator - make_default_integrator( ExecutionSpace ex, Args&&... args ) { - - return XCIntegrator( - detail::default_integrator_factory( ex, - std::forward(args)... - ) - ); - -} - - -} diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/reference_xc_host_integrator.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/reference_xc_host_integrator.hpp deleted file mode 100644 index 6080c92..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/reference_xc_host_integrator.hpp +++ /dev/null @@ -1,128 +0,0 @@ -#pragma once -#include -#include -#include -#include - -#ifdef GAUXC_ENABLE_HOST -namespace GauXC { -namespace detail { - -using namespace GauXC::integrator::host; - - -template -class ReferenceXCHostIntegrator : public XCIntegratorImpl { - - using base_type = XCIntegratorImpl; - using matrix_type = typename base_type::matrix_type; - using value_type = typename base_type::value_type; - using basisset_type = typename base_type::basisset_type; - using exc_vxc_type = typename base_type::exc_vxc_type; - - std::shared_ptr< XCHostData< value_type > > host_data_; - - exc_vxc_type eval_exc_vxc_( const MatrixType& ) override; - -public: - - template - ReferenceXCHostIntegrator( Args&&... args ) : - base_type( std::forward(args)... ) { } - - ReferenceXCHostIntegrator( const ReferenceXCHostIntegrator& ) = default; - ReferenceXCHostIntegrator( ReferenceXCHostIntegrator&& ) noexcept = default; - - ~ReferenceXCHostIntegrator() noexcept = default; - -}; - - - - -template -typename ReferenceXCHostIntegrator::exc_vxc_type - ReferenceXCHostIntegrator::eval_exc_vxc_( const MatrixType& P ) { - - size_t nbf = this->basis_->nbf(); - - //// TODO: Check that P is sane - - - auto& tasks = this->load_balancer_->get_tasks(); - - size_t max_npts = this->load_balancer_->max_npts(); - size_t max_nbe = this->load_balancer_->max_nbe(); - size_t max_npts_x_nbe = this->load_balancer_->max_npts_x_nbe(); - - size_t n_deriv = this->func_->is_gga() ? 
1 : 0; - - // Allocate Memory - host_data_ = std::make_shared>( - n_deriv, nbf, max_npts, max_npts_x_nbe - ); - - - // Results - matrix_type VXC( nbf, nbf ); - value_type EXC, N_EL; - - // Compute Local contributions to EXC / VXC - process_batches_host_replicated_p< value_type>( - n_deriv, this->integrator_state_, XCWeightAlg::SSF, *this->func_, - *this->basis_, this->load_balancer_->molecule(), - this->load_balancer_->molmeta(), *host_data_, tasks, P.data(), - VXC.data(), &EXC, &N_EL - ); - - // Update State of Integrator - this->integrator_state_.load_balancer_populated = true; - this->integrator_state_.modified_weights_are_stored = true; - - -#ifdef GAUXC_ENABLE_MPI - - int world_size; - MPI_Comm_size( this->comm_, &world_size ); - - if( world_size > 1 ) { - - // Test of communicator is an inter-communicator - // XXX: Can't think of a case when this would be true, but who knows... - int inter_flag; - MPI_Comm_test_inter( this->comm_, &inter_flag ); - - // Is Intra-communicator, Allreduce can be done inplace - if( not inter_flag ) { - - MPI_Allreduce( MPI_IN_PLACE, VXC.data(), nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, &EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - // Isn't Intra-communicator (weird), Allreduce can't be done inplace - } else { - - matrix_type VXC_cpy = VXC; - value_type EXC_cpy = EXC, N_EL_cpy = N_EL; - - MPI_Allreduce( VXC_cpy.data(), VXC.data(), nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( &EXC_cpy, &EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( &N_EL_cpy, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - - } - - } - -#endif - - - return exc_vxc_type{EXC, std::move(VXC)}; - -} - -} -} -#endif diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/shellbatched_xc_cuda_integrator.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/shellbatched_xc_cuda_integrator.hpp deleted file mode 100644 index 2b2551a..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/shellbatched_xc_cuda_integrator.hpp +++ /dev/null @@ -1,176 +0,0 @@ -#pragma once -#include -#include -#include -#include - -#ifdef GAUXC_ENABLE_CUDA -namespace GauXC { -namespace detail { - -using namespace GauXC::integrator::cuda; - - -template -class ShellBatchedXCCudaIntegrator : public XCIntegratorImpl { - - using base_type = XCIntegratorImpl; - using matrix_type = typename base_type::matrix_type; - using value_type = typename base_type::value_type; - using basisset_type = typename base_type::basisset_type; - using exc_vxc_type = typename base_type::exc_vxc_type; - - std::shared_ptr< XCCudaData< value_type > > cuda_data_; - - exc_vxc_type eval_exc_vxc_( const MatrixType& ) override; - -public: - - template - ShellBatchedXCCudaIntegrator( Args&&... args ) : - base_type( std::forward(args)... 
) { } - - ShellBatchedXCCudaIntegrator( const ShellBatchedXCCudaIntegrator& ) = default; - ShellBatchedXCCudaIntegrator( ShellBatchedXCCudaIntegrator&& ) noexcept = default; - - ~ShellBatchedXCCudaIntegrator() noexcept = default; - -}; - - - - -template -typename ShellBatchedXCCudaIntegrator::exc_vxc_type - ShellBatchedXCCudaIntegrator::eval_exc_vxc_( const MatrixType& P ) { - - -#ifdef GAUXC_ENABLE_MAGMA - // Initialize MAGMA - { - auto ierr = magma_init(); - GAUXC_MAGMA_ERROR( "MAGMA Init Failed", ierr ); - } -#endif - -#ifdef GAUXC_ENABLE_MPI - int32_t device_count, cur_device; - cudaGetDeviceCount( &device_count ); - cudaGetDevice( &cur_device ); - - int32_t world_rank, world_size; - MPI_Comm_rank( this->comm_, &world_rank ); - MPI_Comm_size( this->comm_, &world_size ); - -/* XXX: Does not work on Summit - MPI_Comm node_comm; - MPI_Comm_split_type(this->comm_, MPI_COMM_TYPE_SHARED, 0, - MPI_INFO_NULL, &node_comm); - - int32_t node_rank, node_size; - MPI_Comm_rank( node_comm, &node_rank ); - MPI_Comm_size( node_comm, &node_size ); - - if( node_size > device_count ) - throw std::runtime_error("GauXC + CUDA Assumes MPI <-> GPU is 1-to-1"); - - cudaSetDevice( node_rank ); -*/ -#endif - - - size_t nbf = this->basis_->nbf(); - size_t nshells = this->basis_->size(); - - //// TODO: Check that P is sane - - - auto& tasks = this->load_balancer_->get_tasks(); - - //size_t max_npts = this->load_balancer_->max_npts(); - //size_t max_nbe = this->load_balancer_->max_nbe(); - //size_t max_npts_x_nbe = this->load_balancer_->max_npts_x_nbe(); - - size_t n_deriv = this->func_->is_gga() ? 1 : 0; - - this->timer_.time_op("XCIntegrator.CUDAAlloc", [&](){ - - // Allocate Memory - cuda_data_ = std::make_shared>( ); - - }); - - // Results - matrix_type VXC( nbf, nbf ); - value_type EXC, N_EL; - - this->timer_.time_op("XCIntegrator.LocalWork", [&](){ - - // Compute Local contributions to EXC / VXC - process_batches_cuda_replicated_density_shellbatched_p< value_type>( - n_deriv, this->timer_, XCWeightAlg::SSF, *this->func_, *this->basis_, - this->load_balancer_->molecule(), this->load_balancer_->molmeta(), - *cuda_data_, tasks.begin(), tasks.end(), P.data(), - VXC.data(), &EXC, &N_EL - ); - - } ); - - this->timer_.time_op("XCIntegrator.CUDAFree", [&](){ - cuda_data_.reset(); // Free up CUDA memory - } ); - -#ifdef GAUXC_ENABLE_MPI - - - if( world_size > 1 ) { - - this->timer_.time_op("XCIntegrator.AllReduce", [&]() { - - // Test of communicator is an inter-communicator - // XXX: Can't think of a case when this would be true, but who knows... 
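
The reduction step that follows this point (and its twin in the reference host and incore CUDA integrators earlier in the diff) is hard to follow in flattened form. A standalone restatement of the pattern is given below; the function name and signature are hypothetical, and it assumes an MPI build.

```cpp
#include <mpi.h>
#include <vector>

// Sketch of the replicated EXC/VXC reduction used by these integrators.
// On an intra-communicator the Allreduce may be done in place; on an
// (unlikely) inter-communicator, copies of the send buffers are required.
inline void reduce_replicated_results(MPI_Comm comm, std::vector<double>& VXC,
                                      double& EXC, double& N_EL) {
  int world_size = 1;
  MPI_Comm_size(comm, &world_size);
  if (world_size < 2) return;

  int inter_flag = 0;
  MPI_Comm_test_inter(comm, &inter_flag);

  const int nvxc = static_cast<int>(VXC.size());
  if (!inter_flag) {
    // Intra-communicator: sum directly into the existing buffers.
    MPI_Allreduce(MPI_IN_PLACE, VXC.data(), nvxc, MPI_DOUBLE, MPI_SUM, comm);
    MPI_Allreduce(MPI_IN_PLACE, &EXC, 1, MPI_DOUBLE, MPI_SUM, comm);
    MPI_Allreduce(MPI_IN_PLACE, &N_EL, 1, MPI_DOUBLE, MPI_SUM, comm);
  } else {
    // Inter-communicator: MPI_IN_PLACE is not allowed, so reduce from copies.
    std::vector<double> VXC_cpy = VXC;
    double EXC_cpy = EXC, N_EL_cpy = N_EL;
    MPI_Allreduce(VXC_cpy.data(), VXC.data(), nvxc, MPI_DOUBLE, MPI_SUM, comm);
    MPI_Allreduce(&EXC_cpy, &EXC, 1, MPI_DOUBLE, MPI_SUM, comm);
    MPI_Allreduce(&N_EL_cpy, &N_EL, 1, MPI_DOUBLE, MPI_SUM, comm);
  }
}
```
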
- int inter_flag; - MPI_Comm_test_inter( this->comm_, &inter_flag ); - - // Is Intra-communicator, Allreduce can be done inplace - if( not inter_flag ) { - - MPI_Allreduce( MPI_IN_PLACE, VXC.data(), nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, &EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - // Isn't Intra-communicator (weird), Allreduce can't be done inplace - } else { - - matrix_type VXC_cpy = VXC; - value_type EXC_cpy = EXC, N_EL_cpy = N_EL; - - MPI_Allreduce( VXC_cpy.data(), VXC.data(), nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( &EXC_cpy, &EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( &N_EL_cpy, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - } - - } ); - - } - -#endif - -#ifdef GAUXC_ENABLE_MAGMA - // Finalize MAGMA - { - auto ierr = magma_finalize(); - GAUXC_MAGMA_ERROR( "MAGMA Finalize Failed", ierr ); - } -#endif - - return exc_vxc_type{EXC, std::move(VXC)}; - -} - -} -} -#endif diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_cuda_data.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_cuda_data.hpp deleted file mode 100644 index c65a810..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_cuda_data.hpp +++ /dev/null @@ -1,126 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef GAUXC_ENABLE_CUDA - -namespace GauXC { - -template -struct XCCudaData { - - size_t nshells = 0; - size_t nbf = 0; - size_t n_deriv = 0; - size_t natoms = 0; - size_t LDatoms = 0; - - bool batch_l3_blas = true; - - void* device_ptr = nullptr; - void* dynmem_ptr = nullptr; - size_t devmem_sz = 0; - size_t dynmem_sz = 0; - - Shell* shells_device = nullptr; - Shell* important_shells_device = nullptr; - - F* vxc_device = nullptr; - F* nbe_scr_device = nullptr; - F* dmat_device = nullptr; - F* zmat_device = nullptr; - F* bf_eval_device = nullptr; - - F* dbf_x_eval_device = nullptr; - F* dbf_y_eval_device = nullptr; - F* dbf_z_eval_device = nullptr; - - F* den_eval_device = nullptr; - F* den_x_eval_device = nullptr; - F* den_y_eval_device = nullptr; - F* den_z_eval_device = nullptr; - F* eps_eval_device = nullptr; - F* gamma_eval_device = nullptr; - - F* vrho_eval_device = nullptr; - F* vgamma_eval_device = nullptr; - - - F* exc_device = nullptr; - F* nel_device = nullptr; - F* acc_scr_device = nullptr; - - F* rab_device = nullptr; - F* coords_device = nullptr; - - F** dmat_array_device = nullptr; - F** zmat_array_device = nullptr; - F** bf_array_device = nullptr; - - int* m_array_device = nullptr; - int* n_array_device = nullptr; - int* k_array_device = nullptr; - int* lda_array_device = nullptr; - int* ldb_array_device = nullptr; - int* ldc_array_device = nullptr; - - F* dist_scratch_device = nullptr; - - // Buffer Vars - F* points_device_buffer = nullptr; - F* weights_device_buffer = nullptr; - size_t* shell_list_device_buffer = nullptr; - size_t* shell_offs_device_buffer = nullptr; - int32_t* submat_cut_device_buffer = nullptr; - int32_t* submat_block_device_buffer = nullptr; - int32_t* iparent_device_buffer = nullptr; - F* dist_nearest_buffer = nullptr; - - cuda::XCTaskDevice* device_tasks = nullptr; - - // Execution management - std::unique_ptr master_stream = nullptr; - std::unique_ptr master_handle = nullptr; - -#ifdef GAUXC_ENABLE_MAGMA - std::unique_ptr master_magma_queue = nullptr; -#endif - - std::vector blas_streams; - std::vector 
blas_handles; - - XCCudaData( bool _batch_l3_blas = true ); - - ~XCCudaData() noexcept; - XCCudaData( const XCCudaData& ) = delete; - XCCudaData( XCCudaData&& ) noexcept = delete; - - - using task_iterator = std::vector< XCTask >::iterator; - using device_task_container = std::vector< cuda::XCTaskDevice >; - - - void allocate_static_data( size_t _natoms, - size_t _n_deriv, - size_t _nbf, - size_t _nshells ); - - - std::tuple< task_iterator, device_task_container > - generate_buffers( const BasisSet& basis, - task_iterator task_begin, - task_iterator task_end ); - -}; - -} - -#endif diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_cuda_util.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_cuda_util.hpp deleted file mode 100644 index fb67748..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_cuda_util.hpp +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once -#include -#include - -#include - -#ifdef GAUXC_ENABLE_CUDA -namespace GauXC { -namespace integrator { -namespace cuda { - -using host_task_iterator = std::vector::iterator; - -template -void process_batches_cuda_replicated_density_incore_p( - XCWeightAlg weight_alg, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCCudaData & cuda_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* exc, - F* n_el -); - - -template -inline void process_batches_cuda_replicated_density_incore_p( size_t n_deriv, Args&&... args ) { - if( n_deriv == 0 ) - process_batches_cuda_replicated_density_incore_p( std::forward(args)... ); - else if( n_deriv == 1 ) - process_batches_cuda_replicated_density_incore_p( std::forward(args)... ); - else - throw std::runtime_error("MGGA NYI"); -} - - - -template -void process_batches_cuda_replicated_density_shellbatched_p( - util::Timer& timer, - XCWeightAlg weight_alg, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCCudaData & cuda_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* exc, - F* n_el -); - -#ifdef GAUXC_ENABLE_NCCL -template -void device_allreduce( - ncclComm_t nccl_comm, - XCCudaData & cuda_data -); -#endif - -template -void device_transfer( - XCCudaData & cuda_data, - F* VXC, - F* EXC, - F* NEL -); - -template -inline void process_batches_cuda_replicated_density_shellbatched_p( size_t n_deriv, Args&&... args ) { - if( n_deriv == 0 ) - process_batches_cuda_replicated_density_shellbatched_p( std::forward(args)... ); - else if( n_deriv == 1 ) - process_batches_cuda_replicated_density_shellbatched_p( std::forward(args)... 
); - else - throw std::runtime_error("MGGA NYI"); -} - -} -} -} -#endif diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_host_data.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_host_data.hpp deleted file mode 100644 index 2be087f..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_host_data.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once -#include -#include -#include - -#include - -#ifdef GAUXC_ENABLE_HOST -namespace GauXC { - -template -struct XCHostData { - - std::vector eps; - std::vector gamma; - std::vector vrho; - std::vector vgamma; - - std::vector zmat; - std::vector nbe_scr; - std::vector den_scr; - std::vector basis_eval; - - - XCHostData( size_t n_deriv, - size_t nbf, - size_t max_npts, - size_t max_npts_x_nbe ) : - eps( max_npts ), - gamma( (n_deriv > 0) * max_npts ), - vrho( max_npts ), - vgamma( (n_deriv > 0) * max_npts ), - zmat( max_npts_x_nbe ), - nbe_scr( nbf * nbf ), - den_scr( (3*n_deriv + 1) * max_npts ), - basis_eval( (3*n_deriv + 1) * max_npts_x_nbe ) { } - - -}; - -} -#endif diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_host_util.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_host_util.hpp deleted file mode 100644 index a6d4d02..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_host_util.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#pragma once -#include - -#include -#include "xc_integrator_state.hpp" - -#ifdef GAUXC_ENABLE_HOST -namespace GauXC { -namespace integrator { -namespace host { - - -template -void process_batches_host_replicated_p( - XCIntegratorState integrator_state, - XCWeightAlg weight_alg, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCHostData & host_data, - std::vector< XCTask >& local_work, - const F* P, - F* VXC, - F* exc, - F* n_el -); - - -template -inline void process_batches_host_replicated_p( size_t n_deriv, Args&&... args ) { - if( n_deriv == 0 ) - process_batches_host_replicated_p( std::forward(args)... ); - else if( n_deriv == 1 ) - process_batches_host_replicated_p( std::forward(args)... 
); - else - throw std::runtime_error("MGGA NYI"); -} - -} -} -} -#endif diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_integrator_impl.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_integrator_impl.hpp deleted file mode 100644 index 442f427..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_integrator_impl.hpp +++ /dev/null @@ -1,72 +0,0 @@ -#pragma once - -#include -#include "xc_integrator_state.hpp" - -namespace GauXC { -namespace detail { - -template -class XCIntegratorImpl { - -public: - - using matrix_type = MatrixType; - using value_type = typename matrix_type::value_type; - using basisset_type = typename XCIntegrator::basisset_type; - using exc_vxc_type = typename XCIntegrator::exc_vxc_type; - -protected: - -#ifdef GAUXC_ENABLE_MPI - MPI_Comm comm_; -#endif - std::shared_ptr func_; - std::shared_ptr basis_; - - std::shared_ptr load_balancer_; - XCIntegratorState integrator_state_; - - util::Timer timer_; - - virtual exc_vxc_type eval_exc_vxc_( const MatrixType& ) = 0; - -public: - -#ifdef GAUXC_ENABLE_MPI - - XCIntegratorImpl( MPI_Comm comm, - std::shared_ptr func, - std::shared_ptr basis, - std::shared_ptr lb - ) : comm_(comm), func_(func), basis_(basis), load_balancer_(lb) { }; - -#else - - XCIntegratorImpl( std::shared_ptr func, - std::shared_ptr basis, - std::shared_ptr lb - ) : func_(func), basis_(basis), load_balancer_(lb) { }; - -#endif - - - - XCIntegratorImpl( const XCIntegratorImpl& ) = default; - XCIntegratorImpl( XCIntegratorImpl&& ) noexcept = default; - - - virtual ~XCIntegratorImpl() noexcept = default; - - - exc_vxc_type eval_exc_vxc( const MatrixType& P ) { - return eval_exc_vxc_(P); - } - - const util::Timer& get_timings() const { - return timer_; - } -}; - -} -} diff --git a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_integrator_state.hpp b/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_integrator_state.hpp deleted file mode 100644 index 4bc2113..0000000 --- a/third_party/gauxc/attic/include/gauxc/xc_integrator/xc_integrator_state.hpp +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -namespace GauXC { - -struct XCIntegratorState { - bool load_balancer_populated = false; - bool modified_weights_are_stored = false; -}; - -} diff --git a/third_party/gauxc/attic/src/integrator/CMakeLists.txt b/third_party/gauxc/attic/src/integrator/CMakeLists.txt deleted file mode 100644 index f6fe2b4..0000000 --- a/third_party/gauxc/attic/src/integrator/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# Common Integrator Utilities -target_sources( gauxc PRIVATE integrator_common.cxx ) -target_include_directories( gauxc - PUBLIC - $ -) - -# Host Integrator Utilities -if( GAUXC_ENABLE_HOST ) - include( host/gauxc-host_integrator.cmake ) -endif() - -if( GAUXC_ENABLE_CUDA ) - include( cuda/gauxc-cuda_integrator.cmake ) -endif() diff --git a/third_party/gauxc/attic/src/integrator/cuda/buffer_adaptor.hpp b/third_party/gauxc/attic/src/integrator/cuda/buffer_adaptor.hpp deleted file mode 100644 index 130f1f1..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/buffer_adaptor.hpp +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -namespace GauXC { - -class buffer_adaptor { - - size_t nalloc_; - size_t nleft_; - void* top_; - void* stack_; - -public: - - buffer_adaptor() = delete; - - inline buffer_adaptor( void* ptr, size_t len ) : - nalloc_(len), - nleft_(len), - top_(ptr), - stack_(ptr) { } - - template - T* aligned_alloc( size_t len, - size_t align = alignof(T) ) { - - char* old_stack = (char*)stack_; - if( std::align( 
align, - len*sizeof(T), - stack_, - nleft_ ) ) { - - T* result = reinterpret_cast(stack_); - stack_ = (char*)stack_ + len*sizeof(T); - nleft_ -= std::distance( old_stack, - (char*)stack_ ); - return result; - - } - - throw std::bad_alloc(); - - } - - inline void* stack() const {return stack_;} - inline size_t nleft() const { return nleft_; } - -}; - - -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_angular_cartesian.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_angular_cartesian.hpp deleted file mode 100644 index 32088f5..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_angular_cartesian.hpp +++ /dev/null @@ -1,308 +0,0 @@ -#pragma once -#include "collocation_device_constants.hpp" -#include - -#ifndef GPGAUEVAL_INLINE -# define GPGAUEVAL_INLINE __noinline__ -#endif - -namespace GauXC { -namespace integrator { -namespace cuda { - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_0( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_0_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf_x; - - eval_y[npts * 0] = bf_y; - - eval_z[npts * 0] = bf_z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_1( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*x; - eval[npts * 1] = bf*y; - eval[npts * 2] = bf*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_1_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf + bf_x*x; - eval_x[npts * 1] = bf_x*y; - eval_x[npts * 2] = bf_x*z; - - eval_y[npts * 0] = bf_y*x; - eval_y[npts * 1] = bf + bf_y*y; - eval_y[npts * 2] = bf_y*z; - - eval_z[npts * 0] = bf_z*x; - eval_z[npts * 1] = bf_z*y; - eval_z[npts * 2] = bf + bf_z*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_2( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*x*x; - eval[npts * 1] = bf*x*y; - eval[npts * 2] = bf*x*z; - eval[npts * 3] = bf*y*y; - eval[npts * 4] = bf*y*z; - eval[npts * 5] = bf*z*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_2_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = x*(2*bf + bf_x*x); - eval_x[npts * 1] = y*(bf + bf_x*x); - eval_x[npts * 2] = z*(bf + bf_x*x); - eval_x[npts * 3] = bf_x*y*y; - eval_x[npts * 4] = bf_x*y*z; - eval_x[npts * 5] = bf_x*z*z; - - eval_y[npts * 0] = bf_y*x*x; - eval_y[npts * 1] = x*(bf + bf_y*y); - eval_y[npts * 2] = bf_y*x*z; - eval_y[npts * 3] = y*(2*bf + bf_y*y); - eval_y[npts * 4] = z*(bf + bf_y*y); - eval_y[npts * 5] = bf_y*z*z; - - eval_z[npts * 0] = bf_z*x*x; - eval_z[npts * 1] = bf_z*x*y; - eval_z[npts * 2] = x*(bf + bf_z*z); - eval_z[npts * 3] = bf_z*y*y; - 
eval_z[npts * 4] = y*(bf + bf_z*z); - eval_z[npts * 5] = z*(2*bf + bf_z*z); - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_3( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*x*x*x; - eval[npts * 1] = bf*x*x*y; - eval[npts * 2] = bf*x*x*z; - eval[npts * 3] = bf*x*y*y; - eval[npts * 4] = bf*x*y*z; - eval[npts * 5] = bf*x*z*z; - eval[npts * 6] = bf*y*y*y; - eval[npts * 7] = bf*y*y*z; - eval[npts * 8] = bf*y*z*z; - eval[npts * 9] = bf*z*z*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_3_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = x*x*(3*bf + bf_x*x); - eval_x[npts * 1] = x*y*(2*bf + bf_x*x); - eval_x[npts * 2] = x*z*(2*bf + bf_x*x); - eval_x[npts * 3] = y*y*(bf + bf_x*x); - eval_x[npts * 4] = y*z*(bf + bf_x*x); - eval_x[npts * 5] = z*z*(bf + bf_x*x); - eval_x[npts * 6] = bf_x*y*y*y; - eval_x[npts * 7] = bf_x*y*y*z; - eval_x[npts * 8] = bf_x*y*z*z; - eval_x[npts * 9] = bf_x*z*z*z; - - eval_y[npts * 0] = bf_y*x*x*x; - eval_y[npts * 1] = x*x*(bf + bf_y*y); - eval_y[npts * 2] = bf_y*x*x*z; - eval_y[npts * 3] = x*y*(2*bf + bf_y*y); - eval_y[npts * 4] = x*z*(bf + bf_y*y); - eval_y[npts * 5] = bf_y*x*z*z; - eval_y[npts * 6] = y*y*(3*bf + bf_y*y); - eval_y[npts * 7] = y*z*(2*bf + bf_y*y); - eval_y[npts * 8] = z*z*(bf + bf_y*y); - eval_y[npts * 9] = bf_y*z*z*z; - - eval_z[npts * 0] = bf_z*x*x*x; - eval_z[npts * 1] = bf_z*x*x*y; - eval_z[npts * 2] = x*x*(bf + bf_z*z); - eval_z[npts * 3] = bf_z*x*y*y; - eval_z[npts * 4] = x*y*(bf + bf_z*z); - eval_z[npts * 5] = x*z*(2*bf + bf_z*z); - eval_z[npts * 6] = bf_z*y*y*y; - eval_z[npts * 7] = y*y*(bf + bf_z*z); - eval_z[npts * 8] = y*z*(2*bf + bf_z*z); - eval_z[npts * 9] = z*z*(3*bf + bf_z*z); - -} - - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular( - const int32_t npts, - const int32_t l, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - if( l == 0 ) { - - collocation_cartesian_angular_0( npts, bf, x, y, z, eval ); - - } else if( l == 1 ) { - - collocation_cartesian_angular_1( npts, bf, x, y, z, eval ); - - } else if( l == 2 ) { - - collocation_cartesian_angular_2( npts, bf, x, y, z, eval ); - - } else if( l == 3 ) { - - collocation_cartesian_angular_3( npts, bf, x, y, z, eval ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_cartesian_angular - - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_deriv1( - const int32_t npts, - const int32_t l, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - - if( l == 0 ) { - - collocation_cartesian_angular_0( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_0_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 1 ) { - - collocation_cartesian_angular_1( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_1_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 2 ) { - - collocation_cartesian_angular_2( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_2_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } 
else if( l == 3 ) { - - collocation_cartesian_angular_3( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_cartesian_angular_deriv1 - - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_angular_spherical_unnorm.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_angular_spherical_unnorm.hpp deleted file mode 100644 index 9de5f11..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_angular_spherical_unnorm.hpp +++ /dev/null @@ -1,292 +0,0 @@ -#pragma once -#include "collocation_device_constants.hpp" -#include - -#ifndef GPGAUEVAL_INLINE -# define GPGAUEVAL_INLINE __noinline__ -#endif - -namespace GauXC { -namespace integrator { -namespace cuda { - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_0( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_0_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf_x; - - eval_y[npts * 0] = bf_y; - - eval_z[npts * 0] = bf_z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_1( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*y; - eval[npts * 1] = bf*z; - eval[npts * 2] = bf*x; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_1_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf_x*y; - eval_x[npts * 1] = bf_x*z; - eval_x[npts * 2] = bf + bf_x*x; - - eval_y[npts * 0] = bf + bf_y*y; - eval_y[npts * 1] = bf_y*z; - eval_y[npts * 2] = bf_y*x; - - eval_z[npts * 0] = bf_z*y; - eval_z[npts * 1] = bf + bf_z*z; - eval_z[npts * 2] = bf_z*x; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_2( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = sqrt_3*bf*x*y; - eval[npts * 1] = sqrt_3*bf*y*z; - eval[npts * 2] = bf*(-x*x - y*y + 2*z*z)/2; - eval[npts * 3] = sqrt_3*bf*x*z; - eval[npts * 4] = sqrt_3*bf*(x*x - y*y)/2; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_2_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = sqrt_3*y*(bf + bf_x*x); - eval_x[npts * 1] = sqrt_3*bf_x*y*z; - eval_x[npts * 2] = -bf*x - bf_x*(x*x + y*y - 2*z*z)/2; - eval_x[npts * 3] = sqrt_3*z*(bf + bf_x*x); - eval_x[npts * 4] = sqrt_3*(bf*x + bf_x*(x*x - y*y)/2); - - eval_y[npts * 0] = sqrt_3*x*(bf + bf_y*y); - eval_y[npts * 1] = sqrt_3*z*(bf + bf_y*y); - eval_y[npts * 2] = -bf*y - bf_y*(x*x + y*y - 2*z*z)/2; - eval_y[npts * 3] = sqrt_3*bf_y*x*z; - eval_y[npts * 4] = 
sqrt_3*(-bf*y + bf_y*(x*x - y*y)/2); - - eval_z[npts * 0] = sqrt_3*bf_z*x*y; - eval_z[npts * 1] = sqrt_3*y*(bf + bf_z*z); - eval_z[npts * 2] = 2*bf*z - bf_z*(x*x + y*y - 2*z*z)/2; - eval_z[npts * 3] = sqrt_3*x*(bf + bf_z*z); - eval_z[npts * 4] = sqrt_3*bf_z*(x*x - y*y)/2; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = sqrt_10*bf*y*(3*x*x - y*y)/4; - eval[npts * 1] = sqrt_15*bf*x*y*z; - eval[npts * 2] = sqrt_6*bf*y*(-x*x - y*y + 4*z*z)/4; - eval[npts * 3] = bf*z*(-3*x*x - 3*y*y + 2*z*z)/2; - eval[npts * 4] = sqrt_6*bf*x*(-x*x - y*y + 4*z*z)/4; - eval[npts * 5] = sqrt_15*bf*z*(x*x - y*y)/2; - eval[npts * 6] = sqrt_10*bf*x*(x*x - 3*y*y)/4; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = sqrt_10*y*(6*bf*x + bf_x*(3*x*x - y*y))/4; - eval_x[npts * 1] = sqrt_15*y*z*(bf + bf_x*x); - eval_x[npts * 2] = -sqrt_6*y*(2*bf*x + bf_x*(x*x + y*y - 4*z*z))/4; - eval_x[npts * 3] = -z*(6*bf*x + bf_x*(3*x*x + 3*y*y - 2*z*z))/2; - eval_x[npts * 4] = -sqrt_6*(bf*(3*x*x + y*y - 4*z*z) + bf_x*x*(x*x + y*y - 4*z*z))/4; - eval_x[npts * 5] = sqrt_15*z*(2*bf*x + bf_x*(x*x - y*y))/2; - eval_x[npts * 6] = sqrt_10*(3*bf*(x*x - y*y) + bf_x*x*(x*x - 3*y*y))/4; - - eval_y[npts * 0] = sqrt_10*(-3*bf*(-x*x + y*y) + bf_y*y*(3*x*x - y*y))/4; - eval_y[npts * 1] = sqrt_15*x*z*(bf + bf_y*y); - eval_y[npts * 2] = -sqrt_6*(bf*(x*x + 3*y*y - 4*z*z) + bf_y*y*(x*x + y*y - 4*z*z))/4; - eval_y[npts * 3] = -z*(6*bf*y + bf_y*(3*x*x + 3*y*y - 2*z*z))/2; - eval_y[npts * 4] = -sqrt_6*x*(2*bf*y + bf_y*(x*x + y*y - 4*z*z))/4; - eval_y[npts * 5] = sqrt_15*z*(-2*bf*y + bf_y*(x*x - y*y))/2; - eval_y[npts * 6] = sqrt_10*x*(-6*bf*y + bf_y*(x*x - 3*y*y))/4; - - eval_z[npts * 0] = sqrt_10*bf_z*y*(3*x*x - y*y)/4; - eval_z[npts * 1] = sqrt_15*x*y*(bf + bf_z*z); - eval_z[npts * 2] = sqrt_6*y*(8*bf*z - bf_z*(x*x + y*y - 4*z*z))/4; - eval_z[npts * 3] = -3*bf*(x*x + y*y - 2*z*z)/2 - bf_z*z*(3*x*x + 3*y*y - 2*z*z)/2; - eval_z[npts * 4] = sqrt_6*x*(8*bf*z - bf_z*(x*x + y*y - 4*z*z))/4; - eval_z[npts * 5] = sqrt_15*(bf + bf_z*z)*(x*x - y*y)/2; - eval_z[npts * 6] = sqrt_10*bf_z*x*(x*x - 3*y*y)/4; - -} - - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular( - const int32_t npts, - const int32_t l, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - if( l == 0 ) { - - collocation_spherical_unnorm_angular_0( npts, bf, x, y, z, eval ); - - } else if( l == 1 ) { - - collocation_spherical_unnorm_angular_1( npts, bf, x, y, z, eval ); - - } else if( l == 2 ) { - - collocation_spherical_unnorm_angular_2( npts, bf, x, y, z, eval ); - - } else if( l == 3 ) { - - collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_spherical_unnorm_angular - - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_deriv1( - const int32_t npts, - const int32_t l, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - - if( l == 0 ) { - 
- collocation_spherical_unnorm_angular_0( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_0_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 1 ) { - - collocation_spherical_unnorm_angular_1( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_1_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 2 ) { - - collocation_spherical_unnorm_angular_2( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_2_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 3 ) { - - collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_spherical_unnorm_angular_deriv1 - - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_device_constants.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_device_constants.hpp deleted file mode 100644 index ef3fb6b..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_device_constants.hpp +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -namespace GauXC { -namespace integrator { -namespace cuda { - - constexpr double sqrt_15 = 3.872983346207417; - constexpr double sqrt_3 = 1.7320508075688772; - constexpr double sqrt_6 = 2.449489742783178; - constexpr double sqrt_10 = 3.1622776601683795; - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_radial.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_radial.hpp deleted file mode 100644 index 03d8efb..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/collocation_radial.hpp +++ /dev/null @@ -1,97 +0,0 @@ -#include -#include - -#include - - -namespace GauXC { -namespace integrator { -namespace cuda { - -__inline__ __device__ void collocation_device_radial_eval( - const Shell& shell, - const double* pt, - double* x, - double* y, - double* z, - double* eval_device -) { - - const auto* O = shell.O_data(); - const auto* alpha = shell.alpha_data(); - const auto* coeff = shell.coeff_data(); - - const double xc = pt[0] - O[0]; - const double yc = pt[1] - O[1]; - const double zc = pt[2] - O[2]; - *x = xc; - *y = yc; - *z = zc; - - const double rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - double tmp = 0.; - for( uint32_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - *eval_device = tmp; - -} - - - -__inline__ __device__ void collocation_device_radial_eval_deriv1( - const Shell& shell, - const double* pt, - double* x, - double* y, - double* z, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z -) { - - const auto* O = shell.O_data(); - const auto* alpha = shell.alpha_data(); - const auto* coeff = shell.coeff_data(); - - const double xc = pt[0] - O[0]; - const double yc = pt[1] - O[1]; - const double zc = pt[2] - O[2]; - *x = xc; - *y = yc; - *z = zc; - - const double rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - double tmp = 0.; - double tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( uint32_t i = 0; i < nprim; ++i ) { - - const double a = alpha[i]; - const double e = coeff[i] * 
std::exp( - a * rsq ); - - const double ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - *eval_device = tmp; - *deval_device_x = tmp_x; - *deval_device_y = tmp_y; - *deval_device_z = tmp_z; - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - - diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/deprecated/gaueval_kernels_template.cu b/third_party/gauxc/attic/src/integrator/cuda/collocation/deprecated/gaueval_kernels_template.cu deleted file mode 100644 index c9d0a8c..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/deprecated/gaueval_kernels_template.cu +++ /dev/null @@ -1,129 +0,0 @@ -//#include -#include -#include - -#include "gaueval_kernels.hpp" -#include "gaueval_angular_cartesian.hpp" -#include "gaueval_angular_spherical.hpp" -#include "gaueval_angular_spherical_unnorm.hpp" - -namespace GauXC { - -__global__ -void gaueval_device_$(ang_name)_kernel( - size_t nshells, - size_t nbf, - size_t npts, - const StaticShell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* O = device::array_data( shell.O ); - const auto* alpha = device::array_data( shell.alpha ); - const auto* coeff = device::array_data( shell.coeff ); - - const double xc = pt[0] - O[0]; - const double yc = pt[1] - O[1]; - const double zc = pt[2] - O[2]; - - const double rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim; - double tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - double * bf_eval = eval_device + ibf + ipt*nbf; - gaueval_$(ang_name)_angular( shell.l, tmp, xc, yc, zc, bf_eval ); - - } - -} - - - -__global__ -void gaueval_device_$(ang_name)_kernel_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const StaticShell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* O = device::array_data( shell.O ); - const auto* alpha = device::array_data( shell.alpha ); - const auto* coeff = device::array_data( shell.coeff ); - - const double xc = pt[0] - O[0]; - const double yc = pt[1] - O[1]; - const double zc = pt[2] - O[2]; - - const double rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim; - double tmp = 0.; - double tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const double a = alpha[i]; - const double e = coeff[i] * std::exp( - a * rsq ); - - const double ae = 2. 
* a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - double * bf_eval = eval_device + ibf + ipt*nbf; - double * dx_eval = deval_device_x + ibf + ipt*nbf; - double * dy_eval = deval_device_y + ibf + ipt*nbf; - double * dz_eval = deval_device_z + ibf + ipt*nbf; - - gaueval_$(ang_name)_angular_deriv1( shell.l, tmp, tmp_x, tmp_y, tmp_z, xc, yc, zc, bf_eval, dx_eval, dy_eval, dz_eval ); - - } - - -} - - -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/deprecated/generate_bfeval.py b/third_party/gauxc/attic/src/integrator/cuda/collocation/deprecated/generate_bfeval.py deleted file mode 100644 index 178a979..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/deprecated/generate_bfeval.py +++ /dev/null @@ -1,440 +0,0 @@ -import cmath -import math -import os -import re -import sys -from math import factorial as fact - -import sympy -from scipy.special import binom as binomial -from sympy import I as symb_I -from sympy import exp as symb_exp -from sympy import factorial as symb_fact -from sympy import factorial2 as symb_fact2 - - -def generate_cartesian_ls(L): - l = [] - for i in range(L + 1): - lx = L - i - for j in range(i + 1): - ly = i - j - lz = L - lx - ly - - l.append([0, 0, 0]) - - for k in range(lx - 1): - l[-1][0] = l[-1][0] + 1 - for k in range(ly - 1): - l[-1][1] = l[-1][1] + 1 - for k in range(lz - 1): - l[-1][2] = l[-1][2] + 1 - - if lx > 0: - l[-1][0] = l[-1][0] + 1 - if ly > 0: - l[-1][1] = l[-1][1] + 1 - if lz > 0: - l[-1][2] = l[-1][2] + 1 - - return l - - -def generate_spherical_coeff(l, m, lx, ly, lz): - j = lx + ly - abs(m) - if j % 2 == 0: - j = int(j / 2) - else: - return 0.0 - - prefactor = fact(2.0 * lx) * fact(2.0 * ly) * fact(2.0 * lz) * fact(l) - prefactor = prefactor * fact(l - abs(m)) - prefactor = prefactor / (fact(2.0 * l) * fact(lx) * fact(ly) * fact(lz)) - prefactor = prefactor / fact(l + abs(m)) - prefactor = math.sqrt(prefactor) - - term1 = 0.0 - for i in range(int((l - abs(m)) / 2) + 1): - term1 = term1 + binomial(l, i) * binomial(i, j) * math.pow(-1, i) * fact( - 2 * l - 2 * i - ) / fact(l - abs(m) - 2 * i) - - term1 = term1 / math.pow(2, l) / fact(l) - - m_fact = 1.0 - if m < 0: - m_fact = -1.0 - - term2 = 0.0 + 0.0j - for k in range(j + 1): - z = cmath.exp(m_fact * math.pi / 2.0 * (abs(m) - lx + 2 * k) * 1.0j) - term2 = term2 + binomial(j, k) * binomial(abs(m), lx - 2 * k) * z - - val = prefactor * term1 * term2 - - if abs(val.real) < 1e-10: - val = 0.0 + val.imag * 1j - if abs(val.imag) < 1e-10: - val = val.real - - return val - - -def generate_spherical_coeff_symb(l, m, lx, ly, lz, unnorm=False): - j = lx + ly - abs(m) - if j % 2 == 0: - j = int(j / 2) - else: - return sympy.Integer(0) - - j_symb = sympy.Integer(j) - l_symb = sympy.Integer(l) - m_symb = sympy.Integer(abs(m)) - lx_symb = sympy.Integer(lx) - ly_symb = sympy.Integer(ly) - lz_symb = sympy.Integer(lz) - - prefactor = ( - symb_fact(2 * lx_symb) - * symb_fact(2 * ly_symb) - * symb_fact(2 * lz_symb) - * symb_fact(l_symb) - ) - prefactor = prefactor * symb_fact(l_symb - m_symb) - prefactor = prefactor / ( - symb_fact(2 * l_symb) - * symb_fact(lx_symb) - * symb_fact(ly_symb) - * symb_fact(lz_symb) - ) - prefactor = prefactor / symb_fact(l_symb + m_symb) - - # Ed's stupid normalization convention... 
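
The radial routines deleted in the hunks above (`collocation_device_radial_eval`, its `deriv1` variant, and the templated gaueval kernels) all evaluate a contracted Gaussian and its Cartesian gradient at a grid point before the angular factors are applied. A CPU-side sketch of that evaluation is shown below; the flat pointer layout and function name are assumptions for illustration only.

```cpp
#include <cmath>
#include <cstdint>

// Sketch of the contracted-Gaussian radial evaluation used by the deleted
// collocation kernels: phi(r) = sum_i c_i * exp(-alpha_i * |r - O|^2), with
// gradient components -2 * alpha_i * (r - O) * c_i * exp(...), accumulated
// over primitives.
inline void radial_eval_deriv1(const double* O, const double* alpha,
                               const double* coeff, uint32_t nprim,
                               const double* pt, double* val, double* grad) {
  const double xc = pt[0] - O[0];
  const double yc = pt[1] - O[1];
  const double zc = pt[2] - O[2];
  const double rsq = xc * xc + yc * yc + zc * zc;

  double tmp = 0.0, tx = 0.0, ty = 0.0, tz = 0.0;
  for (uint32_t i = 0; i < nprim; ++i) {
    const double e  = coeff[i] * std::exp(-alpha[i] * rsq);
    const double ae = 2.0 * alpha[i] * e;
    tmp += e;
    tx  -= ae * xc;
    ty  -= ae * yc;
    tz  -= ae * zc;
  }

  *val    = tmp;
  grad[0] = tx;
  grad[1] = ty;
  grad[2] = tz;
}
```
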
- if unnorm: - prefactor = ( - prefactor - * symb_fact2(2 * l - 1) - / symb_fact2(2 * lx - 1) - / symb_fact2(2 * ly - 1) - / symb_fact2(2 * lz - 1) - ) - - prefactor = sympy.sqrt(prefactor) - - term1 = sympy.Integer(0) - for i in range(int((l - abs(m)) / 2) + 1): - term1 = term1 + sympy.Integer(binomial(l, i)) * sympy.Integer( - binomial(i, j) - ) * sympy.Integer(math.pow(-1, i)) * symb_fact( - 2 * l_symb - sympy.Integer(2 * i) - ) / symb_fact(l_symb - m_symb - sympy.Integer(2 * i)) - - term1 = term1 / (2**l_symb) / symb_fact(l) - - m_fact_symb = sympy.Integer(1) - if m < 0: - m_fact_symb = -m_fact_symb - - term2 = sympy.Integer(0) - for k in range(j + 1): - z = sympy.exp( - m_fact_symb - * sympy.pi - / 2 - * (m_symb - lx_symb + sympy.Integer(2 * k)) - * symb_I - ) - term2 = ( - term2 - + sympy.Integer(binomial(j, k)) - * sympy.Integer(binomial(abs(m), lx - 2 * k)) - * z - ) - - return prefactor * term1 * term2 - - -def generate_cartesian_angular(ls): - [x, y, z, r] = sympy.symbols("x y z r", real=True) - - ang = [] - - for l in ls: - ang.append(r) - for i in range(l[0]): - ang[-1] = ang[-1] * x - for i in range(l[1]): - ang[-1] = ang[-1] * y - for i in range(l[2]): - ang[-1] = ang[-1] * z - - ang[-1] = ang[-1] / r - - return ang - - -def generate_spherical_angular(L, unnorm=False): - ls = generate_cartesian_ls(L) - angs = generate_cartesian_angular(ls) - - # r = sympy.symbols( 'r' ) - sph_angs = [] - for m in range(L + 1): - tmp_p = 0 - tmp_m = 0 - for i in range(len(ls)): - l = ls[i] - ang = angs[i] - - # c = generate_spherical_coeff( L, m, l[0],l[1],l[2] ) - c = generate_spherical_coeff_symb(L, m, l[0], l[1], l[2], unnorm) - - if m == 0: - tmp_p = tmp_p + c * ang - - else: - c_p = (c + sympy.conjugate(c)) / sympy.sqrt(2) - c_m = (c - sympy.conjugate(c)) / sympy.sqrt(2) / symb_I - - tmp_p = tmp_p + c_p * ang - tmp_m = tmp_m + c_m * ang - - sph_angs.append((m, tmp_p)) - if m > 0: - sph_angs.append((-m, tmp_m)) - - sph_angs = sorted(sph_angs, key=lambda x: x[0]) - - sph_angs_bare = [] - for a in sph_angs: - sph_angs_bare.append(sympy.simplify(a[1])) - - return sph_angs_bare - - -def generate_eval_lines(L, ang): - [x, y, z, r] = sympy.symbols("x y z r", real=True) - [bf, bf_x, bf_y, bf_z] = sympy.symbols("bf bf_x bf_y bf_z", real=True) - - bf_eval_strs = [] - bf_x_eval_strs = [] - bf_y_eval_strs = [] - bf_z_eval_strs = [] - - for j in range(len(ang)): - a = ang[j] - a_x = sympy.diff(a, x) - a_y = sympy.diff(a, y) - a_z = sympy.diff(a, z) - - bf_eval = sympy.simplify(a * bf) - bf_x_eval = sympy.simplify(a_x * bf + a * bf_x) - bf_y_eval = sympy.simplify(a_y * bf + a * bf_y) - bf_z_eval = sympy.simplify(a_z * bf + a * bf_z) - - bf_eval_str = "eval[{}] = {};".format(j, bf_eval) - bf_x_eval_str = "eval_x[{}] = {};".format(j, bf_x_eval) - bf_y_eval_str = "eval_y[{}] = {};".format(j, bf_y_eval) - bf_z_eval_str = "eval_z[{}] = {};".format(j, bf_z_eval) - - if L >= 2: - for k in range(2, L + 1): - for X in ("x", "y", "z"): - pow_str = X + "**" + str(k) - repl_str = "" - for K in range(k - 1): - repl_str = repl_str + X + "*" - repl_str = repl_str + X - - bf_eval_str = bf_eval_str.replace(pow_str, repl_str) - bf_x_eval_str = bf_x_eval_str.replace(pow_str, repl_str) - bf_y_eval_str = bf_y_eval_str.replace(pow_str, repl_str) - bf_z_eval_str = bf_z_eval_str.replace(pow_str, repl_str) - - bf_eval_strs.append(bf_eval_str) - bf_x_eval_strs.append(bf_x_eval_str) - bf_y_eval_strs.append(bf_y_eval_str) - bf_z_eval_strs.append(bf_z_eval_str) - - return (bf_eval_strs, bf_x_eval_strs, bf_y_eval_strs, 
bf_z_eval_strs) - - -cart_header_fname = "gaueval_angular_cartesian.hpp" -sphr_header_fname = "gaueval_angular_spherical.hpp" -cons_header_fname = "gaueval_device_constants.hpp" - -cart_header_file = open(cart_header_fname, "w") -sphr_header_file = open(sphr_header_fname, "w") -cons_header_file = open(cons_header_fname, "w") - -L_max = 4 -do_libint_norm = False -# do_libint_norm = True - -preamble = """ -#pragma once -#include "gaueval_device_constants.hpp" - -#define GPGAUEVAL_INLINE __inline__ - -namespace GauXC { -""" - - -cart_header_file.write(preamble) -sphr_header_file.write(preamble) - -cartesian_bf_template = """ -GPGAUEVAL_INLINE __device__ void generate_cartesian_angular{}( - const double bf, - const double x, - const double y, - const double z, - double* eval -) {{ -""" - -cartesian_bf_deriv1_template = """ -GPGAUEVAL_INLINE __device__ void generate_cartesian_angular{}_deriv1( - const double bf, - const double bf_x, - const double bf_y, - const double bf_z, - const double x, - const double y, - const double z, - double* eval_x, - double* eval_y, - double* eval_z -) {{ -""" - -spherical_bf_template = cartesian_bf_template.replace("cartesian", "spherical") -spherical_bf_deriv1_template = cartesian_bf_deriv1_template.replace( - "cartesian", "spherical" -) - - -constant_lines = [] -for L in range(L_max + 1): - sph_ang = generate_spherical_angular(L, do_libint_norm) - car_ang = generate_cartesian_angular(generate_cartesian_ls(L)) - - sph_bf_eval_strs, sph_bf_x_eval_strs, sph_bf_y_eval_strs, sph_bf_z_eval_strs = ( - generate_eval_lines(L, sph_ang) - ) - car_bf_eval_strs, car_bf_x_eval_strs, car_bf_y_eval_strs, car_bf_z_eval_strs = ( - generate_eval_lines(L, car_ang) - ) - - cartesian_bf_prototype = cartesian_bf_template.format("_" + str(L)) - spherical_bf_prototype = spherical_bf_template.format("_" + str(L)) - cartesian_bf_deriv1_prototype = cartesian_bf_deriv1_template.format("_" + str(L)) - spherical_bf_deriv1_prototype = spherical_bf_deriv1_template.format("_" + str(L)) - - spherical_bf_func = spherical_bf_prototype + "\n" - for s in sph_bf_eval_strs: - spherical_bf_func = spherical_bf_func + " " + s + "\n" - spherical_bf_func = spherical_bf_func + "\n}\n" - - spherical_bf_deriv1_func = spherical_bf_deriv1_prototype + "\n" - for s in sph_bf_x_eval_strs: - spherical_bf_deriv1_func = spherical_bf_deriv1_func + " " + s + "\n" - spherical_bf_deriv1_func = spherical_bf_deriv1_func + "\n" - for s in sph_bf_y_eval_strs: - spherical_bf_deriv1_func = spherical_bf_deriv1_func + " " + s + "\n" - spherical_bf_deriv1_func = spherical_bf_deriv1_func + "\n" - for s in sph_bf_z_eval_strs: - spherical_bf_deriv1_func = spherical_bf_deriv1_func + " " + s + "\n" - spherical_bf_deriv1_func = spherical_bf_deriv1_func + "\n}\n" - - cartesian_bf_func = cartesian_bf_prototype + "\n" - for s in car_bf_eval_strs: - cartesian_bf_func = cartesian_bf_func + " " + s + "\n" - cartesian_bf_func = cartesian_bf_func + "\n}\n" - - cartesian_bf_deriv1_func = cartesian_bf_deriv1_prototype + "\n" - for s in car_bf_x_eval_strs: - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + " " + s + "\n" - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + "\n" - for s in car_bf_y_eval_strs: - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + " " + s + "\n" - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + "\n" - for s in car_bf_z_eval_strs: - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + " " + s + "\n" - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + "\n}\n" - - sqrt_regex = "sqrt\([0-9]+\)" - - 
sqrt_finds = re.findall(sqrt_regex, spherical_bf_func) - sqrt_finds = sqrt_finds + (re.findall(sqrt_regex, spherical_bf_deriv1_func)) - sqrt_finds = sqrt_finds + (re.findall(sqrt_regex, cartesian_bf_func)) - sqrt_finds = sqrt_finds + (re.findall(sqrt_regex, cartesian_bf_deriv1_func)) - - sqrt_finds = list(set(sqrt_finds)) - - for x in sqrt_finds: - arg = x.strip("sqrt(").strip(")") - new_str = "sqrt_" + arg - spherical_bf_func = spherical_bf_func.replace(x, new_str) - spherical_bf_deriv1_func = spherical_bf_deriv1_func.replace(x, new_str) - cartesian_bf_func = cartesian_bf_func.replace(x, new_str) - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func.replace(x, new_str) - - new_str = "constexpr double " + new_str + " = " + str(math.sqrt(int(arg))) + ";" - constant_lines.append(new_str) - - cart_header_file.write(cartesian_bf_func) - cart_header_file.write(cartesian_bf_deriv1_func) - sphr_header_file.write(spherical_bf_func) - sphr_header_file.write(spherical_bf_deriv1_func) - - -# Generate calling routines -cartesian_bf_calling_func = cartesian_bf_template.format("") -spherical_bf_calling_func = spherical_bf_template.format("") -cartesian_bf_deriv1_calling_func = cartesian_bf_deriv1_template.format("") -spherical_bf_deriv1_calling_func = spherical_bf_deriv1_template.format("") - -am_dispatch_template = "switch( shell.l ) {{\n" -am_dispatch_template_deriv1 = "switch( shell.l ) {{\n" -for L in range(L_max + 1): - bf_template = """ - case {0}: - gaueval_{{0}}_angular_{0}(tmp, xc, yc, zc, bf_eval); - break; -""".format(L) - - deriv1_template = """ - case {0}: - gaueval_{{0}}_angular_{0}(tmp, xc, yc, zc, bf_eval); - gaueval_{{0}}_angular_{0}_deriv1(tmp, tmp_x, tmp_y, tmp_z, xc, yc, zc, bf_eval, bf_x_eval, bf_y_eval, bf_z_eval); - break; -""".format(L) - - am_dispatch_template = am_dispatch_template + bf_template - am_dispatch_template_deriv1 = am_dispatch_template_deriv1 + deriv1_template - - -am_dispatch_template = am_dispatch_template + "}}\n" -am_dispatch_template_deriv1 = am_dispatch_template_deriv1 + "}}\n" - -print(am_dispatch_template_deriv1.format("cartesian")) -print(am_dispatch_template_deriv1.format("spherical")) - - -footer = "} // namespace GauXC" -cart_header_file.write(footer) -sphr_header_file.write(footer) - -constant_lines = list(set(constant_lines)) -preamble = """ -#pragma once - -namespace GauXC { -""" - -cons_header_file.write(preamble) -for s in constant_lines: - cons_header_file.write(" " + s + "\n") -cons_header_file.write(footer) diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/scripts/__init__.py b/third_party/gauxc/attic/src/integrator/cuda/collocation/scripts/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/scripts/collocation_angular.py b/third_party/gauxc/attic/src/integrator/cuda/collocation/scripts/collocation_angular.py deleted file mode 100644 index 0903a5d..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/scripts/collocation_angular.py +++ /dev/null @@ -1,259 +0,0 @@ -import cmath -import math -import os -import re -import sys -from math import factorial as fact - -import sympy -from scipy.special import binom as binomial -from sympy import I as symb_I -from sympy import exp as symb_exp -from sympy import factorial as symb_fact -from sympy import factorial2 as symb_fact2 - - -def generate_cartesian_ls(L): - l = [] - for i in range(L + 1): - lx = L - i - for j in range(i + 1): - ly = i - j - lz = L - lx - ly - - l.append([0, 0, 0]) - - for k 
in range(lx - 1): - l[-1][0] = l[-1][0] + 1 - for k in range(ly - 1): - l[-1][1] = l[-1][1] + 1 - for k in range(lz - 1): - l[-1][2] = l[-1][2] + 1 - - if lx > 0: - l[-1][0] = l[-1][0] + 1 - if ly > 0: - l[-1][1] = l[-1][1] + 1 - if lz > 0: - l[-1][2] = l[-1][2] + 1 - - return l - - -def generate_spherical_coeff(l, m, lx, ly, lz): - j = lx + ly - abs(m) - if j % 2 == 0: - j = int(j / 2) - else: - return 0.0 - - prefactor = fact(2.0 * lx) * fact(2.0 * ly) * fact(2.0 * lz) * fact(l) - prefactor = prefactor * fact(l - abs(m)) - prefactor = prefactor / (fact(2.0 * l) * fact(lx) * fact(ly) * fact(lz)) - prefactor = prefactor / fact(l + abs(m)) - prefactor = math.sqrt(prefactor) - - term1 = 0.0 - for i in range(int((l - abs(m)) / 2) + 1): - term1 = term1 + binomial(l, i) * binomial(i, j) * math.pow(-1, i) * fact( - 2 * l - 2 * i - ) / fact(l - abs(m) - 2 * i) - - term1 = term1 / math.pow(2, l) / fact(l) - - m_fact = 1.0 - if m < 0: - m_fact = -1.0 - - term2 = 0.0 + 0.0j - for k in range(j + 1): - z = cmath.exp(m_fact * math.pi / 2.0 * (abs(m) - lx + 2 * k) * 1.0j) - term2 = term2 + binomial(j, k) * binomial(abs(m), lx - 2 * k) * z - - val = prefactor * term1 * term2 - - if abs(val.real) < 1e-10: - val = 0.0 + val.imag * 1j - if abs(val.imag) < 1e-10: - val = val.real - - return val - - -def generate_spherical_coeff_symb(l, m, lx, ly, lz, unnorm=False): - j = lx + ly - abs(m) - if j % 2 == 0: - j = int(j / 2) - else: - return sympy.Integer(0) - - j_symb = sympy.Integer(j) - l_symb = sympy.Integer(l) - m_symb = sympy.Integer(abs(m)) - lx_symb = sympy.Integer(lx) - ly_symb = sympy.Integer(ly) - lz_symb = sympy.Integer(lz) - - prefactor = ( - symb_fact(2 * lx_symb) - * symb_fact(2 * ly_symb) - * symb_fact(2 * lz_symb) - * symb_fact(l_symb) - ) - prefactor = prefactor * symb_fact(l_symb - m_symb) - prefactor = prefactor / ( - symb_fact(2 * l_symb) - * symb_fact(lx_symb) - * symb_fact(ly_symb) - * symb_fact(lz_symb) - ) - prefactor = prefactor / symb_fact(l_symb + m_symb) - - # Ed's stupid normalization convention... 
- if unnorm: - prefactor = ( - prefactor - * symb_fact2(2 * l - 1) - / symb_fact2(2 * lx - 1) - / symb_fact2(2 * ly - 1) - / symb_fact2(2 * lz - 1) - ) - - prefactor = sympy.sqrt(prefactor) - - term1 = sympy.Integer(0) - for i in range(int((l - abs(m)) / 2) + 1): - term1 = term1 + sympy.Integer(binomial(l, i)) * sympy.Integer( - binomial(i, j) - ) * sympy.Integer(math.pow(-1, i)) * symb_fact( - 2 * l_symb - sympy.Integer(2 * i) - ) / symb_fact(l_symb - m_symb - sympy.Integer(2 * i)) - - term1 = term1 / (2**l_symb) / symb_fact(l) - - m_fact_symb = sympy.Integer(1) - if m < 0: - m_fact_symb = -m_fact_symb - - term2 = sympy.Integer(0) - for k in range(j + 1): - z = sympy.exp( - m_fact_symb - * sympy.pi - / 2 - * (m_symb - lx_symb + sympy.Integer(2 * k)) - * symb_I - ) - term2 = ( - term2 - + sympy.Integer(binomial(j, k)) - * sympy.Integer(binomial(abs(m), lx - 2 * k)) - * z - ) - - return prefactor * term1 * term2 - - -def generate_cartesian_angular(ls): - [x, y, z, r] = sympy.symbols("x y z r", real=True) - - ang = [] - - for l in ls: - ang.append(r) - for i in range(l[0]): - ang[-1] = ang[-1] * x - for i in range(l[1]): - ang[-1] = ang[-1] * y - for i in range(l[2]): - ang[-1] = ang[-1] * z - - ang[-1] = ang[-1] / r - - return ang - - -def generate_spherical_angular(L, unnorm=False): - ls = generate_cartesian_ls(L) - angs = generate_cartesian_angular(ls) - - # r = sympy.symbols( 'r' ) - sph_angs = [] - for m in range(L + 1): - tmp_p = 0 - tmp_m = 0 - for i in range(len(ls)): - l = ls[i] - ang = angs[i] - - # c = generate_spherical_coeff( L, m, l[0],l[1],l[2] ) - c = generate_spherical_coeff_symb(L, m, l[0], l[1], l[2], unnorm) - - if m == 0: - tmp_p = tmp_p + c * ang - - else: - c_p = (c + sympy.conjugate(c)) / sympy.sqrt(2) - c_m = (c - sympy.conjugate(c)) / sympy.sqrt(2) / symb_I - - tmp_p = tmp_p + c_p * ang - tmp_m = tmp_m + c_m * ang - - sph_angs.append((m, tmp_p)) - if m > 0: - sph_angs.append((-m, tmp_m)) - - sph_angs = sorted(sph_angs, key=lambda x: x[0]) - - sph_angs_bare = [] - for a in sph_angs: - sph_angs_bare.append(sympy.simplify(a[1])) - - return sph_angs_bare - - -def generate_eval_lines(L, ang): - [x, y, z, r] = sympy.symbols("x y z r", real=True) - [bf, bf_x, bf_y, bf_z] = sympy.symbols("bf bf_x bf_y bf_z", real=True) - - bf_eval_strs = [] - bf_x_eval_strs = [] - bf_y_eval_strs = [] - bf_z_eval_strs = [] - - for j in range(len(ang)): - a = ang[j] - a_x = sympy.diff(a, x) - a_y = sympy.diff(a, y) - a_z = sympy.diff(a, z) - - bf_eval = sympy.simplify(a * bf) - bf_x_eval = sympy.simplify(a_x * bf + a * bf_x) - bf_y_eval = sympy.simplify(a_y * bf + a * bf_y) - bf_z_eval = sympy.simplify(a_z * bf + a * bf_z) - - bf_eval_str = "eval[npts * {}] = {};".format(j, bf_eval) - bf_x_eval_str = "eval_x[npts * {}] = {};".format(j, bf_x_eval) - bf_y_eval_str = "eval_y[npts * {}] = {};".format(j, bf_y_eval) - bf_z_eval_str = "eval_z[npts * {}] = {};".format(j, bf_z_eval) - - if L >= 2: - for k in range(2, L + 1): - for X in ("x", "y", "z"): - pow_str = X + "**" + str(k) - repl_str = "" - for K in range(k - 1): - repl_str = repl_str + X + "*" - repl_str = repl_str + X - - bf_eval_str = bf_eval_str.replace(pow_str, repl_str) - bf_x_eval_str = bf_x_eval_str.replace(pow_str, repl_str) - bf_y_eval_str = bf_y_eval_str.replace(pow_str, repl_str) - bf_z_eval_str = bf_z_eval_str.replace(pow_str, repl_str) - - bf_eval_strs.append(bf_eval_str) - bf_x_eval_strs.append(bf_x_eval_str) - bf_y_eval_strs.append(bf_y_eval_str) - bf_z_eval_strs.append(bf_z_eval_str) - - return (bf_eval_strs, bf_x_eval_strs, 
bf_y_eval_strs, bf_z_eval_strs) diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/scripts/generate_collocation_angular_eval.py b/third_party/gauxc/attic/src/integrator/cuda/collocation/scripts/generate_collocation_angular_eval.py deleted file mode 100644 index af14e58..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/scripts/generate_collocation_angular_eval.py +++ /dev/null @@ -1,163 +0,0 @@ -import math -import os -import re -import sys -from io import StringIO - -import pyexpander.lib as expander -from collocation_angular import ( - generate_cartesian_angular, - generate_cartesian_ls, - generate_eval_lines, - generate_spherical_angular, -) - -L_max = 6 -if len(sys.argv) > 1: - L_max = int(sys.argv[1]) - -# sphr_bf_body = [] -# sphr_bf_d1_body = [] - -sphr_unnorm_bf_body = [] -sphr_unnorm_bf_d1_body = [] - -cart_bf_body = [] -cart_bf_d1_body = [] - - -for L in range(L_max + 1): - print("Processing L = {} ...".format(L)) - # sphr_ang = generate_spherical_angular( L, False ) - sphr_unnorm_ang = generate_spherical_angular(L, True) - cart_ang = generate_cartesian_angular(generate_cartesian_ls(L)) - - # sa, sa_x, sa_y, sa_z = generate_eval_lines( L, sphr_ang ) - sna, sna_x, sna_y, sna_z = generate_eval_lines(L, sphr_unnorm_ang) - ca, ca_x, ca_y, ca_z = generate_eval_lines(L, cart_ang) - - # sphr_bf_body.append( "\n ".join(sa) ) - sphr_unnorm_bf_body.append("\n ".join(sna)) - cart_bf_body.append("\n ".join(ca)) - - # s_d1 = "\n\n ".join(["\n ".join( sa_x ), "\n ".join(sa_y), "\n ".join(sa_z)]) - sn_d1 = "\n\n ".join(["\n ".join(sna_x), "\n ".join(sna_y), "\n ".join(sna_z)]) - c_d1 = "\n\n ".join(["\n ".join(ca_x), "\n ".join(ca_y), "\n ".join(ca_z)]) - - # sphr_bf_d1_body.append( s_d1 ) - sphr_unnorm_bf_d1_body.append(sn_d1) - cart_bf_d1_body.append(c_d1) - - -template_fname = "templates/collocation_angular_template.hpp" - -# sphr_var_dict = { 'L_max' : L_max, 'body' : sphr_bf_body, 'body_d1' : sphr_bf_d1_body, 'name' : 'spherical' } -sphr_unnorm_var_dict = { - "L_max": L_max, - "body": sphr_unnorm_bf_body, - "body_d1": sphr_unnorm_bf_d1_body, - "name": "spherical_unnorm", -} -cart_var_dict = { - "L_max": L_max, - "body": cart_bf_body, - "body_d1": cart_bf_d1_body, - "name": "cartesian", -} - - -old_sys_out = sys.stdout - -sys.stdout = cart_expand = StringIO() -expander.expandFile( - template_fname, external_definitions=cart_var_dict, auto_indent=True -) -# sys.stdout = sphr_expand = StringIO() -# expander.expandFile( template_fname, external_definitions=sphr_var_dict, auto_indent=True ) -sys.stdout = sphr_unnorm_expand = StringIO() -expander.expandFile( - template_fname, external_definitions=sphr_unnorm_var_dict, auto_indent=True -) - -sys.stdout = old_sys_out - -cart_expand = cart_expand.getvalue() -# sphr_expand = sphr_expand.getvalue() -sphr_unnorm_expand = sphr_unnorm_expand.getvalue() - - -# Handle Constants -constant_lines = [] - -# Sqrts -sqrt_regex = "sqrt\([0-9]+\)" -# sqrt_finds = re.findall( sqrt_regex, "\n".join([cart_expand,sphr_expand,sphr_unnorm_expand]) ) -sqrt_finds = re.findall(sqrt_regex, "\n".join([cart_expand, sphr_unnorm_expand])) - -sqrt_finds = list(set(sqrt_finds)) - -for x in sqrt_finds: - arg = x.strip("sqrt(").strip(")") - new_str = "sqrt_" + arg - - cart_expand = cart_expand.replace(x, new_str) - # sphr_expand = sphr_expand.replace( x, new_str ) - sphr_unnorm_expand = sphr_unnorm_expand.replace(x, new_str) - - new_str = "constexpr double " + new_str + " = " + str(math.sqrt(int(arg))) + ";" - constant_lines.append(new_str) - 
-old_sys_out = sys.stdout - -sys.stdout = constant_expand = StringIO() -expander.expandFile( - "templates/collocation_device_constants_template.hpp", - external_definitions={"const_lines": constant_lines}, -) - -sys.stdout = old_sys_out - -constant_expand = constant_expand.getvalue() - - -cart_header_fname = "collocation_angular_cartesian.hpp" -# sphr_header_fname = "collocation_angular_spherical.hpp" -sphr_unnorm_header_fname = "collocation_angular_spherical_unnorm.hpp" -cons_header_fname = "collocation_device_constants.hpp" - -cart_header_file = open(cart_header_fname, "w") -# sphr_header_file = open( sphr_header_fname, 'w' ) -sphr_unnorm_header_file = open(sphr_unnorm_header_fname, "w") -cons_header_file = open(cons_header_fname, "w") - -cart_header_file.write(cart_expand) -# sphr_header_file.write( sphr_expand ) -sphr_unnorm_header_file.write(sphr_unnorm_expand) -cons_header_file.write(constant_expand) - - -# Generate Kernel Driver - -# old_sys_out = sys.stdout - -# sys.stdout = collocation_cartesian_kernel_expand = StringIO() -# expander.expandFile( 'collocation_kernels_template.cu', external_definitions={ 'ang_name' : 'cartesian' } ) -# -# sys.stdout = collocation_spherical_kernel_expand = StringIO() -# expander.expandFile( 'collocation_kernels_template.cu', external_definitions={ 'ang_name' : 'spherical' } ) -# -# sys.stdout = collocation_spherical_unnorm_kernel_expand = StringIO() -# expander.expandFile( 'collocation_kernels_template.cu', external_definitions={ 'ang_name' : 'spherical_unnorm' } ) -# -# sys.stdout = old_sys_out -# -# collocation_cartesian_kernel_expand = collocation_cartesian_kernel_expand.getvalue() -# collocation_spherical_kernel_expand = collocation_spherical_kernel_expand.getvalue() -# collocation_spherical_unnorm_kernel_expand = collocation_spherical_unnorm_kernel_expand.getvalue() -# -# with open( 'collocation_kernels_cartesian.cu', 'w' ) as f: -# f.write( collocation_cartesian_kernel_expand ) -# with open( 'collocation_kernels_spherical.cu', 'w' ) as f: -# f.write( collocation_spherical_kernel_expand ) -# with open( 'collocation_kernels_spherical_unnorm.cu', 'w' ) as f: -# f.write( collocation_spherical_unnorm_kernel_expand ) diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/templates/collocation_angular_template.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation/templates/collocation_angular_template.hpp deleted file mode 100644 index f07b758..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/templates/collocation_angular_template.hpp +++ /dev/null @@ -1,114 +0,0 @@ -#pragma once -#include "collocation_device_constants.hpp" -#include - -#ifndef GPGAUEVAL_INLINE -# define GPGAUEVAL_INLINE __noinline__ -#endif - -namespace GauXC { -namespace integrator { -namespace cuda { - -$for( L in range(L_max + 1) )\ -template -GPGAUEVAL_INLINE __device__ void collocation_$(name)_angular_$(L)( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - $(body[L]) - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_$(name)_angular_$(L)_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - $(body_d1[L]) - -} - -$endfor\ - -template -GPGAUEVAL_INLINE __device__ void collocation_$(name)_angular( - const int32_t npts, - const int32_t l, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { 
- -$for( L in range(L_max + 1) )\ - $if( L == 0 )\ - if( l == $(L) ) { - $else\ - } else if( l == $(L) ) { - $endif - collocation_$(name)_angular_$(L)( npts, bf, x, y, z, eval ); - -$endfor\ - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_$(name)_angular - - -template -GPGAUEVAL_INLINE __device__ void collocation_$(name)_angular_deriv1( - const int32_t npts, - const int32_t l, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - -$for( L in range(L_max + 1) )\ - $if( L == 0 )\ - if( l == $(L) ) { - $else\ - } else if( l == $(L) ) { - $endif - collocation_$(name)_angular_$(L)( npts, bf, x, y, z, eval ); - collocation_$(name)_angular_$(L)_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - -$endfor\ - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_$(name)_angular_deriv1 - - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation/templates/collocation_device_constants_template.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation/templates/collocation_device_constants_template.hpp deleted file mode 100644 index 5245913..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation/templates/collocation_device_constants_template.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -namespace GauXC { -namespace integrator { -namespace cuda { - -$for( x in const_lines )\ - $(x) -$endfor\ - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation_device.cu b/third_party/gauxc/attic/src/integrator/cuda/collocation_device.cu deleted file mode 100644 index 2aa5bc6..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation_device.cu +++ /dev/null @@ -1,382 +0,0 @@ -#include -#include -#include -#include - -#include "cuda/collocation_petite_kernels.hpp" -#include "cuda/collocation_masked_kernels.hpp" -#include "cuda/collocation_petite_combined_kernels.hpp" -#include "cuda/collocation_masked_combined_kernels.hpp" - -#include "cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void eval_collocation_petite( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - cudaStream_t stream -) { - - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - collocation_device_petite_kernel - <<>> - ( nshells, nbf, npts, shells_device, offs_device, - pts_device, eval_device ); - -} - -template -void eval_collocation_petite( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - cudaStream_t stream -); - - - - - - - - - -template -void eval_collocation_masked( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - 
collocation_device_masked_kernel - <<>> - ( nshells, nbf, npts, shells_device, mask_device, - offs_device, pts_device, eval_device ); - -} - -template -void eval_collocation_masked( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - cudaStream_t stream -); - - - - -template -void eval_collocation_petite_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - collocation_device_petite_combined_kernel - <<>> - ( ntasks, device_tasks ); - -} - -template -void eval_collocation_petite_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - - - - - - - - - - - - -template -void eval_collocation_masked_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - collocation_device_masked_combined_kernel - <<>> - ( ntasks, shells_device, device_tasks ); - -} - -template -void eval_collocation_masked_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - - - - - - - - - -template -void eval_collocation_petite_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - cudaStream_t stream -) { - - auto nmax_threads = util::cuda_kernel_max_threads_per_block( - collocation_device_petite_kernel_deriv1 - ); - - dim3 threads(warp_size, nmax_threads/warp_size, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - collocation_device_petite_kernel_deriv1 - <<>> - ( nshells, nbf, npts, shells_device, offs_device, - pts_device, eval_device, deval_device_x, deval_device_y, - deval_device_z ); - -} - -template -void eval_collocation_petite_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z, - cudaStream_t stream -); - - - - - - - - - - - - - - - - -template -void eval_collocation_masked_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - collocation_device_masked_kernel_deriv1 - <<>> - ( nshells, nbf, npts, shells_device, mask_device, offs_device, - pts_device, eval_device, deval_device_x, deval_device_y, - deval_device_z ); - -} - -template -void eval_collocation_masked_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* 
shells_device, - const size_t* mask_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z, - cudaStream_t stream -); - - - - -template -void eval_collocation_petite_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - collocation_device_petite_combined_kernel_deriv1 - <<>> - ( ntasks, device_tasks ); - -} - -template -void eval_collocation_petite_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - - - - - - - - - -template -void eval_collocation_masked_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -) { - - auto nmax_threads = util::cuda_kernel_max_threads_per_block( - collocation_device_masked_combined_kernel_deriv1 - ); - - dim3 threads(warp_size, nmax_threads/warp_size, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - collocation_device_masked_combined_kernel_deriv1 - <<>> - ( ntasks, shells_device, device_tasks ); - -} - -template -void eval_collocation_masked_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - - - - - - - - - - - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation_device.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation_device.hpp deleted file mode 100644 index 9a8957b..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation_device.hpp +++ /dev/null @@ -1,109 +0,0 @@ -#pragma once -#include -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void eval_collocation_petite( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - cudaStream_t stream -); - -template -void eval_collocation_masked( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - cudaStream_t stream -); - -template -void eval_collocation_petite_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - cudaStream_t stream -); - -template -void eval_collocation_masked_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - cudaStream_t stream -); - -template -void eval_collocation_petite_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - -template -void eval_collocation_masked_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - 
XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - -template -void eval_collocation_petite_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - -template -void eval_collocation_masked_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation_masked_combined_kernels.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation_masked_combined_kernels.hpp deleted file mode 100644 index 5393d39..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation_masked_combined_kernels.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#include -#include - -#include -#include - -#include "cuda/collocation/collocation_angular_cartesian.hpp" -#include "cuda/collocation/collocation_angular_spherical_unnorm.hpp" -#include "cuda/cuda_alg_variant_control.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -__global__ -void collocation_device_masked_combined_kernel( - size_t ntasks, - Shell* __restrict__ shells_device, - XCTaskDevice* __restrict__ device_tasks -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( blockIdx.z < ntasks ) { - - auto& task = device_tasks[ blockIdx.z ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ mask_device = task.shell_list; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* __restrict__ eval_device = task.bf; - - if( tid_x < npts and tid_y < nshells ) { - - const uint32_t ipt = tid_x; - const uint32_t ish = tid_y; - const uint32_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - auto tmp = 0.; - for( uint32_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } // shell / point idx check - - } // Batch idx check - -} - - - - - - - - - - - - - - - -template -__global__ -void collocation_device_masked_combined_kernel_deriv1( - size_t ntasks, - Shell* __restrict__ shells_device, - XCTaskDevice* __restrict__ device_tasks -) { - - // DBWY: These are factored into the loop for this optimization - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( blockIdx.z < ntasks ) { - - auto& task = device_tasks[ blockIdx.z ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ mask_device = 
task.shell_list; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* __restrict__ eval_device = task.bf; - auto* __restrict__ deval_device_x = task.dbfx; - auto* __restrict__ deval_device_y = task.dbfy; - auto* __restrict__ deval_device_z = task.dbfz; - - if( tid_y < nshells and tid_x < npts ) { - - const uint32_t ish = tid_y; - const uint32_t ipt = tid_x; - const uint32_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( uint32_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, - tmp_z, xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } // shell / point idx check - } // Batch idx check - - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation_masked_kernels.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation_masked_kernels.hpp deleted file mode 100644 index fcc26e6..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation_masked_kernels.hpp +++ /dev/null @@ -1,155 +0,0 @@ -#include -#include - -#include - -#include "cuda/collocation/collocation_angular_cartesian.hpp" -#include "cuda/collocation/collocation_angular_spherical_unnorm.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - - -template -__global__ -void collocation_device_masked_kernel( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ mask_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts 
+ ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } - -} - - - - - - - - -template -__global__ -void collocation_device_masked_kernel_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ mask_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device, - T* __restrict__ deval_device_x, - T* __restrict__ deval_device_y, - T* __restrict__ deval_device_z -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } - - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation_petite_combined_kernels.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation_petite_combined_kernels.hpp deleted file mode 100644 index 6741586..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation_petite_combined_kernels.hpp +++ /dev/null @@ -1,186 +0,0 @@ -#include -#include - -#include -#include - -#include "cuda/collocation/collocation_angular_cartesian.hpp" -#include "cuda/collocation/collocation_angular_spherical_unnorm.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -__global__ -void collocation_device_petite_combined_kernel( - size_t ntasks, - XCTaskDevice* __restrict__ device_tasks -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - const int batch_id = blockIdx.z; - - if( batch_id < ntasks ) { - - auto& task = device_tasks[ batch_id ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ shells_device = task.shells; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ offs_device = 
task.shell_offs; - - auto* eval_device = task.bf; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } // shell / point idx check - - } // Batch idx check - -} - - - - - - - - - - - - - - - -template -__global__ -void collocation_device_petite_combined_kernel_deriv1( - size_t ntasks, - XCTaskDevice* __restrict__ device_tasks -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - const int batch_id = blockIdx.z; - - if( batch_id < ntasks ) { - - auto& task = device_tasks[ batch_id ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ shells_device = task.shells; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* __restrict__ eval_device = task.bf; - auto* __restrict__ deval_device_x = task.dbfx; - auto* __restrict__ deval_device_y = task.dbfy; - auto* __restrict__ deval_device_z = task.dbfz; - - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. 
* a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, - tmp_z, xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } // shell / point idx check - - } // Batch idx check - - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/integrator/cuda/collocation_petite_kernels.hpp b/third_party/gauxc/attic/src/integrator/cuda/collocation_petite_kernels.hpp deleted file mode 100644 index 63a2c03..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/collocation_petite_kernels.hpp +++ /dev/null @@ -1,160 +0,0 @@ -#include -#include - -#include - -#include "cuda/collocation/collocation_angular_cartesian.hpp" -#include "cuda/collocation/collocation_angular_spherical_unnorm.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - - - -template -__global__ -void collocation_device_petite_kernel( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } - -} - - - - - - - - - - - - - - - -template -__global__ -void collocation_device_petite_kernel_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device, - T* __restrict__ deval_device_x, - T* __restrict__ deval_device_y, - T* __restrict__ deval_device_z -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* 
__restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } - - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/integrator/cuda/cublas_extensions.cu b/third_party/gauxc/attic/src/integrator/cuda/cublas_extensions.cu deleted file mode 100644 index 3858de2..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cublas_extensions.cu +++ /dev/null @@ -1,153 +0,0 @@ -#include "cuda/cublas_extensions.hpp" -#include -#include -#include - -#include "cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace cuda { -namespace blas { - -using namespace GauXC::cuda; - -template -__global__ void increment_kernel( const T* X, T* Y ) { - const auto tid = blockIdx.x; - if( tid < 1 ) (*Y) += (*X); -} - -template -void increment( const T* X, T* Y, cudaStream_t stream ) { - increment_kernel<<<1,1,0,stream>>>(X,Y); -} - -template <> -void dot( cublasHandle_t handle, - int N, - const double* X, - int INCX, - const double* Y, - int INCY, - double* RES ) { - - auto stat = cublasDdot( handle, N, X, INCX, Y, INCY, RES ); - GAUXC_CUBLAS_ERROR("CUBLAS DDOT FAILED", stat ); - -} - -template -void gdot( cublasHandle_t handle, - int N, - const T* X, - int INCX, - const T* Y, - int INCY, - T* SCR, - T* RES ) { - - dot( handle, N, X, INCX, Y, INCY, SCR ); - auto stream = util::get_stream(handle); - increment( SCR, RES, stream ); - -} - -template -void gdot( cublasHandle_t handle, - int N, - const double* X, - int INCX, - const double* Y, - int INCY, - double* SCR, - double* RES ); - - - - - - - - - - -template -void __global__ hadamard_product_kernel( int M, - int N, - const T* A, - int LDA, - T* B, - int LDB ) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < M and tid_y < N ) { - B[ tid_x + tid_y*LDB ] *= A[ tid_x + tid_y*LDA ]; - } - -} - - - -template -void hadamard_product( cublasHandle_t handle, - int M, - int N, - const T* A, - int LDA, - T* B, - int LDB ) { - - auto stream = util::get_stream(handle); - dim3 threads(warp_size, max_warps_per_thread_block); - dim3 blocks( util::div_ceil( M, threads.x ), - util::div_ceil( N, threads.y ) ); - - hadamard_product_kernel<<< blocks, threads, 0, stream >>>( M, N, A, LDA, B, LDB ); - -} - -template -void hadamard_product( cublasHandle_t handle, - int M, - int N, - const double* A, - int LDA, - double* B, - int LDB ); - - - - -template <> -void gemm( cublasHandle_t handle, - cublasOperation_t TA, cublasOperation_t TB, - 
int M, int N, int K, double ALPHA, - const double* A, int LDA, const double* B, int LDB, - double BETA, double* C, int LDC ) { - - auto stat = cublasDgemm( handle, TA, TB, M, N, K, &ALPHA, A, LDA, - B, LDB, &BETA, C, LDC ); - GAUXC_CUBLAS_ERROR("CUBLAS DGEMM FAILED", stat); - -} - - -template <> -void syr2k( cublasHandle_t handle, - cublasFillMode_t UPLO, cublasOperation_t Trans, - int M, int K, double ALPHA, - const double* A, int LDA, const double* B, int LDB, - double BETA, double* C, int LDC ) { - - auto stat = cublasDsyr2k( handle, UPLO, Trans, M, K, &ALPHA, A, LDA, B, LDB, - &BETA, C, LDC ); - GAUXC_CUBLAS_ERROR("CUBLAS DSYR2K FAILED", stat); - -} - -} -} -} - diff --git a/third_party/gauxc/attic/src/integrator/cuda/cublas_extensions.hpp b/third_party/gauxc/attic/src/integrator/cuda/cublas_extensions.hpp deleted file mode 100644 index 81af06d..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cublas_extensions.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace cuda { -namespace blas { - -template -void dot( cublasHandle_t handle, - int N, - const T* X, - int INCX, - const T* Y, - int INCY, - T* RES ); - -template -void gdot( cublasHandle_t handle, - int N, - const T* X, - int INCX, - const T* Y, - int INCY, - T* SCR, - T* RES ); - - -template -void hadamard_product( cublasHandle_t handle, - int M, - int N, - const T* A, - int LDA, - T* B, - int LDB ); - - -template -void gemm( cublasHandle_t handle, - cublasOperation_t TA, cublasOperation_t TB, - int M, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, - T BETA, T* C, int LDC ); - -template -void syr2k( cublasHandle_t handle, - cublasFillMode_t UPLO, cublasOperation_t Trans, - int M, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, - T BETA, T* C, int LDC ); -} -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_alg_variant_control.hpp b/third_party/gauxc/attic/src/integrator/cuda/cuda_alg_variant_control.hpp deleted file mode 100644 index e0d1f9b..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_alg_variant_control.hpp +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once - -//#define GAUXC_CUDA_ENABLE_COLLOCATION_SHMEM_COPY -//#define GAUXC_CUDA_ENABLE_COMPACT_COLLOCATION diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_device_properties.cxx b/third_party/gauxc/attic/src/integrator/cuda/cuda_device_properties.cxx deleted file mode 100644 index 2172a15..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_device_properties.cxx +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include - -#include "cuda_runtime.h" - -#include "cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace cuda { - - -uint32_t get_submat_cut_block(int32_t LDA, int32_t device) { - int l2_cache_size; - cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device); - - int l2_block_size = (int) sqrt(0.75 * ((double) l2_cache_size / 8)); - int min_block_size = LDA / max_submat_blocks; - - int block_size = std::max(l2_block_size, min_block_size); - block_size = std::min(block_size, LDA); - - return block_size; -} - -uint32_t get_device_sm_count(int32_t device) { - int num_sm; - cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, device); - - return num_sm; -} - -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_device_properties.hpp b/third_party/gauxc/attic/src/integrator/cuda/cuda_device_properties.hpp deleted file mode 100644 index 0b80a00..0000000 --- 
a/third_party/gauxc/attic/src/integrator/cuda/cuda_device_properties.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace cuda { - -static constexpr uint32_t warp_size = 32; -static constexpr uint32_t max_threads_per_thread_block = 1024; -static constexpr uint32_t max_warps_per_thread_block = - max_threads_per_thread_block / warp_size; - -static constexpr uint32_t max_submat_blocks = 10; - -// Properties for weight algorithm -static constexpr uint32_t weight_unroll = 4; -static_assert(weight_unroll == 4, "Weight unroll is only tested for value of 4"); -static constexpr uint32_t weight_thread_block = 640; -static constexpr uint32_t weight_thread_block_per_sm = 2; - -uint32_t get_submat_cut_block(int32_t LDA, int32_t device); -uint32_t get_device_sm_count(int32_t device); -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_driver_replicated_density_incore.cxx b/third_party/gauxc/attic/src/integrator/cuda/cuda_driver_replicated_density_incore.cxx deleted file mode 100644 index 937f3fc..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_driver_replicated_density_incore.cxx +++ /dev/null @@ -1,460 +0,0 @@ -#include -#include -#include - -#include "cuda/cuda_weights.hpp" -#include "cuda/collocation_device.hpp" -#include "cuda/cuda_pack_density.hpp" -#include "cuda/cuda_inc_potential.hpp" -#include "cuda/cuda_eval_denvars.hpp" -#include "cuda/cuda_zmat.hpp" -#include "integrator_common.hpp" - -#include "cuda/cublas_extensions.hpp" - -namespace GauXC { -namespace integrator::cuda { - -using namespace GauXC::cuda::blas; - - -template -using cuda_task_iterator = typename std::vector>::iterator; - -template -void process_batches_cuda_replicated_density_incore( - XCWeightAlg weight_alg, - const functional_type& func, - XCCudaData& cuda_data, - cuda_task_iterator task_begin, - cuda_task_iterator task_end -) { - - const auto ntasks = std::distance( task_begin, task_end ); - const auto nbf = cuda_data.nbf; - - // Get batch statistics for batches to process - auto nbe_comparator = - []( const auto& a, const auto& b ){ return a.nbe < b.nbe; }; - auto npts_comparator = - []( const auto& a, const auto& b ){ return a.npts < b.npts; }; - auto nshells_comparator = - []( const auto& a, const auto& b ){ return a.nshells < b.nshells; }; - - auto [min_nbe_it, max_nbe_it] = - std::minmax_element( task_begin, task_end, nbe_comparator ); - auto [min_npts_it, max_npts_it] = - std::minmax_element( task_begin, task_end, npts_comparator ); - auto [min_nshells_it, max_nshells_it] = - std::minmax_element( task_begin, task_end, nshells_comparator ); - - const auto min_nbe = min_nbe_it->nbe; - const auto max_nbe = max_nbe_it->nbe; - const auto min_npts = min_npts_it->npts; - const auto max_npts = max_npts_it->npts; - const auto min_nshells = min_nshells_it->nshells; - const auto max_nshells = max_nshells_it->nshells; - - util::unused( min_nbe, min_npts, min_nshells ); - - const size_t total_npts = - std::accumulate( task_begin, task_end, 0ul, - []( const auto& a, const auto& b ) { return a + b.npts; } ); - - - // Aliases - cudaStream_t master_stream = *cuda_data.master_stream; - cublasHandle_t master_handle = *cuda_data.master_handle; - -#ifdef GAUXC_ENABLE_MAGMA - magma_queue_t master_queue = *cuda_data.master_magma_queue; -#endif - - auto* dmat_device = cuda_data.dmat_device; - - auto* shells_device = cuda_data.shells_device; - auto* tasks_device = cuda_data.device_tasks; - auto* dmat_array_device = cuda_data.dmat_array_device; - auto* zmat_array_device = 
cuda_data.zmat_array_device; - auto* bf_array_device = cuda_data.bf_array_device; - auto* weights_device = cuda_data.weights_device_buffer; - auto* dist_scratch_device = cuda_data.dist_scratch_device; - - auto* den_eval_device = cuda_data.den_eval_device; - auto* dden_x_eval_device = cuda_data.den_x_eval_device; - auto* dden_y_eval_device = cuda_data.den_y_eval_device; - auto* dden_z_eval_device = cuda_data.den_z_eval_device; - - auto* eps_eval_device = cuda_data.eps_eval_device; - auto* gamma_eval_device = cuda_data.gamma_eval_device; - auto* vrho_eval_device = cuda_data.vrho_eval_device; - auto* vgamma_eval_device = cuda_data.vgamma_eval_device; - - - auto* exc_device = cuda_data.exc_device; - auto* vxc_device = cuda_data.vxc_device; - auto* nel_device = cuda_data.nel_device; - auto* acc_scr_device = cuda_data.acc_scr_device; - - auto* m_array_device = cuda_data.m_array_device; - auto* n_array_device = cuda_data.n_array_device; - auto* k_array_device = cuda_data.k_array_device; - auto* lda_array_device = cuda_data.lda_array_device; - auto* ldb_array_device = cuda_data.ldb_array_device; - auto* ldc_array_device = cuda_data.ldc_array_device; - - - const auto* rab_device = cuda_data.rab_device; - const auto* coords_device = cuda_data.coords_device; - const auto* points_device = cuda_data.points_device_buffer; - const auto* iparent_device = cuda_data.iparent_device_buffer; - const auto* dist_nearest_device = cuda_data.dist_nearest_buffer; - - - - - // Evaluate Partition Weights - partition_weights_cuda_SoA( weight_alg, total_npts, cuda_data.LDatoms, cuda_data.natoms, - points_device, iparent_device, dist_nearest_device, - rab_device, coords_device, weights_device, - dist_scratch_device, master_stream ); - - - // Evaluate Collocation - if constexpr ( n_deriv == 1 ) - eval_collocation_masked_combined_deriv1( ntasks, max_npts, max_nshells, - shells_device, tasks_device, - master_stream ); - else - eval_collocation_masked_combined( ntasks, max_npts, max_nshells, shells_device, - tasks_device, master_stream ); - - // Pack Density Submatrices - task_pack_density_matrix( ntasks, tasks_device, dmat_device, nbf, master_stream ); - - - // Form Z = P * X - if( cuda_data.batch_l3_blas ) { - -#ifdef GAUXC_ENABLE_MAGMA - - magmablas_dgemm_vbatched( MagmaNoTrans, MagmaNoTrans, - m_array_device, n_array_device, k_array_device, - 1., bf_array_device, ldb_array_device, - dmat_array_device, lda_array_device, - 0., zmat_array_device, ldc_array_device, - ntasks, master_queue ); - -#else - - throw std::runtime_error("BATCHED BLAS API NOT SUPPORTED"); - -#endif - - } else { - - int nstream = cuda_data.blas_streams.size(); - - // Wait for collocation etc - util::cuda_event master_event; - master_event.record( master_stream ); - for( int iS = 0; iS < nstream; ++iS ) - cuda_data.blas_streams[iS].wait( master_event ); - - // Do GEMM in round-robin - for( auto iT = 0; iT < ntasks; ++iT ) { - auto& task = *(task_begin + iT); - gemm( cuda_data.blas_handles[iT % nstream], CUBLAS_OP_N, CUBLAS_OP_N, - task.npts, task.nbe, task.nbe, 1., task.bf, task.npts, - task.nbe_scr, task.nbe, 0., task.zmat, task.npts ); - } - - // Record completion of BLAS ops - std::vector< util::cuda_event > blas_events( nstream ); - for( int iS = 0; iS < nstream; ++iS ) - blas_events[iS].record( cuda_data.blas_streams[iS] ); - - // Wait on master stream for all BLAS ops to complete - for( int iS = 0; iS < nstream; ++iS ) - cuda_data.master_stream->wait( blas_events[iS] ); - - } - - - - // Zero UVars - util::cuda_set_zero_async( total_npts, 
den_eval_device, master_stream, "DenZero" ); - if( func.is_gga() ) { - util::cuda_set_zero_async( total_npts, dden_x_eval_device, master_stream, - "DenXZero" ); - util::cuda_set_zero_async( total_npts, dden_y_eval_device, master_stream, - "DenYZero" ); - util::cuda_set_zero_async( total_npts, dden_z_eval_device, master_stream, - "DenZZero" ); - } - - // Evaluate UVars - if( func.is_gga() ) { - eval_uvars_gga_device( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - eval_vvars_gga_device( total_npts, dden_x_eval_device, dden_y_eval_device, - dden_z_eval_device, gamma_eval_device, master_stream ); - } else { - eval_uvars_lda_device( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - } - - // Evaluate XC Functional - if( func.is_gga() ) - func.eval_exc_vxc_device( total_npts, den_eval_device, gamma_eval_device, - eps_eval_device, vrho_eval_device, - vgamma_eval_device, master_stream ); - else - func.eval_exc_vxc_device( total_npts, den_eval_device, eps_eval_device, - vrho_eval_device, master_stream ); - - - // Factor weights into XC output - hadamard_product( master_handle, total_npts, 1, weights_device, 1, - eps_eval_device, 1 ); - hadamard_product( master_handle, total_npts, 1, weights_device, 1, - vrho_eval_device, 1 ); - if( func.is_gga() ) - hadamard_product( master_handle, total_npts, 1, weights_device, 1, - vgamma_eval_device, 1 ); - - // Accumulate EXC / NEL - gdot( master_handle, total_npts, weights_device, 1, - den_eval_device, 1, acc_scr_device, nel_device ); - gdot( master_handle, total_npts, eps_eval_device, 1, - den_eval_device, 1, acc_scr_device, exc_device ); - - // Evaluate Z Matrix - if( func.is_gga() ) - zmat_gga_cuda( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - else - zmat_lda_cuda( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - - - - // Accumulate packed VXC = X * Z**T + Z * X**T - - - if( cuda_data.batch_l3_blas ) { - -#ifdef GAUXC_ENABLE_MAGMA - - // XXX: Only updates LT - magmablas_dsyr2k_vbatched( MagmaLower, MagmaTrans, - n_array_device, m_array_device, - 1., bf_array_device, ldb_array_device, - zmat_array_device, ldc_array_device, - 0., dmat_array_device, lda_array_device, - ntasks, master_queue ); - -#else - - throw std::runtime_error("BATCHED BLAS API NOT SUPPORTED"); - -#endif - } else { - - int nstream = cuda_data.blas_streams.size(); - - // Wait for zmat, etc - util::cuda_event master_event; - master_event.record( master_stream ); - for( int iS = 0; iS < nstream; ++iS ) - cuda_data.blas_streams[iS].wait( master_event ); - - // Do SYR2K in round-robin - for( auto iT = 0; iT < ntasks; ++iT ) { - auto& task = *(task_begin + iT); - syr2k( cuda_data.blas_handles[iT % nstream], CUBLAS_FILL_MODE_LOWER, - CUBLAS_OP_T, task.nbe, task.npts, 1., task.bf, task.npts, - task.zmat, task.npts, 0., task.nbe_scr, task.nbe ); - } - - // Record completion of BLAS ops - std::vector< util::cuda_event > blas_events( nstream ); - for( int iS = 0; iS < nstream; ++iS ) - blas_events[iS].record( cuda_data.blas_streams[iS] ); - - // Wait on master stream for all BLAS ops to complete - for( int iS = 0; iS < nstream; ++iS ) - cuda_data.master_stream->wait( blas_events[iS] ); - } - - // Increment global VXC - task_inc_potential( ntasks, tasks_device, vxc_device, nbf, master_stream ); - - - // Synchronize on master stream - // XXX: There's no lifetime issues in this driver, should look into - // avoid this sync to allow for overlap with the host packing - cudaStreamSynchronize( master_stream ); - -} - - -template -void 
process_batches_cuda_replicated_density_incore_p( - XCWeightAlg weight_alg, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCCudaData & cuda_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* EXC, - F* NEL -) { - - auto task_comparator = []( const XCTask& a, const XCTask& b ) { - return (a.points.size() * a.nbe) > (b.points.size() * b.nbe); - }; - std::sort( local_work_begin, local_work_end, task_comparator ); - - - const auto nbf = basis.nbf(); - const auto natoms = meta.natoms(); - const auto LDatoms = cuda_data.LDatoms; - - // Send static data to the device - - // Density - util::cuda_copy( nbf * nbf, cuda_data.dmat_device, P, "P H2D" ); - - // Shells: TODO avoid host copy? - std::vector> shells( basis ); - util::cuda_copy( shells.size(), cuda_data.shells_device, shells.data(), - "Shells H2D" ); - - // RAB - util::cuda_copy_2d( cuda_data.rab_device, LDatoms * sizeof(F), - meta.rab().data(), natoms * sizeof(F), - natoms * sizeof(F), natoms, "RAB H2D"); - // This could probably happen on the host - cuda_reciprocal(natoms * LDatoms, cuda_data.rab_device, 0); - - // Atomic coordinates - std::vector coords( 3*natoms ); - for( auto i = 0ul; i < natoms; ++i ) { - coords[ 3*i + 0 ] = mol[i].x; - coords[ 3*i + 1 ] = mol[i].y; - coords[ 3*i + 2 ] = mol[i].z; - } - util::cuda_copy( 3 * natoms, cuda_data.coords_device, coords.data(), - "Coords H2D" ); - - - // Zero out XC quantities - util::cuda_set_zero( nbf * nbf, cuda_data.vxc_device, "VXC Zero" ); - util::cuda_set_zero( 1 , cuda_data.exc_device, "EXC Zero" ); - util::cuda_set_zero( 1 , cuda_data.nel_device, "NEL Zero" ); - - - - // Processes batches in groups that saturadate available device memory - auto task_it = local_work_begin; - while( task_it != local_work_end ) { - - // Determine next task batch, send relevant data to device - auto [it, tasks_device] = - cuda_data.generate_buffers( basis, task_it, local_work_end ); - - - // Process the batches - process_batches_cuda_replicated_density_incore( - weight_alg, func, cuda_data, tasks_device.begin(), tasks_device.end() - ); - - task_it = it; - - } - - symmetrize_matrix(nbf, nbf, cuda_data.vxc_device, *cuda_data.master_stream); - cudaStreamSynchronize( *cuda_data.master_stream ); -} - -#ifdef GAUXC_ENABLE_NCCL -template -void device_allreduce( - ncclComm_t nccl_comm, - XCCudaData & cuda_data -) { - cudaStream_t master_stream = *cuda_data.master_stream; - const auto nbf = cuda_data.nbf; - - ncclAllReduce((const void*)cuda_data.vxc_device, - (void*) cuda_data.vxc_device, - nbf * nbf, ncclDouble, ncclSum, nccl_comm, master_stream); - - ncclAllReduce((const void*)cuda_data.exc_device, - (void*) cuda_data.exc_device, - 1, ncclDouble, ncclSum, nccl_comm, master_stream); - - ncclAllReduce((const void*)cuda_data.nel_device, - (void*) cuda_data.nel_device, - 1, ncclDouble, ncclSum, nccl_comm, master_stream); - - cudaStreamSynchronize(master_stream); -} -#endif - -template -void device_transfer( - XCCudaData & cuda_data, - F* VXC, - F* EXC, - F* NEL -) { - const auto nbf = cuda_data.nbf; - - // Receive XC terms from host - if( not cuda_data.vxcinc_host ) - util::cuda_copy( nbf * nbf, VXC, cuda_data.vxc_device, "VXC D2H" ); - util::cuda_copy( 1, EXC, cuda_data.exc_device, "EXC D2H" ); - util::cuda_copy( 1, NEL, cuda_data.nel_device, "NEL D2H" ); - -} - - -#define CUDA_IMPL( F, ND ) \ -template \ -void process_batches_cuda_replicated_density_incore_p(\ - XCWeightAlg weight_alg,\ - const 
functional_type& func,\ - const BasisSet& basis,\ - const Molecule & mol,\ - const MolMeta & meta,\ - XCCudaData & cuda_data,\ - host_task_iterator local_work_begin,\ - host_task_iterator local_work_end,\ - const F* P,\ - F* VXC,\ - F* exc,\ - F* n_el\ -) - -CUDA_IMPL( double, 0 ); -CUDA_IMPL( double, 1 ); - -#ifdef GAUXC_ENABLE_NCCL -template void device_allreduce( - ncclComm_t nccl_comm, - XCCudaData& cuda_data -); -#endif - -template void device_transfer( - XCCudaData& cuda_data, - double* VXC, - double* EXC, - double* NEL -); - -} -} - diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_driver_replicated_density_shellbatched.cxx b/third_party/gauxc/attic/src/integrator/cuda/cuda_driver_replicated_density_shellbatched.cxx deleted file mode 100644 index 6a56fa2..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_driver_replicated_density_shellbatched.cxx +++ /dev/null @@ -1,587 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "cuda/cuda_weights.hpp" -#include "cuda/collocation_device.hpp" -#include "cuda/cuda_pack_density.hpp" -#include "cuda/cuda_inc_potential.hpp" -#include "cuda/cuda_eval_denvars.hpp" -#include "cuda/cuda_zmat.hpp" -#include "integrator_common.hpp" - -#include "cuda/cublas_extensions.hpp" - -#include "host/util.hpp" - -namespace GauXC { -namespace integrator::cuda { - -using namespace GauXC::cuda::blas; - -auto ranges_from_list( const std::vector& shell_list ) { - - std::vector< std::pair > ranges; - ranges.emplace_back( shell_list.front(), shell_list.back() ); - - for( auto it = shell_list.begin(); it != shell_list.end()-1; ++it ) { - if( *(it+1) - *it != 1 ) { - ranges.back().second = *it; - ranges.emplace_back( *(it+1), shell_list.back() ); - } - } - - return ranges; - -} - - -// Checks if B is a subset of A -template -inline auto list_subset( const C1& A, const C2& B ) { - return std::includes( A.begin(), A.end(), B.begin(), B.end() ); -} - -template -inline auto integral_list_intersect( const std::vector& A, - const std::vector& B ) { - - - constexpr size_t sz_ratio = 100; - const size_t A_sz = A.size(); - const size_t B_sz = B.size(); - - const auto A_begin = A.begin(); - const auto A_end = A.end(); - const auto B_begin = B.begin(); - const auto B_end = B.end(); - - // Fall through if query list is much larger than max list - if( A_sz * sz_ratio < B_sz ) { - for( const auto& val : A ) { - if( std::binary_search( B_begin, B_end, val ) ) - return true; - } - return false; - } - - // Fall through if max list is much larger than query list - if( B_sz * sz_ratio < A_sz ) { - for( const auto& val : B ) { - if( std::binary_search( A_begin, A_end, val ) ) - return true; - } - return false; - } - - // Default if lists are about the same size - auto B_it = B_begin; - auto A_it = A_begin; - - while( B_it != B_end and A_it != A_end ) { - - if( *B_it < *A_it ) { - B_it = std::lower_bound( B_it, B_end, *A_it ); - continue; - } - - if( *A_it < *B_it ) { - A_it = std::lower_bound( A_it, A_end, *B_it ); - continue; - } - - return true; - - } - - return false; - - -} - - - - - - -template -inline auto integral_list_intersect( const std::vector& A, - const std::vector& B, - const uint32_t overlap_threshold_spec ) { - - const uint32_t max_intersect_sz = std::min(A.size(), B.size()); - const uint32_t overlap_threshold = std::min( max_intersect_sz, - overlap_threshold_spec ); - - constexpr size_t sz_ratio = 100; - const size_t A_sz = A.size(); - const size_t B_sz = B.size(); - - const auto A_begin = A.begin(); - const auto A_end 
= A.end(); - const auto B_begin = B.begin(); - const auto B_end = B.end(); - - uint32_t overlap_count = 0; - - // Fall through if query list is much larger than max list - if( A_sz * sz_ratio < B_sz ) { - - for( const auto& val : A ) { - overlap_count += !!std::binary_search( B_begin, B_end, val ); - if( overlap_count == overlap_threshold ) return true; - } - return false; - - } - - // Fall through if max list is much larger than query list - if( B_sz * sz_ratio < A_sz ) { - for( const auto& val : B ) { - overlap_count += !!std::binary_search( A_begin, A_end, val ); - if( overlap_count == overlap_threshold ) return true; - } - return false; - } - - // Default if lists are about the same size - auto B_it = B_begin; - auto A_it = A_begin; - - while( B_it != B_end and A_it != A_end ) { - - if( *B_it < *A_it ) { - B_it = std::lower_bound( B_it, B_end, *A_it ); - continue; - } - - if( *A_it < *B_it ) { - A_it = std::lower_bound( A_it, A_end, *B_it ); - continue; - } - - // *A_it == *B_it if code reaches here - overlap_count++; - A_it++; B_it++; // Increment iterators - if( overlap_count == overlap_threshold) return true; - - } - - return false; - - -} - - - -struct dev_ex_task { - host_task_iterator task_begin; - host_task_iterator task_end; - std::vector shell_list; -}; - - - - -dev_ex_task generate_dev_batch( const uint32_t nbf_threshold, - host_task_iterator task_begin, - host_task_iterator local_work_end, - const BasisSet& basis, - util::Timer& timer ) { - - - auto nbe_comparator = []( const auto& task_a, const auto& task_b ) { - return task_a.nbe < task_b.nbe; - }; - - // Find task with largest NBE - auto max_task = timer.time_op_accumulate("XCIntegrator.MaxTask", [&]() { - return std::max_element( task_begin, local_work_end, nbe_comparator ); - } ); - - const auto max_shell_list = max_task->shell_list; // copy for reset - - // Init uniion shell list to max shell list outside of loop - std::set union_shell_set(max_shell_list.begin(), - max_shell_list.end()); - - - - size_t n_overlap_pthresh = 20; - double overlap_pthresh_delta = 1. 
/ n_overlap_pthresh; - std::vector overlap_pthresh; - for( int i = 1; i < n_overlap_pthresh; ++i ) - overlap_pthresh.emplace_back( i*overlap_pthresh_delta ); - - std::vector overlap_pthresh_idx( overlap_pthresh.size() ); - std::iota( overlap_pthresh_idx.begin(), overlap_pthresh_idx.end(), 0 ); - - std::map> - cached_task_ends; - - int cur_partition_pthresh_idx = -1; - - auto _it = std::partition_point( overlap_pthresh_idx.rbegin(), - overlap_pthresh_idx.rend(), - [&](int idx) { - - uint32_t overlap_threshold = - std::max(1., max_shell_list.size() * overlap_pthresh[idx] ); - - - host_task_iterator search_st = task_begin; - host_task_iterator search_en = local_work_end; - - // Make a local copy of union list - std::set local_union_shell_set; - - // Attempt to limit task search based on current partition - if( cur_partition_pthresh_idx >= 0 ) { - - const auto& last_pthresh = - cached_task_ends.at(cur_partition_pthresh_idx); - - if( cur_partition_pthresh_idx > idx ) { - search_st = last_pthresh.first; - local_union_shell_set = last_pthresh.second; - } else { - search_en = last_pthresh.first; - local_union_shell_set = union_shell_set; - } - - } else { - local_union_shell_set = union_shell_set; - } - - - // Partition tasks into those which overlap max_task up to - // specified threshold - auto task_end = - timer.time_op_accumulate("XCIntegrator.TaskIntersection", [&]() { - return std::partition( search_st, search_en, [&](const auto& t) { - return integral_list_intersect( max_shell_list, t.shell_list, - overlap_threshold ); - } ); - } ); - - - - // Take union of shell list for all overlapping tasks - timer.time_op_accumulate("XCIntegrator.ShellListUnion",[&]() { - for( auto task_it = search_st; task_it != task_end; ++task_it ) { - local_union_shell_set.insert( task_it->shell_list.begin(), - task_it->shell_list.end() ); - } - } ); - - auto cur_nbe = basis.nbf_subset( local_union_shell_set.begin(), - local_union_shell_set.end() ); - - //std::cout << " Threshold % = " << std::setw(5) << overlap_pthresh[idx] << ", "; - //std::cout << " Overlap Threshold = " << std::setw(8) << overlap_threshold << ", "; - //std::cout << " Current NBE = " << std::setw(8) << cur_nbe << std::endl; - - // Cache the data - cached_task_ends[idx] = std::make_pair( task_end, local_union_shell_set ); - - // Update partitioned threshold - cur_partition_pthresh_idx = idx; - - return cur_nbe < nbf_threshold; - - } ); - - host_task_iterator task_end; - auto _idx_partition = (_it == overlap_pthresh_idx.rend()) ? 
0 : *_it; - std::tie( task_end, union_shell_set ) = cached_task_ends.at(_idx_partition); - - - - - - //std::cout << "FOUND " << std::distance( task_begin, task_end ) - // << " OVERLAPPING TASKS" << std::endl; - - - std::vector union_shell_list( union_shell_set.begin(), - union_shell_set.end() ); - - // Try to add additional tasks given current union list - task_end = timer.time_op_accumulate("XCIntegrator.SubtaskGeneration", [&]() { - return std::partition( task_end, local_work_end, [&]( const auto& t ) { - return list_subset( union_shell_list, t.shell_list ); - } ); - } ); - - //std::cout << "FOUND " << std::distance( task_begin, task_end ) - // << " SUBTASKS" << std::endl; - - - dev_ex_task ex_task; - ex_task.task_begin = task_begin; - ex_task.task_end = task_end; - ex_task.shell_list = std::move( union_shell_list ); - - return ex_task; - -} - -template -void device_execute_shellbatched( - util::Timer& timer, - XCWeightAlg weight_alg, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCCudaData & cuda_data, - const F* P, - F* VXC, - F* EXC, - F* NEL, - const dev_ex_task& ex_task_obj -) { - - // Alias information - auto task_begin = ex_task_obj.task_begin; - auto task_end = ex_task_obj.task_end; - auto& union_shell_list = ex_task_obj.shell_list; - - const auto natoms = mol.natoms(); - - // Extract subbasis - BasisSet basis_subset; basis_subset.reserve(union_shell_list.size()); - timer.time_op_accumulate("XCIntegrator.CopySubBasis",[&]() { - for( auto i : union_shell_list ) { - basis_subset.emplace_back( basis.at(i) ); - } - basis_subset.generate_shell_to_ao(); - }); - - const size_t nshells = basis_subset.size(); - const size_t nbe = basis_subset.nbf(); - std::cout << "TASK_UNION HAS:" << std::endl - << " NSHELLS = " << nshells << std::endl - << " NBE = " << nbe << std::endl; - - // Recalculate shell_list based on subbasis - timer.time_op_accumulate("XCIntegrator.RecalcShellList",[&]() { - for( auto _it = task_begin; _it != task_end; ++_it ) { - auto union_list_idx = 0; - auto& cur_shell_list = _it->shell_list; - for( auto j = 0; j < cur_shell_list.size(); ++j ) { - while( union_shell_list[union_list_idx] != cur_shell_list[j] ) - union_list_idx++; - cur_shell_list[j] = union_list_idx; - } - } - } ); - - - - // Allocate host temporaries - std::vector P_submat_host(nbe*nbe), VXC_submat_host(nbe*nbe); - F EXC_tmp, NEL_tmp; - F* P_submat = P_submat_host.data(); - F* VXC_submat = VXC_submat_host.data(); - - // Extract subdensity - auto [union_submat_cut, foo] = - integrator::gen_compressed_submat_map( basis, union_shell_list, - basis.nbf(), basis.nbf() ); - - timer.time_op_accumulate("XCIntegrator.ExtractSubDensity",[&]() { - detail::submat_set( basis.nbf(), basis.nbf(), nbe, nbe, P, basis.nbf(), - P_submat, nbe, union_submat_cut ); - } ); - - - // Allocate static quantities on device stack - cuda_data.allocate_static_data( natoms, n_deriv, nbe, nshells ); - - - // Process batches on device with subobjects - process_batches_cuda_replicated_density_incore_p( - weight_alg, func, basis_subset, mol, meta, cuda_data, - task_begin, task_end, P_submat, VXC_submat, &EXC_tmp, &NEL_tmp - ); - - // Update full quantities - *EXC += EXC_tmp; - *NEL += NEL_tmp; - timer.time_op_accumulate("XCIntegrator.IncrementSubPotential",[&]() { - detail::inc_by_submat( basis.nbf(), basis.nbf(), nbe, nbe, VXC, basis.nbf(), - VXC_submat, nbe, union_submat_cut ); - }); - - - // Reset shell_list to be wrt full basis - 
timer.time_op_accumulate("XCIntegrator.ResetShellList",[&]() { - for( auto _it = task_begin; _it != task_end; ++_it ) - for( auto j = 0; j < _it->shell_list.size(); ++j ) { - _it->shell_list[j] = union_shell_list[_it->shell_list[j]]; - } - }); - -} - - - - - -template -void process_batches_cuda_replicated_density_shellbatched_p( - util::Timer& timer, - XCWeightAlg weight_alg, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCCudaData & cuda_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* EXC, - F* NEL -) { - - const uint32_t nbf_threshold = 8000; - std::cout << "IN SHELL BATCHED\n" << std::flush; - std::cout << "TOTAL NTASKS = " << std::distance( local_work_begin, local_work_end ) << std:: endl; - std::cout << "TOTAL NBF = " << basis.nbf() << std::endl; - std::cout << "NBF THRESH = " << nbf_threshold << std::endl; - - - // Zero out final results - timer.time_op( "XCIntegrator.ZeroHost", [&]() { - *EXC = 0.; - *NEL = 0.; - std::memset( VXC, 0, basis.nbf()*basis.nbf()*sizeof(F) ); - }); - -#if 0 - size_t nbf = basis.nbf(); - size_t nshells = basis.nshells(); - size_t natoms = mol.size(); - - // Allocate static quantities on device stack - cuda_data.allocate_static_data( natoms, n_deriv, nbf, nshells ); - - process_batches_cuda_replicated_density_incore_p( - weight_alg, func, basis, mol, meta, cuda_data, - local_work_begin, local_work_end, P, VXC, EXC, NEL - ); -#else - - auto nbe_comparator = []( const auto& task_a, const auto& task_b ) { - return task_a.nbe < task_b.nbe; - }; - - - size_t batch_iter = 0; - auto task_begin = local_work_begin; - - const size_t natoms = mol.size(); - - //std::future device_ex; - - std::cout << "MASTER THREAD ID = " << std::this_thread::get_id() << std::endl; - std::queue< dev_ex_task > dev_tasks; - - auto execute_device_task = [&] () { - - if( dev_tasks.empty() ) return; - - std::cout << "Executing device tasks on thread " << std::this_thread::get_id() << std::endl; - - dev_ex_task batch_task = std::move( dev_tasks.front() ); // Move task to local scope - dev_tasks.pop(); // Remove from queue - - // Execute task - timer.time_op_accumulate( "XCIntegrator.DeviceWork", [&]() { - device_execute_shellbatched( timer, weight_alg, func, basis, mol, - meta, cuda_data, P, VXC, EXC, NEL, - batch_task ); - }); - - - }; - - std::future dev_future; - while( task_begin != local_work_end ) { - - // Generate task - dev_tasks.emplace( generate_dev_batch( nbf_threshold, task_begin, - local_work_end, basis, timer ) ); - - if( not dev_future.valid() ) { - dev_future = std::async( std::launch::async, execute_device_task ); - } else { - auto status = dev_future.wait_for( std::chrono::milliseconds(5) ); - if( status == std::future_status::ready ) { - dev_future.get(); - dev_future = std::async( std::launch::async, execute_device_task ); - } - } - - // Update task iterator for next set of batches - task_begin = dev_tasks.back().task_end; - - } - - - if( dev_future.valid() ) dev_future.wait(); - - // TODO: Try to merge tasks if possible - //for( auto _task_it = dev_tasks.begin(); _task_it != dev_tasks.end()-1; ++_task_it ) { - // const auto& shell_list = _task_it->union_shell_list; - // auto task_nbe = basis.nbf_subset( shell_list.begin(), shell_list.end() ); - // auto _merge_it = _task_it + 1; - // while( task_nbe <= nbf_threshold and _merge_it != dev_tasks.end() ) { - // _merge_it = std::find_if( _merge_it, dev_tasks.end(), [&]( const auto& t ) { - // const auto& 
local_shell_list - // } ); - // } - //} - - while( not dev_tasks.empty() ) { - // Execute remaining tasks - execute_device_task(); - } - - - -#endif - -} - - -#define CUDA_IMPL( F, ND ) \ -template \ -void process_batches_cuda_replicated_density_shellbatched_p(\ - util::Timer& timer,\ - XCWeightAlg weight_alg,\ - const functional_type& func,\ - const BasisSet& basis,\ - const Molecule & mol,\ - const MolMeta & meta,\ - XCCudaData & cuda_data,\ - host_task_iterator local_work_begin,\ - host_task_iterator local_work_end,\ - const F* P,\ - F* VXC,\ - F* exc,\ - F* n_el\ -) - -CUDA_IMPL( double, 0 ); -CUDA_IMPL( double, 1 ); - -} -} - diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_eval_denvars.cu b/third_party/gauxc/attic/src/integrator/cuda/cuda_eval_denvars.cu deleted file mode 100644 index 0f1cd2f..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_eval_denvars.cu +++ /dev/null @@ -1,254 +0,0 @@ -#include "cuda/cuda_eval_denvars.hpp" -#include "cuda/cuda_extensions.hpp" -#include - -#include "cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -__global__ void eval_uvars_lda_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.nbe; - - auto* den_eval_device = task.den; - - const auto* basis_eval_device = task.bf; - - const auto* den_basis_prod_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - register double den_reg = 0.; - - if( tid_x < nbf and tid_y < npts ) { - - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; - - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; - - } - - // Warp blocks are stored col major - den_reg = 2 * warpReduceSum( den_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - } - - -} - - - -#define GGA_KERNEL_SM_BLOCK_Y 32 - -template -__global__ void eval_uvars_gga_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.nbe; - - auto* den_eval_device = task.den; - auto* den_x_eval_device = task.ddenx; - auto* den_y_eval_device = task.ddeny; - auto* den_z_eval_device = task.ddenz; - - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - - const auto* den_basis_prod_device = task.zmat; - - __shared__ double den_shared[4][warp_size][GGA_KERNEL_SM_BLOCK_Y+1]; - - for ( int bid_x = blockIdx.x * blockDim.x; - bid_x < nbf; - bid_x += blockDim.x * gridDim.x ) { - - for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y; - bid_y < npts; - bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) { - - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { - den_shared[0][threadIdx.x][sm_y] = 0.; - den_shared[1][threadIdx.x][sm_y] = 0.; - den_shared[2][threadIdx.x][sm_y] = 0.; - den_shared[3][threadIdx.x][sm_y] = 0.; - - if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { - const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; - const double* bf_col = 
basis_eval_device + (bid_x + sm_y)*npts; - const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; - const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; - const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; - - den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - } - } - __syncthreads(); - - - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { - const int tid_y = bid_y + sm_y; - register double den_reg = den_shared[0][sm_y][threadIdx.x]; - register double dx_reg = den_shared[1][sm_y][threadIdx.x]; - register double dy_reg = den_shared[2][sm_y][threadIdx.x]; - register double dz_reg = den_shared[3][sm_y][threadIdx.x]; - - // Warp blocks are stored col major - den_reg = 2 * warpReduceSum( den_reg ); - dx_reg = 4 * warpReduceSum( dx_reg ); - dy_reg = 4 * warpReduceSum( dy_reg ); - dz_reg = 4 * warpReduceSum( dz_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - atomicAdd( den_x_eval_device + tid_y, dx_reg ); - atomicAdd( den_y_eval_device + tid_y, dy_reg ); - atomicAdd( den_z_eval_device + tid_y, dz_reg ); - } - } - __syncthreads(); - } - } -} - - -template -__global__ void eval_vvars_gga_kernel( - size_t npts, - const T* den_x_eval_device, - const T* den_y_eval_device, - const T* den_z_eval_device, - T* gamma_eval_device -) { - - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - if( tid < npts ) { - - const double dx = den_x_eval_device[ tid ]; - const double dy = den_y_eval_device[ tid ]; - const double dz = den_z_eval_device[ tid ]; - - gamma_eval_device[tid] = dx*dx + dy*dy + dz*dz; - - } - -} - - -template -void eval_uvars_lda_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( max_nbf , threads.x ), - util::div_ceil( max_npts , threads.y ), - ntasks ); - - eval_uvars_lda_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - -} - -template -void eval_uvars_gga_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ) { - - dim3 threads( warp_size, max_warps_per_thread_block / 2, 1 ); - dim3 blocks( std::min(int64_t(4), util::div_ceil( max_nbf, 4 )), - std::min(int64_t(16), util::div_ceil( max_nbf, 16 )), - ntasks ); - - eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - -} - - -template -void eval_vvars_gga_device( size_t npts, - const T* den_x_device, - const T* den_y_device, - const T* den_z_device, - T* gamma_device, - cudaStream_t stream ) { - - dim3 threads( max_threads_per_thread_block ); - dim3 blocks( util::div_ceil( npts, threads.x ) ); - - eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>( - npts, den_x_device, den_y_device, den_z_device, gamma_device - ); - -} - - - - - - - - - - - - - - - -template -void eval_uvars_lda_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -template -void eval_uvars_gga_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - 
XCTaskDevice* tasks_device, - cudaStream_t stream ); - -template -void eval_vvars_gga_device( size_t npts, - const double* den_x_device, - const double* den_y_device, - const double* den_z_device, - double* gamma_device, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_eval_denvars.hpp b/third_party/gauxc/attic/src/integrator/cuda/cuda_eval_denvars.hpp deleted file mode 100644 index e08874f..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_eval_denvars.hpp +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void eval_uvars_lda_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -template -void eval_uvars_gga_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - - -template -void eval_vvars_gga_device( size_t npts, - const T* den_x_device, - const T* den_y_device, - const T* den_z_device, - T* gamma_device, - cudaStream_t stream ); - - -} -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_extensions.hpp b/third_party/gauxc/attic/src/integrator/cuda/cuda_extensions.hpp deleted file mode 100644 index 987c620..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_extensions.hpp +++ /dev/null @@ -1,109 +0,0 @@ -#pragma once -#include -#include -#include "cuda/cuda_device_properties.hpp" - -#define GAUXC_ENABLE_WARP_REDUCTIONS - -namespace GauXC { -namespace cuda { - -__inline__ __device__ -double warpReduceSum(double val) { - -#ifdef GAUXC_ENABLE_WARP_REDUCTIONS - - for(int i=(warp_size/2); i>=1; i/=2) - val += __shfl_xor_sync(0xffffffff, val, i, warp_size); - -#else - - using warp_reducer = cub::WarpReduce; - static __shared__ typename warp_reducer::TempStorage temp_storage[max_warps_per_thread_block]; - int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - int warp_lane = tid / warp_size; - val = warp_reducer( temp_storage[warp_lane] ).Sum( val ); - -#endif - - return val; -} - -__inline__ __device__ -double warpReduceProd(double val) { - for(int i=(warp_size/2); i>=1; i/=2) - val *= __shfl_xor_sync(0xffffffff, val, i, warp_size); - return val; -} - -#if 0 -__inline__ __device__ -double blockReduceSum( double val ) { - - static __shared__ double shared[32]; - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; - - val = warpReduceSum( val ); - - if( lane == 0 ) shared[wid] = val; - - __syncthreads(); - - val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0; - if( wid == 0 ) val = warpReduceSum( val ); - - return val; - -} - -template -__inline__ __device__ T warp_prod_reduce( T val ) { - - for( int i = warp_size / 2; i >= 1; i /= 2 ) - val *= __shfl_xor_sync( 0xffffffff, val, i, warp_size ); - - return val; - -} - -template -__inline__ __device__ T block_prod_reduce( T val ) { - - static __shared__ T shared[32]; - const int lane = threadIdx.x % 32; - const int wid = threadIdx.x / 32; - - val = warp_prod_reduce( val ); - - if( lane == 0 ) shared[ wid ] = val; - __syncthreads(); - - val = ( threadIdx.x < blockDim.x / 32 ) ? 
shared[ lane ] : 0; - if( wid == 0 ) val = warp_prod_reduce( val ); - - return val; - -} - -__inline__ __device__ double atomicMul(double* address, double val) -{ - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val * - __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_inc_potential.cu b/third_party/gauxc/attic/src/integrator/cuda/cuda_inc_potential.cu deleted file mode 100644 index b6b1d7c..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_inc_potential.cu +++ /dev/null @@ -1,167 +0,0 @@ -#include "cuda/cuda_inc_potential.hpp" -#include "cuda/cuda_device_properties.hpp" -#include - -#include "cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - - -#define WARP_X 16 -#define WARP_Y 1 -#define UNROLL_FACTOR 4 -#define EFF_UNROLL 4 -#define CUT_X 8 -#define CUT_Y 8 - - -template -__global__ __launch_bounds__(1024, 1) -void inc_by_submat_combined_kernel( size_t ntasks, - XCTaskDevice* device_tasks, - T* A, - size_t LDA, - const int block_y, - const int block_x ) { - - const int batch_id = blockIdx.z; - auto& task = device_tasks[ batch_id ]; - - const auto* submat_cut_device = task.submat_cut; - const auto* submat_block_device = task.submat_block; - const auto LDAS = task.nbe; - auto* ASmall_device = task.nbe_scr; - - //if( LDAS == LDAB ) return; - const int tid_xx = threadIdx.x % WARP_X; - const int tid_xy = threadIdx.x / WARP_X; - - const int tid_yx = threadIdx.y % CUT_X; - const int tid_yy = threadIdx.y / CUT_X; - - const int start_cut_y = submat_block_device[block_y]; - const int end_cut_y = submat_block_device[block_y+1]; - const int start_cut_x = submat_block_device[block_x]; - const int end_cut_x = submat_block_device[block_x+1]; - - for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) { - const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut)); - const int i_cut_first = i_data.x; - const int delta_i = i_data.y; - const int i_cut_small = i_data.z; - - for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) { - const int3 j_data = *((int3*)(submat_cut_device + 3*j_cut)); - const int j_cut_first = j_data.x; - const int delta_j = j_data.y; - const int j_cut_small = j_data.z; - - auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS; - auto* ABig_begin = A + i_cut_first + j_cut_first*LDA; - - int J; - for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - - double val[UNROLL_FACTOR]; - double* address[UNROLL_FACTOR]; -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - val[k] = ASmall_begin[I + (J+k*WARP_Y)*LDAS]; - address[k] = ABig_begin + I + (J+k*WARP_Y)*LDA; - } -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - atomicAdd(address[k], val[k] ); - } - } - } - - for ( ; J < delta_j; J += WARP_Y) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - atomicAdd(ABig_begin + I + J*LDA, ASmall_begin[I + J*LDAS] ); - } - } - - } - } -} - - -template -void task_inc_potential( size_t ntasks, - XCTaskDevice* device_tasks, - T* V_device, - size_t LDV, - cudaStream_t 
stream ) { - dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks); - - const int submat_block_size = get_submat_cut_block(LDV, 0); - for (int i = 0; i < util::div_ceil(LDV, submat_block_size); i++) { - for (int j = 0; j < util::div_ceil(LDV, submat_block_size); j++) { - inc_by_submat_combined_kernel<<< blocks, threads, 0, stream >>>( - ntasks, device_tasks, V_device, LDV, i, j - ); - } - } -} - -template -void task_inc_potential( size_t ntasks, - XCTaskDevice* device_tasks, - double* V_device, - size_t LDV, - cudaStream_t stream ); - -template -__global__ void symmetrize_matrix_device( size_t nbf, size_t LDA, T* A ) { - const size_t block_size = warp_size; - - __shared__ T buffer[block_size][block_size+1]; // Pad shared memory to resolve shared memory - - const size_t num_blocks = ((nbf + block_size - 1) / block_size); - - for (int i = blockIdx.x; i < num_blocks; i += gridDim.x) { - // TODO This could be load balanced if need be - const int i_coord = i * block_size; - for (int j = i; j < num_blocks; j++) { - const int j_coord = j * block_size; - - // Read in block to buffer - // TODO These could be vector reads/writes if this becomes significant - if (i_coord + threadIdx.y < nbf && j_coord + threadIdx.x < nbf) { - buffer[threadIdx.y][threadIdx.x] = A[(i_coord + threadIdx.y) * LDA + j_coord + threadIdx.x]; - } - __syncthreads(); - - // Write buffer - if (j_coord + threadIdx.y < nbf && i_coord + threadIdx.x < nbf) { - if ((j_coord != i_coord || threadIdx.x < threadIdx.y)) { // handles the diagonal block - A[(j_coord + threadIdx.y) * LDA + i_coord + threadIdx.x] = buffer[threadIdx.x][threadIdx.y]; - } - } - __syncthreads(); - } - } -} - -template -void symmetrize_matrix( size_t nbf, size_t LDV, T* V_device, cudaStream_t stream) { - const size_t num_blocks = ((LDV + warp_size - 1) / warp_size); - // Warp size must equal max_warps_per_thread_block must equal 32 - dim3 threads(warp_size, max_warps_per_thread_block), blocks(num_blocks); - symmetrize_matrix_device<<>>(nbf, LDV, V_device); -} - -template -void symmetrize_matrix( size_t nbf, size_t LDV, double* V_device, cudaStream_t stream ); - - -} -} -} - diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_inc_potential.hpp b/third_party/gauxc/attic/src/integrator/cuda/cuda_inc_potential.hpp deleted file mode 100644 index 16070cb..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_inc_potential.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void task_inc_potential( size_t ntasks, - XCTaskDevice* device_tasks, - T* V_device, - size_t LDV, - cudaStream_t stream ); - -template -void symmetrize_matrix( size_t nbf, - size_t LDV, - T* V_device, - cudaStream_t stream); - -} -} -} - diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_pack_density.cu b/third_party/gauxc/attic/src/integrator/cuda/cuda_pack_density.cu deleted file mode 100644 index 941a04a..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_pack_density.cu +++ /dev/null @@ -1,127 +0,0 @@ -#include "cuda/cuda_pack_density.hpp" -#include "cuda/cuda_device_properties.hpp" -#include - -#include "cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -#define WARP_X 16 -#define WARP_Y 1 -#define UNROLL_FACTOR 4 -#define EFF_UNROLL 4 -#define CUT_X 8 -#define CUT_Y 8 - -template -__global__ __launch_bounds__(1024, 1) -void 
submat_set_combined_kernel( size_t ntasks, - XCTaskDevice* device_tasks, - T* A, - size_t LDA, - const int block_y, - const int block_x) { - - - const int batch_id = blockIdx.z; - auto& task = device_tasks[ batch_id ]; - - const auto* submat_cut_device = task.submat_cut; - const auto* submat_block_device = task.submat_block; - const auto LDAS = task.nbe; - auto* ASmall_device = task.nbe_scr; - - //if( LDAS == LDAB ) return; - - const int tid_xx = threadIdx.x % WARP_X; - const int tid_xy = threadIdx.x / WARP_X; - - const int tid_yx = threadIdx.y % CUT_X; - const int tid_yy = threadIdx.y / CUT_X; - - const int start_cut_y = submat_block_device[block_y]; - const int end_cut_y = submat_block_device[block_y+1]; - const int start_cut_x = submat_block_device[block_x]; - const int end_cut_x = submat_block_device[block_x+1]; - - for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) { - const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut)); - const int i_cut_first = i_data.x; - const int delta_i = i_data.y; - const int i_cut_small = i_data.z; - - for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) { - const int3 j_data = *((int3*)(submat_cut_device + 3*j_cut)); - const int j_cut_first = j_data.x; - const int delta_j = j_data.y; - const int j_cut_small = j_data.z; - - auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS; - auto* ABig_begin = A + i_cut_first + j_cut_first*LDA; - - int J; - for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - - double val[UNROLL_FACTOR]; - double* address[UNROLL_FACTOR]; -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - val[k] = ABig_begin[I + (J + k*WARP_Y)*LDA]; - address[k] = ASmall_begin + I + (J + k*WARP_Y) * LDAS; - } -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - // Suggest that the result be evicted first. 
-#if (CUDART_VERSION >= 11000) - __stcs(address[k], val[k]); -#else - asm ("st.global.cs.f64 [%0], %1;" :: "l"(address[k]), "d"(val[k])); -#endif - } - } - } - - for ( ; J < delta_j; J += WARP_Y) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - ASmall_begin[I + J*LDAS] = ABig_begin[I + J*LDA]; - } - } - } - } -} - - -template -void task_pack_density_matrix( size_t ntasks, - XCTaskDevice* device_tasks, - T* P_device, - size_t LDP, - cudaStream_t stream ) { - - dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks); - - const int submat_block_size = get_submat_cut_block(LDP, 0); - for (int i = 0; i < util::div_ceil(LDP, submat_block_size); i++) { - for (int j = 0; j < util::div_ceil(LDP, submat_block_size); j++) { - submat_set_combined_kernel<<< blocks, threads, 0, stream >>>( - ntasks, device_tasks, P_device, LDP, i, j - ); - } - } -} - -template -void task_pack_density_matrix( size_t ntasks, - XCTaskDevice* device_tasks, - double* P_device, - size_t LDP, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_pack_density.hpp b/third_party/gauxc/attic/src/integrator/cuda/cuda_pack_density.hpp deleted file mode 100644 index ae90ef3..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_pack_density.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void task_pack_density_matrix( size_t ntasks, - XCTaskDevice* device_tasks, - T* P_device, - size_t LDP, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_weights.cu b/third_party/gauxc/attic/src/integrator/cuda/cuda_weights.cu deleted file mode 100644 index 3013324..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_weights.cu +++ /dev/null @@ -1,641 +0,0 @@ -#include - -#include "cuda/cuda_weights.hpp" -#include "integrator_constants.hpp" -#include "cuda/cuda_extensions.hpp" -#include "cuda/cuda_device_properties.hpp" - -constexpr double eps_d = std::numeric_limits::epsilon(); - - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -__global__ void reciprocal_kernel(size_t length, double* vec) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < length; i += blockDim.x * gridDim.x) { - vec[i] = 1. 
/ vec[i]; - } -} - -__global__ void compute_point_center_dist( - size_t npts, - size_t LDatoms, - size_t natoms, - const double* coords, - const double* points, - double* dist -) { - - __shared__ double3 point_buffer[warp_size]; - register double3 coord_reg; - - const int natoms_block = (natoms + warp_size-1) / warp_size; - const int coords_block = (npts + warp_size-1) / warp_size; - - const double3* coords_vec = (double3*) coords; - const double3* points_vec = (double3*) points; - - for (int j = blockIdx.x; j < natoms_block; j += gridDim.x) { - const int iAtom = j * warp_size + threadIdx.x; - // Load blocks into registers/shared memory - if (iAtom < natoms) { - coord_reg = coords_vec[iAtom]; - } - for (int i = blockIdx.y; i < coords_block; i += gridDim.y) { - const int iPt_load = i * warp_size + threadIdx.x; - if (iPt_load < npts) { - point_buffer[threadIdx.x] = points_vec[iPt_load]; - } - __syncthreads(); - - // do the computation - #pragma unroll 2 - for (int k = threadIdx.y; k < warp_size; k += blockDim.y) { - const int iPt_sm = k; - const int iPt = i * warp_size + iPt_sm; - const double rx = point_buffer[iPt_sm].x - coord_reg.x; - const double ry = point_buffer[iPt_sm].y - coord_reg.y; - const double rz = point_buffer[iPt_sm].z - coord_reg.z; - - if (iAtom < natoms and iPt < npts) { - dist[ iAtom + iPt * LDatoms ] = std::sqrt( rx*rx + ry*ry + rz*rz ); - } - } - __syncthreads(); - } - } -} - -#if 0 -__global__ void modify_weights_becke_kernel( - size_t npts, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - double* weights_device -) { - - // Becke partition functions - auto hBecke = [](double x) {return 1.5 * x - 0.5 * x * x * x;}; // Eq. 19 - auto gBecke = [&](double x) {return hBecke(hBecke(hBecke(x)));}; // Eq. 20 f_3 - - - __shared__ double shared[2048]; - for( int ipt = blockIdx.x; ipt < npts; ipt += gridDim.x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * natoms; - for( int iCenter = threadIdx.y; iCenter < natoms; iCenter += blockDim.y ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * natoms; - - double ps = 1.; - for( int jCenter = threadIdx.x; jCenter < natoms; jCenter += blockDim.x ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - const double s = 0.5 * ( 1. - gBecke( mu ) ); - - ps *= (iCenter == jCenter) ? 1. 
: s ; - - } - - ps = warp_prod_reduce( ps ); // XXX: Assumes blockDim.x == 32 - - if( iCenter == iParent ) parent_weight = ps; - - sum += ps; - - } - - // XXX: Assumes blockDim.x == blockDim.y == 32 - if( threadIdx.x == 0 ) { - shared[ threadIdx.y ] = sum; - shared[ threadIdx.y + 1024] = parent_weight; - } - - __syncthreads(); - sum = shared[ threadIdx.x ]; - sum = warpReduceSum( sum ); - - __syncthreads(); - parent_weight = shared[ threadIdx.x + 1024]; - parent_weight = __shfl_sync(0xffffffff, parent_weight, iParent % 32, 32 ); - - if( threadIdx.x == 0 and threadIdx.y == 0 ) - weights_device[ipt] *= parent_weight / sum; - - - } - - -} - - - -__global__ void modify_weights_ssf_kernel( - size_t npts, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - const double* dist_nearest_device, - double* weights_device -) { - - // Frisch partition functions - auto gFrisch = [](double x) { - - const double s_x = x / magic_ssf_factor<>; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; - }; - - auto sFrisch = [&] (double x) { - const double g = 0.5 * (1. - gFrisch(x)); - return (x >= magic_ssf_factor<>) ? 0. : (x <= -magic_ssf_factor<>) ? 1. : g; - }; - - constexpr double weight_tol = 1e-10; - - __shared__ double shared[2048]; - for( int ipt = blockIdx.x; ipt < npts; ipt += gridDim.x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * natoms; - const double dist_cutoff = 0.5 * (1 - magic_ssf_factor<> ) * - dist_nearest_device[ipt]; - if( local_dist_scratch[iParent] < dist_cutoff ) continue; - - for( int iCenter = threadIdx.y; iCenter < natoms; iCenter += blockDim.y ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * natoms; - - double ps = 1.; - for( int jCenter = threadIdx.x; jCenter < natoms; jCenter += blockDim.x ) - if( fabs(ps) > weight_tol ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - const double s = sFrisch( mu ); - ps *= (iCenter == jCenter) ? 1. 
: s ; - - } - - ps = warp_prod_reduce( ps ); // XXX: Assumes blockDim.x == 32 - - if( iCenter == iParent ) parent_weight = ps; - - sum += ps; - - } - - // XXX: Assumes blockDim.x == blockDim.y == 32 - if( threadIdx.x == 0 ) { - shared[ threadIdx.y ] = sum; - shared[ threadIdx.y + 1024] = parent_weight; - } - - __syncthreads(); - sum = shared[ threadIdx.x ]; - sum = warpReduceSum( sum ); - - __syncthreads(); - parent_weight = shared[ threadIdx.x + 1024]; - parent_weight = __shfl_sync(0xffffffff, parent_weight, iParent % 32, 32 ); - - if( threadIdx.x == 0 and threadIdx.y == 0 ) - weights_device[ipt] *= parent_weight / sum; - - - } - - -} -#endif - -// SIMT over points: 1D kernel -__global__ void modify_weights_ssf_kernel_1d( - size_t npts, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - const double* dist_nearest_device, - double* weights_device -) { - - // Frisch partition functions - auto gFrisch = [](double x) { - - const double s_x = x / magic_ssf_factor<>; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; - }; - -#if 0 - auto sFrisch = [&] (double x) { - const double g = 0.5 * (1. - gFrisch(x)); - return (x >= magic_ssf_factor<>) ? 0. : (x <= -magic_ssf_factor<>) ? 1. : g; - }; -#else - auto sFrisch = [&] (double x) { - if( fabs(x) < magic_ssf_factor<> ) return 0.5 * (1. - gFrisch(x)); - else if( x >= magic_ssf_factor<> ) return 0.; - else return 1.; - }; -#endif - - constexpr double weight_tol = 1e-10; - - const int tid_x = threadIdx.x + blockIdx.x * blockDim.x; - const int nt_x = blockDim.x * gridDim.x; - - //__shared__ double shared[2048]; - for( int ipt = tid_x; ipt < npts; ipt += nt_x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * natoms; - const double dist_cutoff = 0.5 * (1 - magic_ssf_factor<> ) * - dist_nearest_device[ipt]; - if( local_dist_scratch[iParent] < dist_cutoff ) continue; - -#if 0 - for( int iCenter = 0; iCenter < natoms; iCenter++ ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * natoms; - - double ps = 1.; - for( int jCenter = 0; jCenter < natoms; jCenter++ ) - if( fabs(ps) > weight_tol ) { - if( iCenter != jCenter ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - ps *= sFrisch( mu ); - - } - } else break; - - //__syncwarp(); - - if( iCenter == iParent ) parent_weight = ps; - - sum += ps; - - } -#else - - // Do iParent First - { - - const double ri = local_dist_scratch[ iParent ]; - const double* const local_rab = RAB + iParent * natoms; - - parent_weight = 1.; - for( int jCenter = 0; jCenter < natoms; jCenter++ ) - if( parent_weight > weight_tol ) { - if( iParent != jCenter ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - parent_weight *= sFrisch( mu ); - - } - } else break; - - //__syncwarp(); - sum += parent_weight; - - } - - if( parent_weight < eps_d ) { - weights_device[ipt] = 0.; - continue; - } - - for( int iCenter = 0; iCenter < natoms; iCenter++ ) - if( iParent != iCenter ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * 
natoms; - - double ps = 1.; - for( int jCenter = 0; jCenter < natoms; jCenter++ ) - if( ps > weight_tol ) { - if( iCenter != jCenter ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - ps *= sFrisch( mu ); - - } - } else break; - - //__syncwarp(); - sum += ps; - - } - -#endif - - weights_device[ipt] *= parent_weight / sum; - - - } - - -} - -__device__ __inline__ double gFrisch(double x) { - // Frisch partition functions -// const double s_x = x / magic_ssf_factor<>; - const double s_x = x * 1.5625; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return ((35.) *(s_x - s_x3) + (21.) *s_x5 - (5.) *s_x7); -} - - -__device__ __inline__ double sFrisch(double x) { - //double frisch_val = (0.5 - (0.5/ 16.0) * gFrisch(x)); - - if( fabs(x) < magic_ssf_factor<> ) return (0.5 - (0.5/ 16.0) * gFrisch(x)); - else if( x >= magic_ssf_factor<> ) return 0.; - else return 1.; -} - -__global__ __launch_bounds__(weight_thread_block, weight_thread_block_per_sm) -void modify_weights_ssf_kernel_2d( - size_t npts, - size_t LDatoms, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - const double* dist_nearest_device, - double* weights_device -) { - constexpr double weight_tol = 1e-10; - int natom_block = ((natoms + blockDim.x - 1) / blockDim.x) * blockDim.x; - - const int tid_x = threadIdx.y + blockIdx.y * blockDim.y; - const int nt_x = blockDim.y * gridDim.y; - - __shared__ int jCounter_sm[max_warps_per_thread_block]; - int* jCounter = reinterpret_cast(jCounter_sm) + threadIdx.y; - - // Each warp will work together on a point - for( int ipt = tid_x; ipt < npts; ipt += nt_x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * LDatoms; - const double dist_cutoff = 0.5 * (1 - magic_ssf_factor<> ) * - dist_nearest_device[ipt]; - if( local_dist_scratch[iParent] < dist_cutoff ) continue; - - // Do iParent First - { - - const double ri = local_dist_scratch[ iParent ]; - const double* const local_rab = RAB + iParent * LDatoms; - - parent_weight = 1.; - for( int jCenter = threadIdx.x; jCenter < natom_block; jCenter+=blockDim.x ) { - double contribution = 1.0; - if (jCenter < natoms && iParent != jCenter) { - const double rj = local_dist_scratch[ jCenter ]; - const double mu = (ri - rj) * local_rab[ jCenter ]; // XXX: RAB is symmetric - contribution = sFrisch( mu ); - } - contribution = warpReduceProd(contribution); - parent_weight *= contribution; - - if (parent_weight < weight_tol) break; - } - } - - if( parent_weight < eps_d ) { - if (threadIdx.x == 0) - weights_device[ipt] = 0.; - __syncwarp(); - continue; - } - - // Initialize each counter to 0 - if (threadIdx.x == 0) { - jCounter[0] = 0; - } - __syncwarp(); - - // Each thread will process an iCenter. Atomic operations are used to assign - // an iCenter value to each thread. 
- int iCenter = atomicAdd(jCounter, 1); - if (iCenter >= iParent) iCenter++; // iCenter == iParent is skipped - - // The entire warp processes the same jCenter value at the same time - int jCenter = 0; - - const double* local_rab = RAB + iCenter * LDatoms; - double ri = local_dist_scratch[ iCenter ]; - double ps = 1.; - int iCount = 0; - int cont = (iCenter < natoms); - - // We will continue iterating until all of the threads have cont set to 0 - while (__any_sync(0xffffffff, cont)) { - if (cont) { - double2 rj[weight_unroll/2]; - double2 rab_val[weight_unroll/2]; - double mu[weight_unroll]; - iCount += weight_unroll; - - #pragma unroll - for (int k = 0; k < weight_unroll/2; k++) { - rj[k] = *((double2*)(local_dist_scratch + jCenter) + k); - rab_val[k] = *((double2*)(local_rab + jCenter) + k); - } - - #pragma unroll - for (int k = 0; k < weight_unroll/2; k++) { - mu[2*k+0] = (ri - rj[k].x) * rab_val[k].x; // XXX: RAB is symmetric - mu[2*k+1] = (ri - rj[k].y) * rab_val[k].y; - } - - #pragma unroll - for (int k = 0; k < weight_unroll; k++) { - if((iCenter != jCenter + k) && (jCenter + k < natoms)) { - mu[k] = sFrisch( mu[k] ); - ps *= mu[k]; - } - } - - // A thread is done with a iCenter based on 2 conditions. Weight tolerance - // Or if it has seen all of the jCenters - if( !(ps > weight_tol && iCount < LDatoms )) { - // In the case were the thread is done, it begins processing another iCenter - sum += ps; - iCenter = atomicAdd(jCounter, 1); - if (iCenter >= iParent) iCenter++; - - // If there are no more iCenters left to process, it signals it is ready to exit - cont = (iCenter < natoms); - ri = local_dist_scratch[ iCenter ]; - local_rab = RAB + iCenter * LDatoms; - ps = 1.; - iCount = 0; - } - } - // Wraps jCenter around. This was faster than modulo - jCenter += weight_unroll; - jCenter = (jCenter < LDatoms) ? jCenter : 0; - } - - // All of the threads then sum their contributions. Only thread 0 needs to add the parent - // contribution. 
- __syncwarp(); - sum = warpReduceSum(sum); - if (threadIdx.x == 0) { - sum += parent_weight; - weights_device[ipt] *= parent_weight / sum; - } - - __syncwarp(); - - } -} - - -void cuda_reciprocal(size_t length, double* vec, cudaStream_t stream) { - dim3 threads(max_threads_per_thread_block); - dim3 blocks( get_device_sm_count(0) ); - reciprocal_kernel<<>>(length, vec); -} - - -template -void partition_weights_cuda_SoA( XCWeightAlg weight_alg, - size_t npts, - size_t LDatoms, - size_t natoms, - const F* points_device, - const int32_t* iparent_device, - const F* dist_nearest_device, - const F* rab_device, - const F* atomic_coords_device, - F* weights_device, - F* dist_scratch_device, - cudaStream_t stream ) { - - - - // Evaluate point-to-atom collocation - { - const int distance_thread_y = max_warps_per_thread_block / 2; - dim3 threads( warp_size, distance_thread_y ); - dim3 blocks( util::div_ceil( natoms, threads.x), - util::div_ceil( npts, 4) ); - - compute_point_center_dist<<< blocks, threads, 0, stream>>>( - npts, LDatoms, natoms, atomic_coords_device, points_device, dist_scratch_device - ); - - } - const bool partition_weights_1d_kernel = true; - - if( partition_weights_1d_kernel ) { - - dim3 threads( warp_size, weight_thread_block / warp_size ); - dim3 blocks( 1, get_device_sm_count(0) * weight_thread_block_per_sm); - modify_weights_ssf_kernel_2d<<< blocks, threads, 0, stream >>>( - npts, LDatoms, natoms, rab_device, atomic_coords_device, dist_scratch_device, - iparent_device, dist_nearest_device, weights_device - ); - - } else { - -#if 0 - dim3 threads( 32, 32 ); - dim3 blocks ( npts, 1 ); - - if( weight_alg == XCWeightAlg::SSF ) - modify_weights_ssf_kernel<<< blocks, threads, 0, stream >>>( - npts, natoms, rab_device, atomic_coords_device, dist_scratch_device, - iparent_device, dist_nearest_device, weights_device - ); - else - modify_weights_becke_kernel<<< blocks, threads, 0, stream >>>( - npts, natoms, rab_device, atomic_coords_device, dist_scratch_device, - iparent_device, weights_device - ); -#endif - - } - - -} - -template -void partition_weights_cuda_SoA( XCWeightAlg weight_alg, - size_t npts, - size_t LDatoms, - size_t natoms, - const double* points_device, - const int32_t* iparent_device, - const double* dist_nearest_device, - const double* rab_device, - const double* atomic_coords_device, - double* weights_device, - double* dist_scratch_device, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_weights.hpp b/third_party/gauxc/attic/src/integrator/cuda/cuda_weights.hpp deleted file mode 100644 index 2418cfc..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_weights.hpp +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once -#include -#include -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - - -void cuda_reciprocal(size_t length, double* vec, cudaStream_t stream); - -template -void partition_weights_cuda_SoA( XCWeightAlg weight_alg, - size_t npts, - size_t LDatoms, - size_t natoms, - const F* points_device, - const int32_t* iparent_device, - const F* dist_nearest_device, - const F* rab_device, - const F* atomic_coords_device, - F* weights_device, - F* dist_scratch_device, - cudaStream_t stream ); - - -} -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_zmat.cu b/third_party/gauxc/attic/src/integrator/cuda/cuda_zmat.cu deleted file mode 100644 index 18a8b41..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_zmat.cu +++ /dev/null @@ -1,140 +0,0 @@ -#include 
"cuda/cuda_zmat.hpp" -#include -#include "cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - - -template -__global__ void zmat_lda_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - const auto nbf = task.nbe; - const auto* vrho_device = task.vrho; - - const auto* basis_eval_device = task.bf; - - auto* z_matrix_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nbf ) { - - const size_t ibfoff = tid_y * npts + tid_x; - const double fact = 0.5 * vrho_device[tid_x]; - - z_matrix_device[ ibfoff ] = fact * basis_eval_device[ ibfoff ]; - - } - -} - - - - -template -void zmat_lda_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ) { - - - dim3 threads(warp_size,max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - - zmat_lda_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - -} - -template -void zmat_lda_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - - - - -template -__global__ void zmat_gga_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - const auto nbf = task.nbe; - const auto* vrho_device = task.vrho; - const auto* vgamma_device = task.vgamma; - const auto* den_x_eval_device = task.ddenx; - const auto* den_y_eval_device = task.ddeny; - const auto* den_z_eval_device = task.ddenz; - - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - - auto* z_matrix_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nbf ) { - - const size_t ibfoff = tid_y * npts + tid_x; - const double fact_1 = 0.5 * vrho_device[tid_x] ; - const double fact_2 = 2.0 * vgamma_device[tid_x]; - - const double dx = den_x_eval_device[ tid_x ] * dbasis_x_eval_device[ ibfoff ]; - const double dy = den_y_eval_device[ tid_x ] * dbasis_y_eval_device[ ibfoff ]; - const double dz = den_z_eval_device[ tid_x ] * dbasis_z_eval_device[ ibfoff ]; - - z_matrix_device[ ibfoff ] = - fact_1 * basis_eval_device[ ibfoff ] + fact_2 * ( dx + dy + dz ); - - } -} - -template -void zmat_gga_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ) { - - - dim3 threads(warp_size,max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - - zmat_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - -} -template -void zmat_gga_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -} -} -} - diff --git a/third_party/gauxc/attic/src/integrator/cuda/cuda_zmat.hpp b/third_party/gauxc/attic/src/integrator/cuda/cuda_zmat.hpp deleted file mode 100644 index 
58769d8..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/cuda_zmat.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void zmat_lda_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -template -void zmat_gga_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/integrator/cuda/gauxc-cuda_integrator.cmake b/third_party/gauxc/attic/src/integrator/cuda/gauxc-cuda_integrator.cmake deleted file mode 100644 index 864739b..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/gauxc-cuda_integrator.cmake +++ /dev/null @@ -1,75 +0,0 @@ -# Check if CMAKE_CUDA_ARCHITECTURES is set -if( NOT DEFINED CMAKE_CUDA_ARCHITECTURES ) - message( FATAL_ERROR "CMAKE_CUDA_ARCHITECTURES Must Be Set" ) -endif() - -# Check that only CUDA CC 6.0+ is enabled -foreach( cuda_arch ${CMAKE_CUDA_ARCHITECTURES} ) - if( cuda_arch LESS 60 ) - message(FATAL_ERROR "GauXC Requires CUDA CC 6.0+ For FP64 Atomics") - endif() -endforeach() - - - -if( NOT TARGET CUDA::cublas ) - find_package( CUDAToolkit REQUIRED ) -endif() -include( gauxc-cub ) - - - -target_sources( gauxc PRIVATE cuda/collocation_device.cu - cuda/xc_cuda_data.cxx - cuda/cuda_driver_replicated_density_incore.cxx - cuda/cuda_driver_replicated_density_shellbatched.cxx - cuda/cuda_weights.cu - cuda/cuda_pack_density.cu - cuda/cuda_eval_denvars.cu - cuda/cublas_extensions.cu - cuda/cuda_zmat.cu - cuda/cuda_inc_potential.cu - cuda/cuda_device_properties.cxx -) - -target_compile_features( gauxc PRIVATE cuda_std_14 ) -#target_include_directories( gauxc -# PRIVATE -# $ -# $ -#) - -target_compile_options( gauxc - PRIVATE - $<$: -Xcudafe --diag_suppress=partial_override -Xptxas -v > -) - -if( GAUXC_ENABLE_NCCL ) - - message( STATUS "NCCL Has Been Enabled" ) - find_package( NCCL REQUIRED ) - target_link_libraries( gauxc PUBLIC NCCL::nccl ) - -endif() - -if( GAUXC_ENABLE_MAGMA ) - - message( STATUS "MAGMA Has Been Enabled" ) - find_package( MAGMA REQUIRED ) - target_link_libraries( gauxc PUBLIC MAGMA::magma ) - -else() - - message( STATUS "MAGMA Has Been Explicitly Disabled" ) - -endif() - -if(NOT GAUXC_LINK_CUDA_STATIC) - target_link_libraries( gauxc PUBLIC CUDA::cublas ) -else() - target_link_libraries( gauxc PUBLIC CUDA::cublas_static ) -endif() - -if( TARGET gauxc_cub ) # Handle the case when CUB is implicit - target_link_libraries( gauxc PRIVATE $ ) -endif() diff --git a/third_party/gauxc/attic/src/integrator/cuda/xc_cuda_data.cxx b/third_party/gauxc/attic/src/integrator/cuda/xc_cuda_data.cxx deleted file mode 100644 index 59827a1..0000000 --- a/third_party/gauxc/attic/src/integrator/cuda/xc_cuda_data.cxx +++ /dev/null @@ -1,535 +0,0 @@ -#include -#include - -#include "cuda/buffer_adaptor.hpp" -#include "integrator_common.hpp" -#include "cuda/cuda_device_properties.hpp" - -namespace GauXC { - -template -XCCudaData::XCCudaData( bool _batch_l3_blas ): -#ifdef GAUXC_ENABLE_MAGMA - batch_l3_blas(_batch_l3_blas) -#else - batch_l3_blas(false) -#endif -{ - - - // TODO: Expose this - double fill_fraction = 0.9; - - cudaError_t stat; - - // Get Total Available Memory - size_t cuda_avail, cuda_total; - stat = cudaMemGetInfo( &cuda_avail, &cuda_total ); - GAUXC_CUDA_ERROR( "MemInfo Failed", stat ); - - // Allocate up to fill_fraction - devmem_sz = fill_fraction * cuda_avail; - stat 
= cudaMalloc( &device_ptr, devmem_sz ); - GAUXC_CUDA_ERROR( "CUDA Malloc Failed", stat ); - - // Create CUDA Stream and CUBLAS Handles and make them talk to eachother - master_stream = std::make_unique< util::cuda_stream >(); - master_handle = std::make_unique< util::cublas_handle >(); - - cublasSetStream( *master_handle, *master_stream ); - -#ifdef GAUXC_ENABLE_MAGMA - // Create MAGMA Queue from CUDA Stream and CUBLAS Handle - master_magma_queue = - std::make_unique< util::magma_queue >( 0, *master_stream, *master_handle ); -#endif - - if( not batch_l3_blas ) { - - // Create BLAS streams - blas_streams.resize(4); - blas_handles.resize(4); - for( auto i = 0; i < 4; ++i ) - cublasSetStream( blas_handles[i], blas_streams[i] ); - - } - -} - - - -template -XCCudaData::~XCCudaData() noexcept { - if( device_ptr ) util::cuda_free( device_ptr ); -} - - - - - - - -template -void XCCudaData::allocate_static_data( size_t _natoms, - size_t _n_deriv, - size_t _nbf, - size_t _nshells ) { - - - // Save state - nshells = _nshells; - nbf = _nbf; - n_deriv = _n_deriv; - natoms = _natoms; - - LDatoms = util::div_ceil( natoms, cuda::weight_unroll ) * cuda::weight_unroll; - - // Allocate static memory with proper alignment - buffer_adaptor mem( device_ptr, devmem_sz ); - - shells_device = mem.aligned_alloc>( nshells ); - exc_device = mem.aligned_alloc( 1 ); - nel_device = mem.aligned_alloc( 1 ); - acc_scr_device = mem.aligned_alloc( 1 ); - rab_device = mem.aligned_alloc( LDatoms * natoms, sizeof(double2)); - coords_device = mem.aligned_alloc( 3 * natoms ); - - vxc_device = mem.aligned_alloc( nbf * nbf ); - dmat_device = mem.aligned_alloc( nbf * nbf ); - - // Get current stack location - dynmem_ptr = mem.stack(); - dynmem_sz = mem.nleft(); - -} - - - - -using task_iterator = std::vector< XCTask >::iterator; -template -using device_task_container = std::vector< cuda::XCTaskDevice >; - -template -std::tuple< typename XCCudaData::task_iterator, - typename XCCudaData::device_task_container > - XCCudaData::generate_buffers( const BasisSet& basis, - task_iterator task_begin, - task_iterator task_end ) { - - // Host data packing arrays - std::vector< std::array > points_pack; - std::vector< double > weights_pack; - std::vector< size_t > shell_list_pack; - std::vector< size_t > shell_offs_pack; - std::vector< std::array > submat_cut_pack; - std::vector< int32_t > submat_block_pack; - std::vector< int32_t > iparent_pack; - std::vector< double > dist_nearest_pack; - - // Host copies for batched GEMM/SYRK arrays - std::vector< double* > dmat_array, bf_array, zmat_array; - std::vector< int > m_array, n_array, k_array, lda_array, ldb_array, ldc_array; - - device_task_container tasks_device; - - - auto concat_iterable = []( auto& a, const auto& b ) { - a.insert( a.end(), b.begin(), b.end() ); - }; - - - size_t ntask = 0; - size_t total_npts = 0; - size_t total_nbe_nbe = 0; - size_t total_nbe_npts = 0; - size_t total_nshells = 0; - size_t total_ncut = 0; - size_t total_nblock = 0; - size_t memleft = dynmem_sz; - - uint32_t submat_chunk_size = cuda::get_submat_cut_block(nbf, 0); - - // Offset memory by the static requirement of an extra pointer element - // for each of the size batch arrays in MAGMA - memleft -= 6 * sizeof(int); //M,N,K,LDA,LDB,LDC - - auto task_it = task_begin; - while( task_it != task_end ) { - - auto iAtom = task_it->iParent; - auto points = task_it->points ; - auto weights = task_it->weights ; - auto shell_list = task_it->shell_list; - auto nbe = task_it->nbe; - auto dist_nearest = task_it->dist_nearest; - 
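// [Illustrative sketch -- editor's addition, not part of the original GauXC sources.]
// The buffer_adaptor used in allocate_static_data above (and again for the dynamic
// per-batch buffers further below) carves typed, aligned sub-allocations out of the
// single block obtained from cudaMalloc. Its real interface is not shown in this diff;
// a minimal stand-in with the same stack()/nleft()/aligned_alloc flavor might be:
#include <cstddef>
#include <cstdint>
#include <stdexcept>

class bump_buffer_sketch {
  std::uintptr_t ptr_;   // current top of the stack inside the big allocation
  std::size_t    left_;  // bytes still available
public:
  bump_buffer_sketch(void* base, std::size_t nbytes)
    : ptr_(reinterpret_cast<std::uintptr_t>(base)), left_(nbytes) {}

  // Reserve `count` objects of type T aligned to `align` bytes (alignof(T) by default).
  template <typename T>
  T* aligned_alloc(std::size_t count, std::size_t align = alignof(T)) {
    const std::uintptr_t aligned = (ptr_ + align - 1) / align * align;
    const std::size_t    need    = (aligned - ptr_) + count * sizeof(T);
    if (need > left_) throw std::runtime_error("device buffer exhausted");
    ptr_   = aligned + count * sizeof(T);
    left_ -= need;
    return reinterpret_cast<T*>(aligned);
  }

  void*       stack() const { return reinterpret_cast<void*>(ptr_); }  // cf. mem.stack()
  std::size_t nleft() const { return left_; }                          // cf. mem.nleft()
};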
- // Generate map from compressed to non-compressed matrices - auto [submat_cut, submat_block] = integrator::gen_compressed_submat_map( basis, shell_list, nbf, submat_chunk_size ); - size_t ncut = submat_cut.size(); - size_t nblock = submat_block.size(); - size_t nshells = shell_list.size(); - size_t npts = points.size(); - - - size_t mem_points = 3 * npts; - size_t mem_weights = npts; - - size_t mem_shells = nshells; - size_t mem_shell_list = nshells; - size_t mem_shell_offs = nshells; - size_t mem_submat_cut = 3 * ncut; - size_t mem_submat_block = nblock; - - size_t mem_nbe_scr = nbe * nbe; - size_t mem_zmat = nbe * npts; - - size_t mem_bf = nbe * npts; - size_t mem_dbfx = mem_bf; - size_t mem_dbfy = mem_bf; - size_t mem_dbfz = mem_bf; - - size_t mem_den = npts; - size_t mem_denx = npts; - size_t mem_deny = npts; - size_t mem_denz = npts; - - size_t mem_eps = npts; - size_t mem_gamma = npts; - size_t mem_vrho = npts; - size_t mem_vgamma = npts; - - //size_t mem_partition_scr = natoms * npts; - size_t mem_dist_scr = LDatoms * npts; - size_t mem_iparent = npts; - size_t mem_dist_nearest = npts; - - size_t mem_batch_mat_arr = 3; // dmat/zmat/bf - size_t mem_batch_sz_arr = 6; // M/N/K/LDA/LDB/LDC - size_t mem_task = 1; - - - size_t mem_req_batch = - mem_points * sizeof(double) + - mem_weights * sizeof(double) + - mem_shells * sizeof(Shell) + - mem_shell_list * sizeof(size_t) + - mem_shell_offs * sizeof(size_t) + - mem_submat_cut * sizeof(int32_t) + - mem_submat_block * sizeof(int32_t) + - mem_nbe_scr * sizeof(double) + - mem_zmat * sizeof(double) + - mem_bf * sizeof(double) + - mem_dbfx * sizeof(double) + - mem_dbfy * sizeof(double) + - mem_dbfz * sizeof(double) + - mem_den * sizeof(double) + - mem_denx * sizeof(double) + - mem_deny * sizeof(double) + - mem_denz * sizeof(double) + - mem_eps * sizeof(double) + - mem_gamma * sizeof(double) + - mem_vrho * sizeof(double) + - mem_vgamma * sizeof(double) + - //mem_partition_scr * sizeof(double) + - mem_dist_scr * sizeof(double) + - mem_iparent * sizeof(int32_t) + - mem_dist_nearest * sizeof(double) + - mem_batch_mat_arr * sizeof(double*) + - mem_batch_sz_arr * sizeof(int32_t) + - mem_task * sizeof(cuda::XCTaskDevice); - - //std::cout << "Memory requirement for task " << ntask+1 << " " << mem_req_batch << " memleft " << memleft << std::endl; - - if( mem_req_batch > memleft ) break; - - // Update memory and increment task iterator - memleft -= mem_req_batch; - ntask++; - task_it++; - - // Update counters - total_npts += npts; - total_nbe_nbe += nbe*nbe; - total_nbe_npts += nbe*npts; - total_nshells += nshells; - total_ncut += ncut; - total_nblock += nblock; - - // Compute offsets - std::vector< size_t > shell_offs( nshells ); - shell_offs.at(0) = 0; - for( auto i = 1ul; i < nshells; ++i ) - shell_offs.at(i) = shell_offs.at(i-1) + - basis.at( shell_list.at(i-1) ).size(); - - - // Pack the data on host - concat_iterable( points_pack, points ); - concat_iterable( weights_pack, weights ); - concat_iterable( shell_list_pack, shell_list ); - concat_iterable( shell_offs_pack, shell_offs ); - concat_iterable( submat_cut_pack, submat_cut ); - concat_iterable( submat_block_pack, submat_block ); - - m_array.emplace_back( npts ); - n_array.emplace_back( nbe ); - k_array.emplace_back( nbe ); - - lda_array.emplace_back( nbe ); - ldb_array.emplace_back( npts ); - ldc_array.emplace_back( npts ); - - iparent_pack.insert( iparent_pack.end(), npts, iAtom ); - dist_nearest_pack.insert( dist_nearest_pack.end(), npts, dist_nearest ); - - // Add task - 
tasks_device.emplace_back(); - - tasks_device.back().nbe = nbe; - tasks_device.back().npts = npts; - tasks_device.back().ncut = ncut; - tasks_device.back().nblock = nblock; - tasks_device.back().nshells = nshells; - tasks_device.back().iParent = iAtom; - tasks_device.back().dist_nearest = dist_nearest; - } - - - std::cout << "XCDeviceData will stack allocate for " << tasks_device.size() << " tasks"; - std::cout << " Using chunk size of " << submat_chunk_size << std::endl; - - // Allocate out of dynamic memory - buffer_adaptor mem( dynmem_ptr, dynmem_sz ); - - // (possibly) Large types - important_shells_device = mem.aligned_alloc>( total_nshells ); - device_tasks = mem.aligned_alloc>( ntask ); - - // 64-bit types - nbe_scr_device = mem.aligned_alloc( total_nbe_nbe ); - zmat_device = mem.aligned_alloc( total_nbe_npts ); - bf_eval_device = mem.aligned_alloc( total_nbe_npts ); - dbf_x_eval_device = mem.aligned_alloc( total_nbe_npts ); - dbf_y_eval_device = mem.aligned_alloc( total_nbe_npts ); - dbf_z_eval_device = mem.aligned_alloc( total_nbe_npts ); - - den_eval_device = mem.aligned_alloc( total_npts ); - eps_eval_device = mem.aligned_alloc( total_npts ); - vrho_eval_device = mem.aligned_alloc( total_npts ); - - den_x_eval_device = mem.aligned_alloc( total_npts ); - den_y_eval_device = mem.aligned_alloc( total_npts ); - den_z_eval_device = mem.aligned_alloc( total_npts ); - gamma_eval_device = mem.aligned_alloc( total_npts ); - vgamma_eval_device = mem.aligned_alloc( total_npts ); - - points_device_buffer = mem.aligned_alloc( 3 * total_npts ); - weights_device_buffer = mem.aligned_alloc( total_npts ); - shell_list_device_buffer = mem.aligned_alloc( total_nshells ); - shell_offs_device_buffer = mem.aligned_alloc( total_nshells ); - submat_cut_device_buffer = mem.aligned_alloc( 3 * total_ncut ); - submat_block_device_buffer = mem.aligned_alloc( total_nblock ); - - dist_scratch_device = mem.aligned_alloc( LDatoms * total_npts, 2 * sizeof(double) ); - dist_nearest_buffer = mem.aligned_alloc( total_npts ); - - dmat_array_device = mem.aligned_alloc( ntask ); - zmat_array_device = mem.aligned_alloc( ntask ); - bf_array_device = mem.aligned_alloc( ntask ); - - // 32-bit types - m_array_device = mem.aligned_alloc( ntask + 1 ); - n_array_device = mem.aligned_alloc( ntask + 1 ); - k_array_device = mem.aligned_alloc( ntask + 1 ); - lda_array_device = mem.aligned_alloc( ntask + 1 ); - ldb_array_device = mem.aligned_alloc( ntask + 1 ); - ldc_array_device = mem.aligned_alloc( ntask + 1 ); - - iparent_device_buffer = mem.aligned_alloc( total_npts ); - - - // Update tasks with allocated pointers - { - double* points_ptr = points_device_buffer; - double* weights_ptr = weights_device_buffer; - - size_t* shell_list_ptr = shell_list_device_buffer; - size_t* shell_offs_ptr = shell_offs_device_buffer; - int32_t* submat_cut_ptr = submat_cut_device_buffer; - int32_t* submat_block_ptr = submat_block_device_buffer; - Shell * shells_ptr = important_shells_device; - double* nbe_ptr = nbe_scr_device; - double* zmat_ptr = zmat_device; - - double* bf_ptr = bf_eval_device; - double* dbfx_ptr = dbf_x_eval_device; - double* dbfy_ptr = dbf_y_eval_device; - double* dbfz_ptr = dbf_z_eval_device; - - double* den_ptr = den_eval_device; - double* ddenx_ptr = den_x_eval_device; - double* ddeny_ptr = den_y_eval_device; - double* ddenz_ptr = den_z_eval_device; - - double* eps_ptr = eps_eval_device; - double* gamma_ptr = gamma_eval_device; - double* vrho_ptr = vrho_eval_device; - double* vgamma_ptr = vgamma_eval_device; - - - double* 
dist_scratch_ptr = dist_scratch_device; - - for( auto& task : tasks_device ) { - - task.points = points_ptr; - task.weights = weights_ptr; - task.shell_list = shell_list_ptr; - task.shell_offs = shell_offs_ptr; - task.submat_cut = submat_cut_ptr; - task.submat_block = submat_block_ptr; - - task.shells = shells_ptr; - task.nbe_scr = nbe_ptr; - task.zmat = zmat_ptr; - task.bf = bf_ptr; - task.dbfx = dbfx_ptr; - task.dbfy = dbfy_ptr; - task.dbfz = dbfz_ptr; - task.den = den_ptr; - task.ddenx = ddenx_ptr; - task.ddeny = ddeny_ptr; - task.ddenz = ddenz_ptr; - - task.eps = eps_ptr; - task.gamma = gamma_ptr; - task.vrho = vrho_ptr; - task.vgamma = vgamma_ptr; - - task.dist_scratch = dist_scratch_ptr; - - auto npts = task.npts; - auto nbe = task.nbe; - auto nshells = task.nshells; - auto ncut = task.ncut; - auto nblock = task.nblock; - - points_ptr += 3 * npts; - weights_ptr += npts; - shell_list_ptr += nshells; - shell_offs_ptr += nshells; - submat_cut_ptr += 3 * ncut; - submat_block_ptr += nblock; - - shells_ptr += nshells; - nbe_ptr += nbe * nbe; - zmat_ptr += nbe * npts; - - bf_ptr += nbe * npts; - dbfx_ptr += nbe * npts; - dbfy_ptr += nbe * npts; - dbfz_ptr += nbe * npts; - - den_ptr += npts; - ddenx_ptr += npts; - ddeny_ptr += npts; - ddenz_ptr += npts; - - eps_ptr += npts; - gamma_ptr += npts; - vrho_ptr += npts; - vgamma_ptr += npts; - - dist_scratch_ptr += LDatoms * npts; - - - - // Batched LA - dmat_array.emplace_back( task.nbe_scr ); - bf_array.emplace_back( task.bf ); - zmat_array.emplace_back( task.zmat ); - } - - } // End task setup - - - - - auto copy_rev = [&]( size_t n, const auto* src, auto* dest, cudaStream_t stream, - std::string m ) { - util::cuda_copy_async( n, dest, src, stream, m ); - }; - - - - try { - - // Send the data to the device - copy_rev( 3*points_pack.size(), points_pack.data()->data(), - points_device_buffer, *master_stream, - "send points buffer" ); - copy_rev( weights_pack.size(), weights_pack.data(), - weights_device_buffer, *master_stream, - "send weights buffer" ); - - copy_rev( shell_list_pack.size(), shell_list_pack.data(), - shell_list_device_buffer, *master_stream, - "send_shell_list_buffer" ); - copy_rev( shell_offs_pack.size(), shell_offs_pack.data(), - shell_offs_device_buffer, *master_stream, - "send_shell_offs_buffer" ); -// std::cout << "Element size " << sizeof(std::get<0>(submat_cut_pack[0]) << std::endl; - copy_rev( 3 * submat_cut_pack.size(), submat_cut_pack.data()->data(), - submat_cut_device_buffer, *master_stream, - "send_submat_cut_buffer" ); - copy_rev( submat_block_pack.size(), submat_block_pack.data(), - submat_block_device_buffer, *master_stream, - "send_submat_block_buffer" ); - - copy_rev( tasks_device.size(), tasks_device.data(), device_tasks, - *master_stream, "send_tasks_device" ); - - - copy_rev( dmat_array.size(), dmat_array.data(), dmat_array_device, - *master_stream, "send dmat_array" ); - copy_rev( zmat_array.size(), zmat_array.data(), zmat_array_device, - *master_stream, "send zmat_array" ); - copy_rev( bf_array.size(), bf_array.data(), bf_array_device, - *master_stream, "send bf_array" ); - - copy_rev( m_array.size(), m_array.data(), m_array_device, - *master_stream, "send m_array" ); - copy_rev( n_array.size(), n_array.data(), n_array_device, - *master_stream, "send n_array" ); - copy_rev( k_array.size(), k_array.data(), k_array_device, - *master_stream, "send k_array" ); - - copy_rev( lda_array.size(), lda_array.data(), lda_array_device, - *master_stream, "send lda_array" ); - copy_rev( ldb_array.size(), ldb_array.data(), 
ldb_array_device, - *master_stream, "send ldb_array" ); - copy_rev( ldc_array.size(), ldc_array.data(), ldc_array_device, - *master_stream, "send ldc_array" ); - - copy_rev( iparent_pack.size(), iparent_pack.data(), - iparent_device_buffer, *master_stream, "send iparent" ); - copy_rev( dist_nearest_pack.size(), dist_nearest_pack.data(), - dist_nearest_buffer, *master_stream, "send dist_nearest" ); - - } catch(...) { - //teardown_(); throw; - throw; - } - - - // To avoid packed vectors going out of scope - cudaStreamSynchronize( *master_stream ); - - return std::make_tuple(task_it, tasks_device); -} - - -// Explicit Instantiations -template struct XCCudaData; - -} diff --git a/third_party/gauxc/attic/src/integrator/host/blas.cxx b/third_party/gauxc/attic/src/integrator/host/blas.cxx deleted file mode 100644 index b7126e4..0000000 --- a/third_party/gauxc/attic/src/integrator/host/blas.cxx +++ /dev/null @@ -1,214 +0,0 @@ -#include "host/blas.hpp" -#include -#include - -extern "C" { - -//void dlacpy_( const char* UPLO, const int* M, const int* N, const double* A, -// const int* LDA, double* B, const int* LDB ); -//void slacpy_( const char* UPLO, const int* M, const int* N, const float* A, -// const int* LDA, float* B, const int* LDB ); - -void dgemm_( const char* TA, const char* TB, const int* M, const int* N, - const int* K, const double* ALPHA, const double* A, - const int* LDA, const double* B, const int* LDB, - const double* BETA, double* C, const int* LDC ); -void sgemm_( const char* TA, const char* TB, const int* M, const int* N, - const int* K, const float* ALPHA, const float* A, - const int* LDA, const float* B, const int* LDB, - const float* BETA, float* C, const int* LDC ); - -void dsyr2k_( const char* UPLO, const char* TRANS, const int* N, const int* K, - const double* ALPHA, const double* A, const int* LDA, const double* B, - const int* LDB, const double* BETA, double* C, const int* LDC ); -void ssyr2k_( const char* UPLO, const char* TRANS, const int* N, const int* K, - const float* ALPHA, const float* A, const int* LDA, const float* B, - const int* LDB, const float* BETA, float* C, const int* LDC ); - -double ddot_( const int* N, const double* X, const int* INCX, const double* Y, - const int* INCY ); -float sdot_( const int* N, const float* X, const int* INCX, const float* Y, - const int* INCY ); - - -void daxpy_( const int* N, const double* ALPHA, const double* A, const int* INCX, - double* Y, const int* INCY ); -void saxpy_( const int* N, const float* ALPHA, const float* A, const int* INCX, - float* Y, const int* INCY ); - -void dscal_( const int* N, const double* ALPHA, const double* X, const int* INCX ); -void sscal_( const int* N, const float* ALPHA, const float* X, const int* INCX ); -} - -namespace GauXC::blas { - -template -void lacpy( char UPLO, int M, int N, const T* A, int LDA, T* B, - int LDB ) { - -/* - if constexpr ( std::is_same_v ) - slacpy_( &UPLO, &M, &N, A, &LDA, B, &LDB ); - else if constexpr ( std::is_same_v ) - dlacpy_( &UPLO, &M, &N, A, &LDA, B, &LDB ); - else throw std::runtime_error("LACPY NYI"); -*/ - - if( UPLO == 'L' ) { - - for( int j = 0; j < N; ++j ) - for( int i = j; i < M; ++i ) - B[i + j*LDB] = A[i + j*LDA]; - - } else if( UPLO == 'U' ) { - - for( int j = 0; j < N; ++j ) - for( int i = 0; i <= j; ++i ) - B[i + j*LDB] = A[i + j*LDA]; - - } else { - - for( int j = 0; j < N; ++j ) - for( int i = 0; i < M; ++i ) - B[i + j*LDB] = A[i + j*LDA]; - - } - -} - -template void lacpy( char UPLO, int M, int N, const float* A, int LDA, - float* B, int LDB ); 
-template void lacpy( char UPLO, int M, int N, const double* A, int LDA, - double* B, int LDB ); - - - - - - - - - -template -void gemm( char TA, char TB, int M, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, T BETA, - T* C, int LDC ) { - - - if constexpr ( std::is_same_v ) - sgemm_( &TA, &TB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC ); - else if constexpr ( std::is_same_v ) - dgemm_( &TA, &TB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC ); - else throw std::runtime_error("GEMM NYI"); - - -} -template -void gemm( char floatA, char floatB, int M, int N, int K, float ALPHA, - const float* A, int LDA, const float* B, int LDB, float BETA, - float* C, int LDC ); -template -void gemm( char doubleA, char doubleB, int M, int N, int K, double ALPHA, - const double* A, int LDA, const double* B, int LDB, double BETA, - double* C, int LDC ); - - - - - - - -template -void syr2k( char UPLO, char TRANS, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, T BETA, - T* C, int LDC ) { - - - if constexpr ( std::is_same_v ) - ssyr2k_( &UPLO, &TRANS, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC ); - else if constexpr ( std::is_same_v ) - dsyr2k_( &UPLO, &TRANS, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC ); - else throw std::runtime_error("SYR2K NYI"); - - -} - -template -void syr2k( char UPLO, char floatRANS, int N, int K, float ALPHA, - const float* A, int LDA, const float* B, int LDB, float BETA, - float* C, int LDC ); -template -void syr2k( char UPLO, char doubleRANS, int N, int K, double ALPHA, - const double* A, int LDA, const double* B, int LDB, double BETA, - double* C, int LDC ); - - - - - - - -template -T dot( int N, const T* X, int INCX, const T* Y, int INCY ) { - - if constexpr ( std::is_same_v ) - return sdot_(&N, X, &INCX, Y, &INCY); - else if constexpr ( std::is_same_v ) - return ddot_(&N, X, &INCX, Y, &INCY); - else throw std::runtime_error("DOT NYI"); - - return 0.; -} - -template -float dot( int N, const float* X, int INCX, const float* Y, int INCY ); -template -double dot( int N, const double* X, int INCX, const double* Y, int INCY ); - - - - - - -template -void axpy( int N, T ALPHA, const T* X, int INCX, T* Y, int INCY ) { - - if constexpr ( std::is_same_v ) - saxpy_(&N, &ALPHA, X, &INCX, Y, &INCY ); - else if constexpr ( std::is_same_v ) - daxpy_(&N, &ALPHA, X, &INCX, Y, &INCY ); - else throw std::runtime_error("AXPY NYI"); - -} - -template -void axpy( int N, float ALPHA, const float* A, int INCX, float* Y, - int INCY ); -template -void axpy( int N, double ALPHA, const double* A, int INCX, double* Y, - int INCY ); - - - - - - -template -void scal( int N, T ALPHA, T* X, int INCX ) { - - if constexpr ( std::is_same_v ) - sscal_(&N, &ALPHA, X, &INCX ); - else if constexpr ( std::is_same_v ) - dscal_(&N, &ALPHA, X, &INCX ); - else throw std::runtime_error("SCAL NYI"); - -} - -template -void scal( int N, float ALPHA, float* X, int INCX ); -template -void scal( int N, double ALPHA, double* X, int INCX ); - -} - - diff --git a/third_party/gauxc/attic/src/integrator/host/blas.hpp b/third_party/gauxc/attic/src/integrator/host/blas.hpp deleted file mode 100644 index add036a..0000000 --- a/third_party/gauxc/attic/src/integrator/host/blas.hpp +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once -#include - -namespace GauXC::blas { - -template -void lacpy( char UPLO, int M, int N, const T* A, int LDA, T* B, - int LDB ); - -template -void gemm( char TA, char TB, int M, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, T BETA, - T* 
C, int LDC ); - -template -void syr2k( char UPLO, char TRANS, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, T BETA, - T* C, int LDC ); - - -template -T dot( int N, const T* X, int INCX, const T* Y, int INCY ); - -template -void axpy( int N, T ALPHA, const T* X, int INCX, T* Y, int INCY ); - -template -void scal( int N, T ALPHA, T* X, int INCX ); - -} diff --git a/third_party/gauxc/attic/src/integrator/host/gauxc-host_integrator.cmake b/third_party/gauxc/attic/src/integrator/host/gauxc-host_integrator.cmake deleted file mode 100644 index 85600fd..0000000 --- a/third_party/gauxc/attic/src/integrator/host/gauxc-host_integrator.cmake +++ /dev/null @@ -1,15 +0,0 @@ -find_package( LAPACK REQUIRED ) -include( gauxc-gau2grid ) -target_sources( gauxc PRIVATE host/xc_host_util.cxx - host/host_weights.cxx - host/host_collocation.cxx - host/host_zmat.cxx - host/blas.cxx -) - -target_link_libraries( gauxc PUBLIC LAPACK::LAPACK ) - -if( GAUXC_ENABLE_GAU2GRID ) - target_link_libraries( gauxc PUBLIC gau2grid::gg ) -endif() - diff --git a/third_party/gauxc/attic/src/integrator/host/host_collocation.cxx b/third_party/gauxc/attic/src/integrator/host/host_collocation.cxx deleted file mode 100644 index 8edf654..0000000 --- a/third_party/gauxc/attic/src/integrator/host/host_collocation.cxx +++ /dev/null @@ -1,137 +0,0 @@ -#include "host/host_collocation.hpp" - - -#ifdef GAUXC_ENABLE_GAU2GRID - #include "gau2grid/gau2grid.h" -#else - #include "collocation/collocation_angular_cartesian.hpp" - #include "collocation/collocation_angular_spherical_unnorm.hpp" - #include "collocation/collocation_radial.hpp" -#endif - -namespace GauXC::integrator::host { - -void eval_collocation( size_t npts, - size_t nshells, - size_t nbe, - const double* points, - const BasisSet& basis, - const int32_t* shell_mask, - double* basis_eval ) { - -#ifdef GAUXC_ENABLE_GAU2GRID - - std::allocator a; - auto* rv = a.allocate( npts * nbe ); - - size_t ncomp = 0; - for( size_t i = 0; i < nshells; ++i ) { - - const auto& sh = basis.at(shell_mask[i]); - int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA; - gg_collocation( sh.l(), npts, points, 3, sh.nprim(), sh.coeff_data(), - sh.alpha_data(), sh.O_data(), order, rv + ncomp*npts ); - - ncomp += sh.size(); - - } - - gg_fast_transpose( ncomp, npts, rv, basis_eval ); - a.deallocate( rv, npts*nbe ); - -#else - - for( size_t ipt = 0; ipt < npts; ++ipt ) - for( size_t i = 0; i < nshells; ++i ) { - - const auto ish = shell_mask[i]; - const auto& sh = basis.at(ish); - auto* eval = basis_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - - double x,y,z, bf; - integrator::cuda::collocation_device_radial_eval( sh, points + 3*ipt, - &x, &y, &z, &bf ); - - if( sh.pure() ) - integrator::cuda::collocation_spherical_unnorm_angular( sh.l(), bf, x, y, z, - eval ); - else - integrator::cuda::collocation_cartesian_angular( sh.l(), bf, x, y, z, eval ); - - - } - -#endif - -} - -void eval_collocation_deriv1( size_t npts, - size_t nshells, - size_t nbe, - const double* points, - const BasisSet& basis, - const int32_t* shell_mask, - double* basis_eval, - double* dbasis_x_eval, - double* dbasis_y_eval, - double* dbasis_z_eval ) { - -#ifdef GAUXC_ENABLE_GAU2GRID - - std::allocator a; - auto* rv = a.allocate( 4 * npts * nbe ); - auto* rv_x = rv + npts * nbe; - auto* rv_y = rv_x + npts * nbe; - auto* rv_z = rv_y + npts * nbe; - - size_t ncomp = 0; - for( size_t i = 0; i < nshells; ++i ) { - - const auto& sh = basis.at(shell_mask[i]); - int order = sh.pure() ? 
GG_SPHERICAL_CCA : GG_CARTESIAN_CCA; - gg_collocation_deriv1( sh.l(), npts, points, 3, sh.nprim(), sh.coeff_data(), - sh.alpha_data(), sh.O_data(), order, rv + ncomp*npts, - rv_x + ncomp*npts, rv_y + ncomp*npts, rv_z + ncomp*npts ); - - ncomp += sh.size(); - - } - - gg_fast_transpose( ncomp, npts, rv, basis_eval ); - gg_fast_transpose( ncomp, npts, rv_x, dbasis_x_eval ); - gg_fast_transpose( ncomp, npts, rv_y, dbasis_y_eval ); - gg_fast_transpose( ncomp, npts, rv_z, dbasis_z_eval ); - - a.deallocate( rv, 4*npts*nbe ); - -#else - - for( size_t ipt = 0; ipt < npts; ++ipt ) - for( size_t i = 0; i < nshells; ++i ) { - - const auto ish = shell_mask[i]; - const auto& sh = basis.at(ish); - auto* eval = basis_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - auto* deval_x = dbasis_x_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - auto* deval_y = dbasis_y_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - auto* deval_z = dbasis_z_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - - double x,y,z, bf, dbf_x, dbf_y, dbf_z; - integrator::cuda::collocation_device_radial_eval_deriv1( sh, points + 3*ipt, - &x, &y, &z, &bf, &dbf_x, - &dbf_y, &dbf_z); - - if( sh.pure() ) - integrator::cuda::collocation_spherical_unnorm_angular_deriv1( - sh.l(), bf, dbf_x, dbf_y, dbf_z, x, y, z, eval, deval_x, deval_y, deval_z ); - else - integrator::cuda::collocation_cartesian_angular_deriv1( - sh.l(), bf, dbf_x, dbf_y, dbf_z, x, y, z, eval, deval_x, deval_y, deval_z ); - - } - -#endif -} - - -} diff --git a/third_party/gauxc/attic/src/integrator/host/host_collocation.hpp b/third_party/gauxc/attic/src/integrator/host/host_collocation.hpp deleted file mode 100644 index 536ba26..0000000 --- a/third_party/gauxc/attic/src/integrator/host/host_collocation.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include - -namespace GauXC::integrator::host { - -void eval_collocation( size_t npts, - size_t nshells, - size_t nbe, - const double* points, - const BasisSet& basis, - const int32_t* shell_mask, - double* basis_eval ); - -void eval_collocation_deriv1( size_t npts, - size_t nshells, - size_t nbe, - const double* points, - const BasisSet& basis, - const int32_t* shell_mask, - double* basis_eval, - double* dbasis_x_eval, - double* dbasis_y_eval, - double* dbasis_z_eval ); - -} diff --git a/third_party/gauxc/attic/src/integrator/host/host_weights.cxx b/third_party/gauxc/attic/src/integrator/host/host_weights.cxx deleted file mode 100644 index 51c24f7..0000000 --- a/third_party/gauxc/attic/src/integrator/host/host_weights.cxx +++ /dev/null @@ -1,205 +0,0 @@ -#include "host/host_weights.hpp" -#include - -namespace GauXC::integrator::host { - -void ssf_weights_host( - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -); - -void becke_weights_host( - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -); - -void partition_weights_host( - XCWeightAlg weight_alg, - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -) { - - switch( weight_alg ) { - case XCWeightAlg::Becke: - becke_weights_host( mol, meta, tasks ); - break; - case XCWeightAlg::SSF: - ssf_weights_host( mol, meta, tasks ); - break; - default: - throw std::runtime_error("Weight Alg Not Supported"); - } - -} - -void becke_weights_host( - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -) { - - // Becke partition functions - auto hBecke = [](double x) {return 1.5 * x - 0.5 * x * x * x;}; // Eq. 19 - auto gBecke = [&](double x) {return hBecke(hBecke(hBecke(x)));}; // Eq. 
20 f_3 - - const size_t ntasks = tasks.size(); - const size_t natoms = mol.natoms(); - - const auto& RAB = meta.rab(); - - #pragma omp parallel - { - - std::vector partitionScratch( natoms ); - std::vector atomDist( natoms ); - - #pragma omp for - for( size_t iT = 0; iT < ntasks; ++iT ) - for( size_t i = 0; i < tasks[iT].points.size(); ++i ) { - - auto& task = tasks[iT]; - auto& weight = task.weights[i]; - const auto& point = task.points[i]; - - // Compute distances of each center to point - for(size_t iA = 0; iA < natoms; iA++) { - - const double da_x = point[0] - mol[iA].x; - const double da_y = point[1] - mol[iA].y; - const double da_z = point[2] - mol[iA].z; - - atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); - - } - - // Evaluate unnormalized partition functions - std::fill(partitionScratch.begin(),partitionScratch.end(),1.); - for( size_t iA = 0; iA < natoms; iA++ ) - for( size_t jA = 0; jA < iA; jA++ ){ - const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; - const double g = gBecke(mu); - - partitionScratch[iA] *= 0.5 * (1. - g); - partitionScratch[jA] *= 0.5 * (1. + g); - } - - // Normalization - double sum = 0.; - for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; - - // Update Weights - weight *= partitionScratch[task.iParent] / sum; - - } // Collapsed loop over tasks and points - - } // OMP context - - -} - -void ssf_weights_host( - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -) { - - auto gFrisch = [&](double x) { - const double s_x = x / magic_ssf_factor<>; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; - }; - - const size_t ntasks = tasks.size(); - const size_t natoms = mol.natoms(); - - const auto& RAB = meta.rab(); - - #pragma omp parallel - { - - std::vector partitionScratch( natoms ); - std::vector atomDist( natoms ); - - #pragma omp for - for( size_t iT = 0; iT < ntasks; ++iT ) - for( size_t i = 0; i < tasks[iT].points.size(); ++i ) { - - auto& task = tasks[iT]; - auto& weight = task.weights[i]; - const auto& point = task.points[i]; - - const auto dist_cutoff = 0.5 * (1-magic_ssf_factor<>) * task.dist_nearest; - - // Compute dist to parent atom - { - const double da_x = point[0] - mol[task.iParent].x; - const double da_y = point[1] - mol[task.iParent].y; - const double da_z = point[2] - mol[task.iParent].z; - - atomDist[task.iParent] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); - } - - if( atomDist[task.iParent] < dist_cutoff ) continue; // Partition weight = 1 - - // Compute distances of each center to point - for(size_t iA = 0; iA < natoms; iA++) { - - if( iA == (size_t)task.iParent ) continue; - - const double da_x = point[0] - mol[iA].x; - const double da_y = point[1] - mol[iA].y; - const double da_z = point[2] - mol[iA].z; - - atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); - - } - - // Evaluate unnormalized partition functions - std::fill(partitionScratch.begin(),partitionScratch.end(),1.); - for( size_t iA = 0; iA < natoms; iA++ ) - for( size_t jA = 0; jA < iA; jA++ ) - if( partitionScratch[iA] > ssf_weight_tol or - partitionScratch[jA] > ssf_weight_tol ) { - - const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; - - if( mu <= -magic_ssf_factor<> ) { - - partitionScratch[jA] = 0.; - - } else if (mu >= magic_ssf_factor<>) { - - partitionScratch[iA] = 0.; - - } else { - - double g = 0.5 * ( 1. 
- gFrisch(mu) ); - partitionScratch[iA] *= g; - partitionScratch[jA] *= 1. - g; - - } - - } - - // Normalization - double sum = 0.; - for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; - - // Update Weights - weight *= partitionScratch[task.iParent] / sum; - - } // Collapsed loop over tasks and points - - } // OMP context - - -} - -} diff --git a/third_party/gauxc/attic/src/integrator/host/host_weights.hpp b/third_party/gauxc/attic/src/integrator/host/host_weights.hpp deleted file mode 100644 index 11736de..0000000 --- a/third_party/gauxc/attic/src/integrator/host/host_weights.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include - -namespace GauXC::integrator::host { - -void partition_weights_host( - XCWeightAlg weight_alg, - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -); - - -} diff --git a/third_party/gauxc/attic/src/integrator/host/host_zmat.cxx b/third_party/gauxc/attic/src/integrator/host/host_zmat.cxx deleted file mode 100644 index 52cd426..0000000 --- a/third_party/gauxc/attic/src/integrator/host/host_zmat.cxx +++ /dev/null @@ -1,115 +0,0 @@ -#include "host/host_zmat.hpp" -#include "host/blas.hpp" - -namespace GauXC { -namespace integrator::host { - -template -void zmat_lda_host( int32_t npts, - int32_t nbf, - const F* vrho, - const F* basis, - F* z_matrix ) { - - GauXC::blas::lacpy( 'A', nbf, npts, basis, nbf, - z_matrix, nbf ); - - for( int32_t i = 0; i < npts; ++i ) { - - auto* z_col = z_matrix + i*nbf; - - const F fact = 0.5 * vrho[i]; - GauXC::blas::scal( nbf, fact, z_col, 1 ); - - } - -} - -template -void zmat_lda_host( int32_t npts, - int32_t nbf, - const float* vrho, - const float* basis, - float* z_matrix ); -template -void zmat_lda_host( int32_t npts, - int32_t nbf, - const double* vrho, - const double* basis, - double* z_matrix ); - - - -template -void zmat_gga_host( int32_t npts, - int32_t nbf, - const F* vrho, - const F* vgamma, - const F* basis, - const F* dbasis_x, - const F* dbasis_y, - const F* dbasis_z, - const F* dden_x, - const F* dden_y, - const F* dden_z, - F* z_matrix ) { - - GauXC::blas::lacpy( 'A', nbf, npts, basis, nbf, - z_matrix, nbf ); - - for( int32_t i = 0; i < npts; ++i ) { - - const int32_t ioff = i * nbf; - - auto* z_col = z_matrix + ioff; - auto* bf_x_col = dbasis_x + ioff; - auto* bf_y_col = dbasis_y + ioff; - auto* bf_z_col = dbasis_z + ioff; - - const F lda_fact = 0.5 * vrho[i]; - GauXC::blas::scal( nbf, lda_fact, z_col, 1 ); - - const F gga_fact = 2. 
* vgamma[i]; - const auto x_fact = gga_fact * dden_x[i]; - const auto y_fact = gga_fact * dden_y[i]; - const auto z_fact = gga_fact * dden_z[i]; - - GauXC::blas::axpy( nbf, x_fact, bf_x_col, 1, z_col, 1 ); - GauXC::blas::axpy( nbf, y_fact, bf_y_col, 1, z_col, 1 ); - GauXC::blas::axpy( nbf, z_fact, bf_z_col, 1, z_col, 1 ); - - } - -} - -template -void zmat_gga_host( int32_t npts, - int32_t nbf, - const float* vrho, - const float* vgamma, - const float* basis, - const float* dbasis_x, - const float* dbasis_y, - const float* dbasis_z, - const float* dden_x, - const float* dden_y, - const float* dden_z, - float* z_matrix ); - -template -void zmat_gga_host( int32_t npts, - int32_t nbf, - const double* vrho, - const double* vgamma, - const double* basis, - const double* dbasis_x, - const double* dbasis_y, - const double* dbasis_z, - const double* dden_x, - const double* dden_y, - const double* dden_z, - double* z_matrix ); - -} -} - diff --git a/third_party/gauxc/attic/src/integrator/host/host_zmat.hpp b/third_party/gauxc/attic/src/integrator/host/host_zmat.hpp deleted file mode 100644 index ba33541..0000000 --- a/third_party/gauxc/attic/src/integrator/host/host_zmat.hpp +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator::host { - -template -void zmat_lda_host( int32_t npts, - int32_t nbf, - const F* vrho, - const F* basis, - F* z_matrix ); - -template -void zmat_gga_host( int32_t npts, - int32_t nbf, - const F* vrho, - const F* vgamma, - const F* basis, - const F* dbasis_x, - const F* dbasis_y, - const F* dbasis_z, - const F* dden_x, - const F* dden_y, - const F* dden_z, - F* z_matrix ); - -} -} diff --git a/third_party/gauxc/attic/src/integrator/host/util.hpp b/third_party/gauxc/attic/src/integrator/host/util.hpp deleted file mode 100644 index b23f66f..0000000 --- a/third_party/gauxc/attic/src/integrator/host/util.hpp +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once -#include "host/blas.hpp" -#include -#include -#include - -namespace GauXC { -namespace detail { - -template -void submat_set(int32_t M, int32_t N, int32_t MSub, - int32_t NSub, _F1 *ABig, int32_t LDAB, _F2 *ASmall, - int32_t LDAS, - std::vector> &submat_map) { - - (void)(M); - (void)(N); - (void)(MSub); - (void)(NSub); - - int32_t i(0); - for( auto& iCut : submat_map ) { - int32_t deltaI = iCut[1]; - int32_t j(0); - for( auto& jCut : submat_map ) { - int32_t deltaJ = jCut[1]; - - auto* ABig_use = ABig + iCut[0] + jCut[0] * LDAB; - auto* ASmall_use = ASmall + i + j * LDAS; - - - GauXC::blas::lacpy( 'A', deltaI, deltaJ, ABig_use, LDAB, - ASmall_use, LDAS ); - - - j += deltaJ; - } - i += deltaI; - } - - -} - -template -void inc_by_submat(int32_t M, int32_t N, int32_t MSub, - int32_t NSub, _F1 *ABig, int32_t LDAB, _F2 *ASmall, - int32_t LDAS, - std::vector> &submat_map) { - - (void)(M); - (void)(N); - (void)(MSub); - (void)(NSub); - - int32_t i(0); - for( auto& iCut : submat_map ) { - int32_t deltaI = iCut[1]; - int32_t j(0); - for( auto& jCut : submat_map ) { - int32_t deltaJ = jCut[1]; - - auto* ABig_use = ABig + iCut[0] + jCut[0] * LDAB; - auto* ASmall_use = ASmall + i + j * LDAS; - - - for( int32_t jj = 0; jj < deltaJ; ++jj ) - for( int32_t ii = 0; ii < deltaI; ++ii ) - ABig_use[ ii + jj * LDAB ] += ASmall_use[ ii + jj * LDAS ]; - - - j += deltaJ; - } - i += deltaI; - } - - -} - -} -} diff --git a/third_party/gauxc/attic/src/integrator/host/xc_host_util.cxx b/third_party/gauxc/attic/src/integrator/host/xc_host_util.cxx deleted file mode 100644 index 6f9f61d..0000000 --- 
a/third_party/gauxc/attic/src/integrator/host/xc_host_util.cxx +++ /dev/null @@ -1,211 +0,0 @@ -#include - -#include "host/host_weights.hpp" -#include "host/host_collocation.hpp" -#include "host/host_zmat.hpp" -#include "integrator_common.hpp" -#include "host/blas.hpp" -#include "host/util.hpp" - -namespace GauXC { -namespace integrator::host { - - - -template -void process_batches_host_replicated_p( - XCIntegratorState integrator_state, - XCWeightAlg weight_alg, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCHostData & host_data, - std::vector< XCTask >& tasks, - const F* P, - F* VXC, - F* exc, - F* n_el -) { - - const int32_t nbf = basis.nbf(); - - auto task_comparator = []( const XCTask& a, const XCTask& b ) { - return (a.points.size() * a.nbe) > (b.points.size() * b.nbe); - }; - std::sort( tasks.begin(), tasks.end(), task_comparator ); - - - if( not integrator_state.modified_weights_are_stored ) - partition_weights_host( weight_alg, mol, meta, tasks ); - - - std::fill( VXC, VXC + size_t(nbf)*nbf, F(0.) ); - *exc = 0.; - - size_t ntasks = tasks.size(); - for( size_t iT = 0; iT < ntasks; ++iT ) { - - auto& task = tasks[iT]; - - const int32_t npts = task.points.size(); - const int32_t nbe = task.nbe; - const int32_t nshells = task.shell_list.size(); - - const F* points = task.points.data()->data(); - const F* weights = task.weights.data(); - const int32_t* shell_list = task.shell_list.data(); - - F* basis_eval = host_data.basis_eval.data(); - F* den_eval = host_data.den_scr.data(); - F* nbe_scr = host_data.nbe_scr.data(); - F* zmat = host_data.zmat.data(); - - F* eps = host_data.eps.data(); - F* gamma = host_data.gamma.data(); - F* vrho = host_data.vrho.data(); - F* vgamma = host_data.vgamma.data(); - - F* dbasis_x_eval = nullptr; - F* dbasis_y_eval = nullptr; - F* dbasis_z_eval = nullptr; - F* dden_x_eval = nullptr; - F* dden_y_eval = nullptr; - F* dden_z_eval = nullptr; - - if( n_deriv > 0 ) { - dbasis_x_eval = basis_eval + npts * nbe; - dbasis_y_eval = dbasis_x_eval + npts * nbe; - dbasis_z_eval = dbasis_y_eval + npts * nbe; - dden_x_eval = den_eval + npts; - dden_y_eval = dden_x_eval + npts; - dden_z_eval = dden_y_eval + npts; - } - - - // Get the submatrix map for batch - auto [submat_map, foo] = gen_compressed_submat_map( basis, task.shell_list, nbf, nbf); - - - // Evaluate Collocation Matrix - if( n_deriv == 1 ) - eval_collocation_deriv1( npts, nshells, nbe, points, basis, shell_list, - basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval ); - else - eval_collocation( npts, nshells, nbe, points, basis, shell_list, basis_eval ); - - - // Extrat Submatrix - const F* den_ptr_use = P; - if( nbe != nbf ) { - detail::submat_set( nbf, nbf, nbe, nbe, P, nbf, nbe_scr, nbe, submat_map ); - den_ptr_use = nbe_scr; - } - - // Z = P * BF - GauXC::blas::gemm( 'N', 'N', nbe, npts, nbe, 1., den_ptr_use, nbe, - basis_eval, nbe, 0., zmat, nbe ); - - - // Evaluate the density - for( int32_t i = 0; i < npts; ++i ) { - - const size_t ioff = size_t(i) * nbe; - const F* zmat_i = zmat + ioff; - - den_eval[i] = - 2. * GauXC::blas::dot( nbe, basis_eval + ioff, 1, zmat_i, 1 ); - - if( n_deriv > 0 ) { - const F dx = - 4. * GauXC::blas::dot( nbe, dbasis_x_eval + ioff, 1, zmat_i, 1 ); - const F dy = - 4. * GauXC::blas::dot( nbe, dbasis_y_eval + ioff, 1, zmat_i, 1 ); - const F dz = - 4. 
* GauXC::blas::dot( nbe, dbasis_z_eval + ioff, 1, zmat_i, 1 ); - - dden_x_eval[i] = dx; - dden_y_eval[i] = dy; - dden_z_eval[i] = dz; - - gamma[i] = dx*dx + dy*dy + dz*dz; - } - - } - - - // Evaluate XC functional - if( func.is_gga() ) - func.eval_exc_vxc( npts, den_eval, gamma, eps, vrho, vgamma ); - else - func.eval_exc_vxc( npts, den_eval, eps, vrho ); - - - // Factor weights into XC results - for( int32_t i = 0; i < npts; ++i ) { - eps[i] *= weights[i]; - vrho[i] *= weights[i]; - } - - if( func.is_gga() ) - for( int32_t i = 0; i < npts; ++i ) vgamma[i] *= weights[i]; - - - - // Scalar integrations - if( n_el ) - for( int32_t i = 0; i < npts; ++i ) *n_el += weights[i] * den_eval[i]; - - for( int32_t i = 0; i < npts; ++i ) *exc += eps[i] * den_eval[i]; - - - // Assemble Z - if( func.is_gga() ) - zmat_gga_host( npts, nbe, vrho, vgamma, basis_eval, dbasis_x_eval, - dbasis_y_eval, dbasis_z_eval, dden_x_eval, dden_y_eval, - dden_z_eval, zmat ); - else - zmat_lda_host( npts, nbe, vrho, basis_eval, zmat ); - - - - // Update VXC XXX: Only LT - GauXC::blas::syr2k( 'L', 'N', nbe, npts, F(1.), basis_eval, - nbe, zmat, nbe, F(0.), nbe_scr, nbe ); - - - detail::inc_by_submat( nbf, nbf, nbe, nbe, VXC, nbf, nbe_scr, nbe, - submat_map ); - } - - // Symmetrize VXC - for( int32_t j = 0; j < nbf; ++j ) - for( int32_t i = j+1; i < nbf; ++i ) - VXC[ j + i*nbf ] = VXC[ i + j*nbf ]; - -} - - -#define HOST_IMPL( F, ND ) \ -template \ -void process_batches_host_replicated_p(\ - XCIntegratorState integrator_state, \ - XCWeightAlg weight_alg,\ - const functional_type& func,\ - const BasisSet& basis,\ - const Molecule & mol,\ - const MolMeta & meta,\ - XCHostData & host_data,\ - std::vector< XCTask >& local_work,\ - const F* P,\ - F* VXC,\ - F* exc,\ - F* n_el\ -) - -HOST_IMPL( double, 0 ); -HOST_IMPL( double, 1 ); - -} -} diff --git a/third_party/gauxc/attic/src/integrator/integrator_common.cxx b/third_party/gauxc/attic/src/integrator/integrator_common.cxx deleted file mode 100644 index 0314c32..0000000 --- a/third_party/gauxc/attic/src/integrator/integrator_common.cxx +++ /dev/null @@ -1,133 +0,0 @@ -#include "integrator_common.hpp" - -#include -#include -#include -#include - -namespace GauXC { -namespace integrator { - -std::tuple< std::vector< std::array > , std::vector< int32_t > > - gen_compressed_submat_map( const BasisSet& basis, - const std::vector< int32_t >& shell_mask, - const int32_t LDA, const int32_t block_size ) { - - - std::vector< std::pair > submat_map; - - // Init as if there is no screening - submat_map.emplace_back( - basis.shell_to_ao_range( shell_mask.front() ).first, - basis.shell_to_ao_range( shell_mask.back() ).second - ); - - - for( auto sh_it = shell_mask.begin(); sh_it != shell_mask.end()-1; ++sh_it ) { - - if( *(sh_it+1) - *(sh_it) != 1 ) { - - submat_map.back().second = basis.shell_to_ao_range(*sh_it).second; - - submat_map.emplace_back( - basis.shell_to_ao_range( *(sh_it+1) ).first, - basis.shell_to_ao_range( shell_mask.back() ).second - ); - - } - - - - } - - - if( shell_mask.size() == 1 ) - submat_map.back().second = - basis.shell_to_ao_range(shell_mask[0]).second; - - - /* - * This code block does post-processing for the submatrix optimizations - * - * It first adds the index within the small matrix as another pair in the vector. - * This allows the kernel to process multiple cuts concurrently within the same - * task. Additionally, it adds artificial breaks in the cut at the given interval - * This is to reduce the amount of bookkeeping that the kernel is required to do. 
- * - * While the small matrix start indices are stored in the additional pair, the second - * value is blank as the delta can be reused from the big matrix start and stop points. - * - * It also creates an additional vector which stores the mapping from big matrix block - * to cut index. As a kernel only processes a single block of the big matrix, it can - * look up the starting and ending cut indices and ignore all other cuts. - * - */ - std::vector< std::array > submat_map_expand; - std::vector< int32_t > submat_block_idx; - submat_block_idx.push_back(0); - const int end_point = LDA; - - int cut_index = 0; - int cut_expand_index = 0; - int small_index = 0; - int delta; - for (int block_start = 0; block_start < end_point; block_start += block_size) { - const int block_end = block_start + block_size; - - int cut_start = submat_map[cut_index].first; - int cut_end = submat_map[cut_index].second; - while (cut_index < submat_map.size() && cut_start < block_end) { - if (cut_start < block_start && cut_end < block_start) { - // In this case the cut starts and stops before the block starts. - // This should never happen as the cut should already have been processed. - // But I included this case as a sanity check. - std::cout << "Something is wrong constructing the extended cut map " << std::endl; - } else if (cut_start < block_start && cut_end > block_end) { - // In this case, the cut spans the entire block. The cut index is not - // incremented because we need to process the rest of it. - delta = block_end - block_start; - submat_map_expand.push_back({block_start, delta, small_index}); - small_index += delta; - - cut_expand_index++; - break; - } else if (cut_start < block_start) { - // In this case the cut begins before the block, but ends within - // this block - delta = cut_end - block_start; - submat_map_expand.push_back({block_start, delta, small_index}); - small_index += delta; - - cut_index++; - cut_expand_index++; - } else if (cut_end > block_end) { - // In this case, the cut starts within the block, but extends - // into the next block. 
Again, the cut index is not incremented - delta = block_end - cut_start; - submat_map_expand.push_back({cut_start, delta, small_index}); - small_index += delta; - - cut_expand_index++; - break; - } else { - // In this case, the cut starts and ends within the block - delta = cut_end - cut_start; - submat_map_expand.push_back({cut_start, delta, small_index}); - small_index += delta; - - cut_index++; - cut_expand_index++; - } - - cut_start = submat_map[cut_index].first; - cut_end = submat_map[cut_index].second; - } - submat_block_idx.push_back(cut_expand_index); - } - return {submat_map_expand, submat_block_idx}; -} - - - -} -} diff --git a/third_party/gauxc/attic/src/integrator/integrator_common.hpp b/third_party/gauxc/attic/src/integrator/integrator_common.hpp deleted file mode 100644 index a674003..0000000 --- a/third_party/gauxc/attic/src/integrator/integrator_common.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "integrator_constants.hpp" -#include - -namespace GauXC { -namespace integrator { - -std::tuple< std::vector< std::array > , std::vector< int32_t > > - gen_compressed_submat_map( const BasisSet& basis_set, - const std::vector< int32_t >& shell_mask, - const int32_t LDA, const int32_t block_size ); - - -} -} diff --git a/third_party/gauxc/attic/src/integrator/integrator_constants.hpp b/third_party/gauxc/attic/src/integrator/integrator_constants.hpp deleted file mode 100644 index f7ee152..0000000 --- a/third_party/gauxc/attic/src/integrator/integrator_constants.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -namespace GauXC { -namespace integrator { - -template -constexpr F magic_ssf_factor = 0.64; - -constexpr double ssf_weight_tol = 1e-10; - -} -} diff --git a/third_party/gauxc/attic/src/load_balancer_defaults.hpp b/third_party/gauxc/attic/src/load_balancer_defaults.hpp deleted file mode 100644 index 0327437..0000000 --- a/third_party/gauxc/attic/src/load_balancer_defaults.hpp +++ /dev/null @@ -1,17 +0,0 @@ -#include "load_balancer/host/replicated_load_balancer.hpp" -#include "load_balancer/cuda/replicated_load_balancer.hpp" - -namespace GauXC { -namespace detail { - -template -std::unique_ptr make_default_load_balancer(Args&&... args) { -//#ifdef GAUXC_ENABLE_CUDA -// return std::make_unique( std::forward(args)... ); -//#else - return std::make_unique( std::forward(args)... 
); -//#endif -} - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/CMakeLists.txt b/third_party/gauxc/attic/src/new_integrator/CMakeLists.txt deleted file mode 100644 index 6f8ecdf..0000000 --- a/third_party/gauxc/attic/src/new_integrator/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -# Implementations of generic interfaces -target_sources( gauxc PRIVATE replicated_xc_integrator_impl.cxx ) - -target_include_directories( gauxc - PUBLIC - $ -) - -# Common Utilities -include( common/gauxc-common.cmake ) - -# Host Integrator Utilities -if( GAUXC_ENABLE_HOST ) - include( host/gauxc-host.cmake ) -endif() - -# Device Integrator Utilities -if( GAUXC_ENABLE_DEVICE ) - include( device/gauxc-device.cmake ) -endif() diff --git a/third_party/gauxc/attic/src/new_integrator/common/gauxc-common.cmake b/third_party/gauxc/attic/src/new_integrator/common/gauxc-common.cmake deleted file mode 100644 index 5ec2bdc..0000000 --- a/third_party/gauxc/attic/src/new_integrator/common/gauxc-common.cmake +++ /dev/null @@ -1,3 +0,0 @@ -# Common Integrator Utilities -target_sources( gauxc PRIVATE common/integrator_common.cxx ) - diff --git a/third_party/gauxc/attic/src/new_integrator/common/integrator_common.cxx b/third_party/gauxc/attic/src/new_integrator/common/integrator_common.cxx deleted file mode 100644 index b76b600..0000000 --- a/third_party/gauxc/attic/src/new_integrator/common/integrator_common.cxx +++ /dev/null @@ -1,133 +0,0 @@ -#include "integrator_common.hpp" - -#include -#include -#include -#include - -namespace GauXC { -namespace integrator { - -std::tuple< std::vector< std::array > , std::vector< int32_t > > - gen_compressed_submat_map( const BasisSetMap& basis_map, - const std::vector< int32_t >& shell_mask, - const int32_t LDA, const int32_t block_size ) { - - - std::vector< std::pair > submat_map; - - // Init as if there is no screening - submat_map.emplace_back( - basis_map.shell_to_ao_range( shell_mask.front() ).first, - basis_map.shell_to_ao_range( shell_mask.back() ).second - ); - - - for( auto sh_it = shell_mask.begin(); sh_it != shell_mask.end()-1; ++sh_it ) { - - if( *(sh_it+1) - *(sh_it) != 1 ) { - - submat_map.back().second = basis_map.shell_to_ao_range(*sh_it).second; - - submat_map.emplace_back( - basis_map.shell_to_ao_range( *(sh_it+1) ).first, - basis_map.shell_to_ao_range( shell_mask.back() ).second - ); - - } - - - - } - - - if( shell_mask.size() == 1 ) - submat_map.back().second = - basis_map.shell_to_ao_range(shell_mask[0]).second; - - - /* - * This code block does post-processing for the submatrix optimizations - * - * It first adds the index within the small matrix as another pair in the vector. - * This allows the kernel to process multiple cuts concurrently within the same - * task. Additionally, it adds artificial breaks in the cut at the given interval - * This is to reduce the amount of bookkeeping that the kernel is required to do. - * - * While the small matrix start indices are stored in the additional pair, the second - * value is blank as the delta can be reused from the big matrix start and stop points. - * - * It also creates an additional vector which stores the mapping from big matrix block - * to cut index. As a kernel only processes a single block of the big matrix, it can - * look up the starting and ending cut indices and ignore all other cuts. 
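 * (Concretely: each entry of submat_map_expand is {start in the big matrix, length, start in the small matrix},
 *  and consecutive entries of submat_block_idx bracket the expanded cuts belonging to each big-matrix block.)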
- * - */ - std::vector< std::array > submat_map_expand; - std::vector< int32_t > submat_block_idx; - submat_block_idx.push_back(0); - const int end_point = LDA; - - int cut_index = 0; - int cut_expand_index = 0; - int small_index = 0; - int delta; - for (int block_start = 0; block_start < end_point; block_start += block_size) { - const int block_end = block_start + block_size; - - int cut_start = submat_map[cut_index].first; - int cut_end = submat_map[cut_index].second; - while (cut_index < submat_map.size() && cut_start < block_end) { - if (cut_start < block_start && cut_end < block_start) { - // In this case the cut starts and stops before the block starts. - // This should never happen as the cut should already have been processed. - // But I included this case as a sanity check. - std::cout << "Something is wrong constructing the extended cut map " << std::endl; - } else if (cut_start < block_start && cut_end > block_end) { - // In this case, the cut spans the entire block. The cut index is not - // incremented because we need to process the rest of it. - delta = block_end - block_start; - submat_map_expand.push_back({block_start, delta, small_index}); - small_index += delta; - - cut_expand_index++; - break; - } else if (cut_start < block_start) { - // In this case the cut begins before the block, but ends within - // this block - delta = cut_end - block_start; - submat_map_expand.push_back({block_start, delta, small_index}); - small_index += delta; - - cut_index++; - cut_expand_index++; - } else if (cut_end > block_end) { - // In this case, the cut starts within the block, but extends - // into the next block. Again, the cut index is not incremented - delta = block_end - cut_start; - submat_map_expand.push_back({cut_start, delta, small_index}); - small_index += delta; - - cut_expand_index++; - break; - } else { - // In this case, the cut starts and ends within the block - delta = cut_end - cut_start; - submat_map_expand.push_back({cut_start, delta, small_index}); - small_index += delta; - - cut_index++; - cut_expand_index++; - } - - cut_start = submat_map[cut_index].first; - cut_end = submat_map[cut_index].second; - } - submat_block_idx.push_back(cut_expand_index); - } - return {submat_map_expand, submat_block_idx}; -} - - - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/common/integrator_common.hpp b/third_party/gauxc/attic/src/new_integrator/common/integrator_common.hpp deleted file mode 100644 index b9c3ed3..0000000 --- a/third_party/gauxc/attic/src/new_integrator/common/integrator_common.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "integrator_constants.hpp" -#include - -namespace GauXC { -namespace integrator { - -std::tuple< std::vector< std::array >, std::vector< int32_t > > - gen_compressed_submat_map( const BasisSetMap& basis_set, - const std::vector< int32_t >& shell_mask, - const int32_t LDA, const int32_t block_size ); - - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/common/integrator_constants.hpp b/third_party/gauxc/attic/src/new_integrator/common/integrator_constants.hpp deleted file mode 100644 index f7ee152..0000000 --- a/third_party/gauxc/attic/src/new_integrator/common/integrator_constants.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -namespace GauXC { -namespace integrator { - -template -constexpr F magic_ssf_factor = 0.64; - -constexpr double ssf_weight_tol = 1e-10; - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/buffer_adaptor.hpp 
b/third_party/gauxc/attic/src/new_integrator/device/buffer_adaptor.hpp deleted file mode 100644 index 130f1f1..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/buffer_adaptor.hpp +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -namespace GauXC { - -class buffer_adaptor { - - size_t nalloc_; - size_t nleft_; - void* top_; - void* stack_; - -public: - - buffer_adaptor() = delete; - - inline buffer_adaptor( void* ptr, size_t len ) : - nalloc_(len), - nleft_(len), - top_(ptr), - stack_(ptr) { } - - template - T* aligned_alloc( size_t len, - size_t align = alignof(T) ) { - - char* old_stack = (char*)stack_; - if( std::align( align, - len*sizeof(T), - stack_, - nleft_ ) ) { - - T* result = reinterpret_cast(stack_); - stack_ = (char*)stack_ + len*sizeof(T); - nleft_ -= std::distance( old_stack, - (char*)stack_ ); - return result; - - } - - throw std::bad_alloc(); - - } - - inline void* stack() const {return stack_;} - inline size_t nleft() const { return nleft_; } - -}; - - -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_angular_cartesian.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_angular_cartesian.hpp deleted file mode 100644 index 32088f5..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_angular_cartesian.hpp +++ /dev/null @@ -1,308 +0,0 @@ -#pragma once -#include "collocation_device_constants.hpp" -#include - -#ifndef GPGAUEVAL_INLINE -# define GPGAUEVAL_INLINE __noinline__ -#endif - -namespace GauXC { -namespace integrator { -namespace cuda { - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_0( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_0_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf_x; - - eval_y[npts * 0] = bf_y; - - eval_z[npts * 0] = bf_z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_1( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*x; - eval[npts * 1] = bf*y; - eval[npts * 2] = bf*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_1_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf + bf_x*x; - eval_x[npts * 1] = bf_x*y; - eval_x[npts * 2] = bf_x*z; - - eval_y[npts * 0] = bf_y*x; - eval_y[npts * 1] = bf + bf_y*y; - eval_y[npts * 2] = bf_y*z; - - eval_z[npts * 0] = bf_z*x; - eval_z[npts * 1] = bf_z*y; - eval_z[npts * 2] = bf + bf_z*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_2( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*x*x; - eval[npts * 1] = bf*x*y; - eval[npts * 2] = bf*x*z; - eval[npts * 3] = bf*y*y; - eval[npts * 4] = bf*y*z; - eval[npts * 5] = bf*z*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_2_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - 
const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = x*(2*bf + bf_x*x); - eval_x[npts * 1] = y*(bf + bf_x*x); - eval_x[npts * 2] = z*(bf + bf_x*x); - eval_x[npts * 3] = bf_x*y*y; - eval_x[npts * 4] = bf_x*y*z; - eval_x[npts * 5] = bf_x*z*z; - - eval_y[npts * 0] = bf_y*x*x; - eval_y[npts * 1] = x*(bf + bf_y*y); - eval_y[npts * 2] = bf_y*x*z; - eval_y[npts * 3] = y*(2*bf + bf_y*y); - eval_y[npts * 4] = z*(bf + bf_y*y); - eval_y[npts * 5] = bf_y*z*z; - - eval_z[npts * 0] = bf_z*x*x; - eval_z[npts * 1] = bf_z*x*y; - eval_z[npts * 2] = x*(bf + bf_z*z); - eval_z[npts * 3] = bf_z*y*y; - eval_z[npts * 4] = y*(bf + bf_z*z); - eval_z[npts * 5] = z*(2*bf + bf_z*z); - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_3( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*x*x*x; - eval[npts * 1] = bf*x*x*y; - eval[npts * 2] = bf*x*x*z; - eval[npts * 3] = bf*x*y*y; - eval[npts * 4] = bf*x*y*z; - eval[npts * 5] = bf*x*z*z; - eval[npts * 6] = bf*y*y*y; - eval[npts * 7] = bf*y*y*z; - eval[npts * 8] = bf*y*z*z; - eval[npts * 9] = bf*z*z*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_3_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = x*x*(3*bf + bf_x*x); - eval_x[npts * 1] = x*y*(2*bf + bf_x*x); - eval_x[npts * 2] = x*z*(2*bf + bf_x*x); - eval_x[npts * 3] = y*y*(bf + bf_x*x); - eval_x[npts * 4] = y*z*(bf + bf_x*x); - eval_x[npts * 5] = z*z*(bf + bf_x*x); - eval_x[npts * 6] = bf_x*y*y*y; - eval_x[npts * 7] = bf_x*y*y*z; - eval_x[npts * 8] = bf_x*y*z*z; - eval_x[npts * 9] = bf_x*z*z*z; - - eval_y[npts * 0] = bf_y*x*x*x; - eval_y[npts * 1] = x*x*(bf + bf_y*y); - eval_y[npts * 2] = bf_y*x*x*z; - eval_y[npts * 3] = x*y*(2*bf + bf_y*y); - eval_y[npts * 4] = x*z*(bf + bf_y*y); - eval_y[npts * 5] = bf_y*x*z*z; - eval_y[npts * 6] = y*y*(3*bf + bf_y*y); - eval_y[npts * 7] = y*z*(2*bf + bf_y*y); - eval_y[npts * 8] = z*z*(bf + bf_y*y); - eval_y[npts * 9] = bf_y*z*z*z; - - eval_z[npts * 0] = bf_z*x*x*x; - eval_z[npts * 1] = bf_z*x*x*y; - eval_z[npts * 2] = x*x*(bf + bf_z*z); - eval_z[npts * 3] = bf_z*x*y*y; - eval_z[npts * 4] = x*y*(bf + bf_z*z); - eval_z[npts * 5] = x*z*(2*bf + bf_z*z); - eval_z[npts * 6] = bf_z*y*y*y; - eval_z[npts * 7] = y*y*(bf + bf_z*z); - eval_z[npts * 8] = y*z*(2*bf + bf_z*z); - eval_z[npts * 9] = z*z*(3*bf + bf_z*z); - -} - - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular( - const int32_t npts, - const int32_t l, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - if( l == 0 ) { - - collocation_cartesian_angular_0( npts, bf, x, y, z, eval ); - - } else if( l == 1 ) { - - collocation_cartesian_angular_1( npts, bf, x, y, z, eval ); - - } else if( l == 2 ) { - - collocation_cartesian_angular_2( npts, bf, x, y, z, eval ); - - } else if( l == 3 ) { - - collocation_cartesian_angular_3( npts, bf, x, y, z, eval ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_cartesian_angular - - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_deriv1( - const int32_t npts, - const int32_t l, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ 
eval, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - - if( l == 0 ) { - - collocation_cartesian_angular_0( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_0_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 1 ) { - - collocation_cartesian_angular_1( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_1_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 2 ) { - - collocation_cartesian_angular_2( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_2_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 3 ) { - - collocation_cartesian_angular_3( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_cartesian_angular_deriv1 - - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_angular_spherical_unnorm.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_angular_spherical_unnorm.hpp deleted file mode 100644 index 9de5f11..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_angular_spherical_unnorm.hpp +++ /dev/null @@ -1,292 +0,0 @@ -#pragma once -#include "collocation_device_constants.hpp" -#include - -#ifndef GPGAUEVAL_INLINE -# define GPGAUEVAL_INLINE __noinline__ -#endif - -namespace GauXC { -namespace integrator { -namespace cuda { - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_0( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_0_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf_x; - - eval_y[npts * 0] = bf_y; - - eval_z[npts * 0] = bf_z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_1( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*y; - eval[npts * 1] = bf*z; - eval[npts * 2] = bf*x; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_1_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf_x*y; - eval_x[npts * 1] = bf_x*z; - eval_x[npts * 2] = bf + bf_x*x; - - eval_y[npts * 0] = bf + bf_y*y; - eval_y[npts * 1] = bf_y*z; - eval_y[npts * 2] = bf_y*x; - - eval_z[npts * 0] = bf_z*y; - eval_z[npts * 1] = bf + bf_z*z; - eval_z[npts * 2] = bf_z*x; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_2( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = sqrt_3*bf*x*y; - eval[npts * 1] = sqrt_3*bf*y*z; - eval[npts * 2] = bf*(-x*x - y*y + 2*z*z)/2; - eval[npts * 3] = sqrt_3*bf*x*z; - eval[npts * 4] = sqrt_3*bf*(x*x - y*y)/2; - -} - -template -GPGAUEVAL_INLINE __device__ void 
collocation_spherical_unnorm_angular_2_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = sqrt_3*y*(bf + bf_x*x); - eval_x[npts * 1] = sqrt_3*bf_x*y*z; - eval_x[npts * 2] = -bf*x - bf_x*(x*x + y*y - 2*z*z)/2; - eval_x[npts * 3] = sqrt_3*z*(bf + bf_x*x); - eval_x[npts * 4] = sqrt_3*(bf*x + bf_x*(x*x - y*y)/2); - - eval_y[npts * 0] = sqrt_3*x*(bf + bf_y*y); - eval_y[npts * 1] = sqrt_3*z*(bf + bf_y*y); - eval_y[npts * 2] = -bf*y - bf_y*(x*x + y*y - 2*z*z)/2; - eval_y[npts * 3] = sqrt_3*bf_y*x*z; - eval_y[npts * 4] = sqrt_3*(-bf*y + bf_y*(x*x - y*y)/2); - - eval_z[npts * 0] = sqrt_3*bf_z*x*y; - eval_z[npts * 1] = sqrt_3*y*(bf + bf_z*z); - eval_z[npts * 2] = 2*bf*z - bf_z*(x*x + y*y - 2*z*z)/2; - eval_z[npts * 3] = sqrt_3*x*(bf + bf_z*z); - eval_z[npts * 4] = sqrt_3*bf_z*(x*x - y*y)/2; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = sqrt_10*bf*y*(3*x*x - y*y)/4; - eval[npts * 1] = sqrt_15*bf*x*y*z; - eval[npts * 2] = sqrt_6*bf*y*(-x*x - y*y + 4*z*z)/4; - eval[npts * 3] = bf*z*(-3*x*x - 3*y*y + 2*z*z)/2; - eval[npts * 4] = sqrt_6*bf*x*(-x*x - y*y + 4*z*z)/4; - eval[npts * 5] = sqrt_15*bf*z*(x*x - y*y)/2; - eval[npts * 6] = sqrt_10*bf*x*(x*x - 3*y*y)/4; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = sqrt_10*y*(6*bf*x + bf_x*(3*x*x - y*y))/4; - eval_x[npts * 1] = sqrt_15*y*z*(bf + bf_x*x); - eval_x[npts * 2] = -sqrt_6*y*(2*bf*x + bf_x*(x*x + y*y - 4*z*z))/4; - eval_x[npts * 3] = -z*(6*bf*x + bf_x*(3*x*x + 3*y*y - 2*z*z))/2; - eval_x[npts * 4] = -sqrt_6*(bf*(3*x*x + y*y - 4*z*z) + bf_x*x*(x*x + y*y - 4*z*z))/4; - eval_x[npts * 5] = sqrt_15*z*(2*bf*x + bf_x*(x*x - y*y))/2; - eval_x[npts * 6] = sqrt_10*(3*bf*(x*x - y*y) + bf_x*x*(x*x - 3*y*y))/4; - - eval_y[npts * 0] = sqrt_10*(-3*bf*(-x*x + y*y) + bf_y*y*(3*x*x - y*y))/4; - eval_y[npts * 1] = sqrt_15*x*z*(bf + bf_y*y); - eval_y[npts * 2] = -sqrt_6*(bf*(x*x + 3*y*y - 4*z*z) + bf_y*y*(x*x + y*y - 4*z*z))/4; - eval_y[npts * 3] = -z*(6*bf*y + bf_y*(3*x*x + 3*y*y - 2*z*z))/2; - eval_y[npts * 4] = -sqrt_6*x*(2*bf*y + bf_y*(x*x + y*y - 4*z*z))/4; - eval_y[npts * 5] = sqrt_15*z*(-2*bf*y + bf_y*(x*x - y*y))/2; - eval_y[npts * 6] = sqrt_10*x*(-6*bf*y + bf_y*(x*x - 3*y*y))/4; - - eval_z[npts * 0] = sqrt_10*bf_z*y*(3*x*x - y*y)/4; - eval_z[npts * 1] = sqrt_15*x*y*(bf + bf_z*z); - eval_z[npts * 2] = sqrt_6*y*(8*bf*z - bf_z*(x*x + y*y - 4*z*z))/4; - eval_z[npts * 3] = -3*bf*(x*x + y*y - 2*z*z)/2 - bf_z*z*(3*x*x + 3*y*y - 2*z*z)/2; - eval_z[npts * 4] = sqrt_6*x*(8*bf*z - bf_z*(x*x + y*y - 4*z*z))/4; - eval_z[npts * 5] = sqrt_15*(bf + bf_z*z)*(x*x - y*y)/2; - eval_z[npts * 6] = sqrt_10*bf_z*x*(x*x - 3*y*y)/4; - -} - - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular( - const int32_t npts, - const int32_t l, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - if( l == 0 ) { - - collocation_spherical_unnorm_angular_0( npts, bf, x, y, z, eval ); - - } else if( l == 1 ) { - - 
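    // l == 1: real p functions stored in (y, z, x) order, matching collocation_spherical_unnorm_angular_1 above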
collocation_spherical_unnorm_angular_1( npts, bf, x, y, z, eval ); - - } else if( l == 2 ) { - - collocation_spherical_unnorm_angular_2( npts, bf, x, y, z, eval ); - - } else if( l == 3 ) { - - collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_spherical_unnorm_angular - - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_deriv1( - const int32_t npts, - const int32_t l, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - - if( l == 0 ) { - - collocation_spherical_unnorm_angular_0( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_0_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 1 ) { - - collocation_spherical_unnorm_angular_1( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_1_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 2 ) { - - collocation_spherical_unnorm_angular_2( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_2_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 3 ) { - - collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_spherical_unnorm_angular_deriv1 - - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_device_constants.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_device_constants.hpp deleted file mode 100644 index ef3fb6b..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_device_constants.hpp +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -namespace GauXC { -namespace integrator { -namespace cuda { - - constexpr double sqrt_15 = 3.872983346207417; - constexpr double sqrt_3 = 1.7320508075688772; - constexpr double sqrt_6 = 2.449489742783178; - constexpr double sqrt_10 = 3.1622776601683795; - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_radial.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_radial.hpp deleted file mode 100644 index 03d8efb..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/collocation_radial.hpp +++ /dev/null @@ -1,97 +0,0 @@ -#include -#include - -#include - - -namespace GauXC { -namespace integrator { -namespace cuda { - -__inline__ __device__ void collocation_device_radial_eval( - const Shell& shell, - const double* pt, - double* x, - double* y, - double* z, - double* eval_device -) { - - const auto* O = shell.O_data(); - const auto* alpha = shell.alpha_data(); - const auto* coeff = shell.coeff_data(); - - const double xc = pt[0] - O[0]; - const double yc = pt[1] - O[1]; - const double zc = pt[2] - O[2]; - *x = xc; - *y = yc; - *z = zc; - - const double rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - double tmp = 0.; - for( uint32_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - 
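  // tmp now holds the contracted radial factor sum_i coeff[i] * exp(-alpha[i] * r^2),
  // with r measured from the shell origin O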
*eval_device = tmp; - -} - - - -__inline__ __device__ void collocation_device_radial_eval_deriv1( - const Shell& shell, - const double* pt, - double* x, - double* y, - double* z, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z -) { - - const auto* O = shell.O_data(); - const auto* alpha = shell.alpha_data(); - const auto* coeff = shell.coeff_data(); - - const double xc = pt[0] - O[0]; - const double yc = pt[1] - O[1]; - const double zc = pt[2] - O[2]; - *x = xc; - *y = yc; - *z = zc; - - const double rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - double tmp = 0.; - double tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( uint32_t i = 0; i < nprim; ++i ) { - - const double a = alpha[i]; - const double e = coeff[i] * std::exp( - a * rsq ); - - const double ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - *eval_device = tmp; - *deval_device_x = tmp_x; - *deval_device_y = tmp_y; - *deval_device_z = tmp_z; - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - - diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/deprecated/gaueval_kernels_template.cu b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/deprecated/gaueval_kernels_template.cu deleted file mode 100644 index c9d0a8c..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/deprecated/gaueval_kernels_template.cu +++ /dev/null @@ -1,129 +0,0 @@ -//#include -#include -#include - -#include "gaueval_kernels.hpp" -#include "gaueval_angular_cartesian.hpp" -#include "gaueval_angular_spherical.hpp" -#include "gaueval_angular_spherical_unnorm.hpp" - -namespace GauXC { - -__global__ -void gaueval_device_$(ang_name)_kernel( - size_t nshells, - size_t nbf, - size_t npts, - const StaticShell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* O = device::array_data( shell.O ); - const auto* alpha = device::array_data( shell.alpha ); - const auto* coeff = device::array_data( shell.coeff ); - - const double xc = pt[0] - O[0]; - const double yc = pt[1] - O[1]; - const double zc = pt[2] - O[2]; - - const double rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim; - double tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - double * bf_eval = eval_device + ibf + ipt*nbf; - gaueval_$(ang_name)_angular( shell.l, tmp, xc, yc, zc, bf_eval ); - - } - -} - - - -__global__ -void gaueval_device_$(ang_name)_kernel_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const StaticShell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* 
pt = pts_device + 3*ipt; - - - const auto* O = device::array_data( shell.O ); - const auto* alpha = device::array_data( shell.alpha ); - const auto* coeff = device::array_data( shell.coeff ); - - const double xc = pt[0] - O[0]; - const double yc = pt[1] - O[1]; - const double zc = pt[2] - O[2]; - - const double rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim; - double tmp = 0.; - double tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const double a = alpha[i]; - const double e = coeff[i] * std::exp( - a * rsq ); - - const double ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - double * bf_eval = eval_device + ibf + ipt*nbf; - double * dx_eval = deval_device_x + ibf + ipt*nbf; - double * dy_eval = deval_device_y + ibf + ipt*nbf; - double * dz_eval = deval_device_z + ibf + ipt*nbf; - - gaueval_$(ang_name)_angular_deriv1( shell.l, tmp, tmp_x, tmp_y, tmp_z, xc, yc, zc, bf_eval, dx_eval, dy_eval, dz_eval ); - - } - - -} - - -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/deprecated/generate_bfeval.py b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/deprecated/generate_bfeval.py deleted file mode 100644 index 178a979..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/deprecated/generate_bfeval.py +++ /dev/null @@ -1,440 +0,0 @@ -import cmath -import math -import os -import re -import sys -from math import factorial as fact - -import sympy -from scipy.special import binom as binomial -from sympy import I as symb_I -from sympy import exp as symb_exp -from sympy import factorial as symb_fact -from sympy import factorial2 as symb_fact2 - - -def generate_cartesian_ls(L): - l = [] - for i in range(L + 1): - lx = L - i - for j in range(i + 1): - ly = i - j - lz = L - lx - ly - - l.append([0, 0, 0]) - - for k in range(lx - 1): - l[-1][0] = l[-1][0] + 1 - for k in range(ly - 1): - l[-1][1] = l[-1][1] + 1 - for k in range(lz - 1): - l[-1][2] = l[-1][2] + 1 - - if lx > 0: - l[-1][0] = l[-1][0] + 1 - if ly > 0: - l[-1][1] = l[-1][1] + 1 - if lz > 0: - l[-1][2] = l[-1][2] + 1 - - return l - - -def generate_spherical_coeff(l, m, lx, ly, lz): - j = lx + ly - abs(m) - if j % 2 == 0: - j = int(j / 2) - else: - return 0.0 - - prefactor = fact(2.0 * lx) * fact(2.0 * ly) * fact(2.0 * lz) * fact(l) - prefactor = prefactor * fact(l - abs(m)) - prefactor = prefactor / (fact(2.0 * l) * fact(lx) * fact(ly) * fact(lz)) - prefactor = prefactor / fact(l + abs(m)) - prefactor = math.sqrt(prefactor) - - term1 = 0.0 - for i in range(int((l - abs(m)) / 2) + 1): - term1 = term1 + binomial(l, i) * binomial(i, j) * math.pow(-1, i) * fact( - 2 * l - 2 * i - ) / fact(l - abs(m) - 2 * i) - - term1 = term1 / math.pow(2, l) / fact(l) - - m_fact = 1.0 - if m < 0: - m_fact = -1.0 - - term2 = 0.0 + 0.0j - for k in range(j + 1): - z = cmath.exp(m_fact * math.pi / 2.0 * (abs(m) - lx + 2 * k) * 1.0j) - term2 = term2 + binomial(j, k) * binomial(abs(m), lx - 2 * k) * z - - val = prefactor * term1 * term2 - - if abs(val.real) < 1e-10: - val = 0.0 + val.imag * 1j - if abs(val.imag) < 1e-10: - val = val.real - - return val - - -def generate_spherical_coeff_symb(l, m, lx, ly, lz, unnorm=False): - j = lx + ly - abs(m) - if j % 2 == 0: - j = int(j / 2) - else: - return sympy.Integer(0) - - j_symb = sympy.Integer(j) - l_symb = sympy.Integer(l) - m_symb = sympy.Integer(abs(m)) - lx_symb = sympy.Integer(lx) - ly_symb = sympy.Integer(ly) - lz_symb = sympy.Integer(lz) 
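    # prefactor below = sqrt( (2lx)! (2ly)! (2lz)! l! (l-|m|)! / ( (2l)! lx! ly! lz! (l+|m|)! ) )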
- - prefactor = ( - symb_fact(2 * lx_symb) - * symb_fact(2 * ly_symb) - * symb_fact(2 * lz_symb) - * symb_fact(l_symb) - ) - prefactor = prefactor * symb_fact(l_symb - m_symb) - prefactor = prefactor / ( - symb_fact(2 * l_symb) - * symb_fact(lx_symb) - * symb_fact(ly_symb) - * symb_fact(lz_symb) - ) - prefactor = prefactor / symb_fact(l_symb + m_symb) - - # Ed's stupid normalization convention... - if unnorm: - prefactor = ( - prefactor - * symb_fact2(2 * l - 1) - / symb_fact2(2 * lx - 1) - / symb_fact2(2 * ly - 1) - / symb_fact2(2 * lz - 1) - ) - - prefactor = sympy.sqrt(prefactor) - - term1 = sympy.Integer(0) - for i in range(int((l - abs(m)) / 2) + 1): - term1 = term1 + sympy.Integer(binomial(l, i)) * sympy.Integer( - binomial(i, j) - ) * sympy.Integer(math.pow(-1, i)) * symb_fact( - 2 * l_symb - sympy.Integer(2 * i) - ) / symb_fact(l_symb - m_symb - sympy.Integer(2 * i)) - - term1 = term1 / (2**l_symb) / symb_fact(l) - - m_fact_symb = sympy.Integer(1) - if m < 0: - m_fact_symb = -m_fact_symb - - term2 = sympy.Integer(0) - for k in range(j + 1): - z = sympy.exp( - m_fact_symb - * sympy.pi - / 2 - * (m_symb - lx_symb + sympy.Integer(2 * k)) - * symb_I - ) - term2 = ( - term2 - + sympy.Integer(binomial(j, k)) - * sympy.Integer(binomial(abs(m), lx - 2 * k)) - * z - ) - - return prefactor * term1 * term2 - - -def generate_cartesian_angular(ls): - [x, y, z, r] = sympy.symbols("x y z r", real=True) - - ang = [] - - for l in ls: - ang.append(r) - for i in range(l[0]): - ang[-1] = ang[-1] * x - for i in range(l[1]): - ang[-1] = ang[-1] * y - for i in range(l[2]): - ang[-1] = ang[-1] * z - - ang[-1] = ang[-1] / r - - return ang - - -def generate_spherical_angular(L, unnorm=False): - ls = generate_cartesian_ls(L) - angs = generate_cartesian_angular(ls) - - # r = sympy.symbols( 'r' ) - sph_angs = [] - for m in range(L + 1): - tmp_p = 0 - tmp_m = 0 - for i in range(len(ls)): - l = ls[i] - ang = angs[i] - - # c = generate_spherical_coeff( L, m, l[0],l[1],l[2] ) - c = generate_spherical_coeff_symb(L, m, l[0], l[1], l[2], unnorm) - - if m == 0: - tmp_p = tmp_p + c * ang - - else: - c_p = (c + sympy.conjugate(c)) / sympy.sqrt(2) - c_m = (c - sympy.conjugate(c)) / sympy.sqrt(2) / symb_I - - tmp_p = tmp_p + c_p * ang - tmp_m = tmp_m + c_m * ang - - sph_angs.append((m, tmp_p)) - if m > 0: - sph_angs.append((-m, tmp_m)) - - sph_angs = sorted(sph_angs, key=lambda x: x[0]) - - sph_angs_bare = [] - for a in sph_angs: - sph_angs_bare.append(sympy.simplify(a[1])) - - return sph_angs_bare - - -def generate_eval_lines(L, ang): - [x, y, z, r] = sympy.symbols("x y z r", real=True) - [bf, bf_x, bf_y, bf_z] = sympy.symbols("bf bf_x bf_y bf_z", real=True) - - bf_eval_strs = [] - bf_x_eval_strs = [] - bf_y_eval_strs = [] - bf_z_eval_strs = [] - - for j in range(len(ang)): - a = ang[j] - a_x = sympy.diff(a, x) - a_y = sympy.diff(a, y) - a_z = sympy.diff(a, z) - - bf_eval = sympy.simplify(a * bf) - bf_x_eval = sympy.simplify(a_x * bf + a * bf_x) - bf_y_eval = sympy.simplify(a_y * bf + a * bf_y) - bf_z_eval = sympy.simplify(a_z * bf + a * bf_z) - - bf_eval_str = "eval[{}] = {};".format(j, bf_eval) - bf_x_eval_str = "eval_x[{}] = {};".format(j, bf_x_eval) - bf_y_eval_str = "eval_y[{}] = {};".format(j, bf_y_eval) - bf_z_eval_str = "eval_z[{}] = {};".format(j, bf_z_eval) - - if L >= 2: - for k in range(2, L + 1): - for X in ("x", "y", "z"): - pow_str = X + "**" + str(k) - repl_str = "" - for K in range(k - 1): - repl_str = repl_str + X + "*" - repl_str = repl_str + X - - bf_eval_str = bf_eval_str.replace(pow_str, repl_str) - 
bf_x_eval_str = bf_x_eval_str.replace(pow_str, repl_str) - bf_y_eval_str = bf_y_eval_str.replace(pow_str, repl_str) - bf_z_eval_str = bf_z_eval_str.replace(pow_str, repl_str) - - bf_eval_strs.append(bf_eval_str) - bf_x_eval_strs.append(bf_x_eval_str) - bf_y_eval_strs.append(bf_y_eval_str) - bf_z_eval_strs.append(bf_z_eval_str) - - return (bf_eval_strs, bf_x_eval_strs, bf_y_eval_strs, bf_z_eval_strs) - - -cart_header_fname = "gaueval_angular_cartesian.hpp" -sphr_header_fname = "gaueval_angular_spherical.hpp" -cons_header_fname = "gaueval_device_constants.hpp" - -cart_header_file = open(cart_header_fname, "w") -sphr_header_file = open(sphr_header_fname, "w") -cons_header_file = open(cons_header_fname, "w") - -L_max = 4 -do_libint_norm = False -# do_libint_norm = True - -preamble = """ -#pragma once -#include "gaueval_device_constants.hpp" - -#define GPGAUEVAL_INLINE __inline__ - -namespace GauXC { -""" - - -cart_header_file.write(preamble) -sphr_header_file.write(preamble) - -cartesian_bf_template = """ -GPGAUEVAL_INLINE __device__ void generate_cartesian_angular{}( - const double bf, - const double x, - const double y, - const double z, - double* eval -) {{ -""" - -cartesian_bf_deriv1_template = """ -GPGAUEVAL_INLINE __device__ void generate_cartesian_angular{}_deriv1( - const double bf, - const double bf_x, - const double bf_y, - const double bf_z, - const double x, - const double y, - const double z, - double* eval_x, - double* eval_y, - double* eval_z -) {{ -""" - -spherical_bf_template = cartesian_bf_template.replace("cartesian", "spherical") -spherical_bf_deriv1_template = cartesian_bf_deriv1_template.replace( - "cartesian", "spherical" -) - - -constant_lines = [] -for L in range(L_max + 1): - sph_ang = generate_spherical_angular(L, do_libint_norm) - car_ang = generate_cartesian_angular(generate_cartesian_ls(L)) - - sph_bf_eval_strs, sph_bf_x_eval_strs, sph_bf_y_eval_strs, sph_bf_z_eval_strs = ( - generate_eval_lines(L, sph_ang) - ) - car_bf_eval_strs, car_bf_x_eval_strs, car_bf_y_eval_strs, car_bf_z_eval_strs = ( - generate_eval_lines(L, car_ang) - ) - - cartesian_bf_prototype = cartesian_bf_template.format("_" + str(L)) - spherical_bf_prototype = spherical_bf_template.format("_" + str(L)) - cartesian_bf_deriv1_prototype = cartesian_bf_deriv1_template.format("_" + str(L)) - spherical_bf_deriv1_prototype = spherical_bf_deriv1_template.format("_" + str(L)) - - spherical_bf_func = spherical_bf_prototype + "\n" - for s in sph_bf_eval_strs: - spherical_bf_func = spherical_bf_func + " " + s + "\n" - spherical_bf_func = spherical_bf_func + "\n}\n" - - spherical_bf_deriv1_func = spherical_bf_deriv1_prototype + "\n" - for s in sph_bf_x_eval_strs: - spherical_bf_deriv1_func = spherical_bf_deriv1_func + " " + s + "\n" - spherical_bf_deriv1_func = spherical_bf_deriv1_func + "\n" - for s in sph_bf_y_eval_strs: - spherical_bf_deriv1_func = spherical_bf_deriv1_func + " " + s + "\n" - spherical_bf_deriv1_func = spherical_bf_deriv1_func + "\n" - for s in sph_bf_z_eval_strs: - spherical_bf_deriv1_func = spherical_bf_deriv1_func + " " + s + "\n" - spherical_bf_deriv1_func = spherical_bf_deriv1_func + "\n}\n" - - cartesian_bf_func = cartesian_bf_prototype + "\n" - for s in car_bf_eval_strs: - cartesian_bf_func = cartesian_bf_func + " " + s + "\n" - cartesian_bf_func = cartesian_bf_func + "\n}\n" - - cartesian_bf_deriv1_func = cartesian_bf_deriv1_prototype + "\n" - for s in car_bf_x_eval_strs: - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + " " + s + "\n" - cartesian_bf_deriv1_func = 
cartesian_bf_deriv1_func + "\n" - for s in car_bf_y_eval_strs: - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + " " + s + "\n" - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + "\n" - for s in car_bf_z_eval_strs: - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + " " + s + "\n" - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func + "\n}\n" - - sqrt_regex = "sqrt\([0-9]+\)" - - sqrt_finds = re.findall(sqrt_regex, spherical_bf_func) - sqrt_finds = sqrt_finds + (re.findall(sqrt_regex, spherical_bf_deriv1_func)) - sqrt_finds = sqrt_finds + (re.findall(sqrt_regex, cartesian_bf_func)) - sqrt_finds = sqrt_finds + (re.findall(sqrt_regex, cartesian_bf_deriv1_func)) - - sqrt_finds = list(set(sqrt_finds)) - - for x in sqrt_finds: - arg = x.strip("sqrt(").strip(")") - new_str = "sqrt_" + arg - spherical_bf_func = spherical_bf_func.replace(x, new_str) - spherical_bf_deriv1_func = spherical_bf_deriv1_func.replace(x, new_str) - cartesian_bf_func = cartesian_bf_func.replace(x, new_str) - cartesian_bf_deriv1_func = cartesian_bf_deriv1_func.replace(x, new_str) - - new_str = "constexpr double " + new_str + " = " + str(math.sqrt(int(arg))) + ";" - constant_lines.append(new_str) - - cart_header_file.write(cartesian_bf_func) - cart_header_file.write(cartesian_bf_deriv1_func) - sphr_header_file.write(spherical_bf_func) - sphr_header_file.write(spherical_bf_deriv1_func) - - -# Generate calling routines -cartesian_bf_calling_func = cartesian_bf_template.format("") -spherical_bf_calling_func = spherical_bf_template.format("") -cartesian_bf_deriv1_calling_func = cartesian_bf_deriv1_template.format("") -spherical_bf_deriv1_calling_func = spherical_bf_deriv1_template.format("") - -am_dispatch_template = "switch( shell.l ) {{\n" -am_dispatch_template_deriv1 = "switch( shell.l ) {{\n" -for L in range(L_max + 1): - bf_template = """ - case {0}: - gaueval_{{0}}_angular_{0}(tmp, xc, yc, zc, bf_eval); - break; -""".format(L) - - deriv1_template = """ - case {0}: - gaueval_{{0}}_angular_{0}(tmp, xc, yc, zc, bf_eval); - gaueval_{{0}}_angular_{0}_deriv1(tmp, tmp_x, tmp_y, tmp_z, xc, yc, zc, bf_eval, bf_x_eval, bf_y_eval, bf_z_eval); - break; -""".format(L) - - am_dispatch_template = am_dispatch_template + bf_template - am_dispatch_template_deriv1 = am_dispatch_template_deriv1 + deriv1_template - - -am_dispatch_template = am_dispatch_template + "}}\n" -am_dispatch_template_deriv1 = am_dispatch_template_deriv1 + "}}\n" - -print(am_dispatch_template_deriv1.format("cartesian")) -print(am_dispatch_template_deriv1.format("spherical")) - - -footer = "} // namespace GauXC" -cart_header_file.write(footer) -sphr_header_file.write(footer) - -constant_lines = list(set(constant_lines)) -preamble = """ -#pragma once - -namespace GauXC { -""" - -cons_header_file.write(preamble) -for s in constant_lines: - cons_header_file.write(" " + s + "\n") -cons_header_file.write(footer) diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/scripts/__init__.py b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/scripts/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/scripts/collocation_angular.py b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/scripts/collocation_angular.py deleted file mode 100644 index 0903a5d..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/scripts/collocation_angular.py +++ /dev/null @@ -1,259 +0,0 @@ -import cmath -import math 
-import os -import re -import sys -from math import factorial as fact - -import sympy -from scipy.special import binom as binomial -from sympy import I as symb_I -from sympy import exp as symb_exp -from sympy import factorial as symb_fact -from sympy import factorial2 as symb_fact2 - - -def generate_cartesian_ls(L): - l = [] - for i in range(L + 1): - lx = L - i - for j in range(i + 1): - ly = i - j - lz = L - lx - ly - - l.append([0, 0, 0]) - - for k in range(lx - 1): - l[-1][0] = l[-1][0] + 1 - for k in range(ly - 1): - l[-1][1] = l[-1][1] + 1 - for k in range(lz - 1): - l[-1][2] = l[-1][2] + 1 - - if lx > 0: - l[-1][0] = l[-1][0] + 1 - if ly > 0: - l[-1][1] = l[-1][1] + 1 - if lz > 0: - l[-1][2] = l[-1][2] + 1 - - return l - - -def generate_spherical_coeff(l, m, lx, ly, lz): - j = lx + ly - abs(m) - if j % 2 == 0: - j = int(j / 2) - else: - return 0.0 - - prefactor = fact(2.0 * lx) * fact(2.0 * ly) * fact(2.0 * lz) * fact(l) - prefactor = prefactor * fact(l - abs(m)) - prefactor = prefactor / (fact(2.0 * l) * fact(lx) * fact(ly) * fact(lz)) - prefactor = prefactor / fact(l + abs(m)) - prefactor = math.sqrt(prefactor) - - term1 = 0.0 - for i in range(int((l - abs(m)) / 2) + 1): - term1 = term1 + binomial(l, i) * binomial(i, j) * math.pow(-1, i) * fact( - 2 * l - 2 * i - ) / fact(l - abs(m) - 2 * i) - - term1 = term1 / math.pow(2, l) / fact(l) - - m_fact = 1.0 - if m < 0: - m_fact = -1.0 - - term2 = 0.0 + 0.0j - for k in range(j + 1): - z = cmath.exp(m_fact * math.pi / 2.0 * (abs(m) - lx + 2 * k) * 1.0j) - term2 = term2 + binomial(j, k) * binomial(abs(m), lx - 2 * k) * z - - val = prefactor * term1 * term2 - - if abs(val.real) < 1e-10: - val = 0.0 + val.imag * 1j - if abs(val.imag) < 1e-10: - val = val.real - - return val - - -def generate_spherical_coeff_symb(l, m, lx, ly, lz, unnorm=False): - j = lx + ly - abs(m) - if j % 2 == 0: - j = int(j / 2) - else: - return sympy.Integer(0) - - j_symb = sympy.Integer(j) - l_symb = sympy.Integer(l) - m_symb = sympy.Integer(abs(m)) - lx_symb = sympy.Integer(lx) - ly_symb = sympy.Integer(ly) - lz_symb = sympy.Integer(lz) - - prefactor = ( - symb_fact(2 * lx_symb) - * symb_fact(2 * ly_symb) - * symb_fact(2 * lz_symb) - * symb_fact(l_symb) - ) - prefactor = prefactor * symb_fact(l_symb - m_symb) - prefactor = prefactor / ( - symb_fact(2 * l_symb) - * symb_fact(lx_symb) - * symb_fact(ly_symb) - * symb_fact(lz_symb) - ) - prefactor = prefactor / symb_fact(l_symb + m_symb) - - # Ed's stupid normalization convention... 
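    # (when unnorm is set, the prefactor additionally carries (2l-1)!! / ((2lx-1)!! (2ly-1)!! (2lz-1)!!))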
- if unnorm: - prefactor = ( - prefactor - * symb_fact2(2 * l - 1) - / symb_fact2(2 * lx - 1) - / symb_fact2(2 * ly - 1) - / symb_fact2(2 * lz - 1) - ) - - prefactor = sympy.sqrt(prefactor) - - term1 = sympy.Integer(0) - for i in range(int((l - abs(m)) / 2) + 1): - term1 = term1 + sympy.Integer(binomial(l, i)) * sympy.Integer( - binomial(i, j) - ) * sympy.Integer(math.pow(-1, i)) * symb_fact( - 2 * l_symb - sympy.Integer(2 * i) - ) / symb_fact(l_symb - m_symb - sympy.Integer(2 * i)) - - term1 = term1 / (2**l_symb) / symb_fact(l) - - m_fact_symb = sympy.Integer(1) - if m < 0: - m_fact_symb = -m_fact_symb - - term2 = sympy.Integer(0) - for k in range(j + 1): - z = sympy.exp( - m_fact_symb - * sympy.pi - / 2 - * (m_symb - lx_symb + sympy.Integer(2 * k)) - * symb_I - ) - term2 = ( - term2 - + sympy.Integer(binomial(j, k)) - * sympy.Integer(binomial(abs(m), lx - 2 * k)) - * z - ) - - return prefactor * term1 * term2 - - -def generate_cartesian_angular(ls): - [x, y, z, r] = sympy.symbols("x y z r", real=True) - - ang = [] - - for l in ls: - ang.append(r) - for i in range(l[0]): - ang[-1] = ang[-1] * x - for i in range(l[1]): - ang[-1] = ang[-1] * y - for i in range(l[2]): - ang[-1] = ang[-1] * z - - ang[-1] = ang[-1] / r - - return ang - - -def generate_spherical_angular(L, unnorm=False): - ls = generate_cartesian_ls(L) - angs = generate_cartesian_angular(ls) - - # r = sympy.symbols( 'r' ) - sph_angs = [] - for m in range(L + 1): - tmp_p = 0 - tmp_m = 0 - for i in range(len(ls)): - l = ls[i] - ang = angs[i] - - # c = generate_spherical_coeff( L, m, l[0],l[1],l[2] ) - c = generate_spherical_coeff_symb(L, m, l[0], l[1], l[2], unnorm) - - if m == 0: - tmp_p = tmp_p + c * ang - - else: - c_p = (c + sympy.conjugate(c)) / sympy.sqrt(2) - c_m = (c - sympy.conjugate(c)) / sympy.sqrt(2) / symb_I - - tmp_p = tmp_p + c_p * ang - tmp_m = tmp_m + c_m * ang - - sph_angs.append((m, tmp_p)) - if m > 0: - sph_angs.append((-m, tmp_m)) - - sph_angs = sorted(sph_angs, key=lambda x: x[0]) - - sph_angs_bare = [] - for a in sph_angs: - sph_angs_bare.append(sympy.simplify(a[1])) - - return sph_angs_bare - - -def generate_eval_lines(L, ang): - [x, y, z, r] = sympy.symbols("x y z r", real=True) - [bf, bf_x, bf_y, bf_z] = sympy.symbols("bf bf_x bf_y bf_z", real=True) - - bf_eval_strs = [] - bf_x_eval_strs = [] - bf_y_eval_strs = [] - bf_z_eval_strs = [] - - for j in range(len(ang)): - a = ang[j] - a_x = sympy.diff(a, x) - a_y = sympy.diff(a, y) - a_z = sympy.diff(a, z) - - bf_eval = sympy.simplify(a * bf) - bf_x_eval = sympy.simplify(a_x * bf + a * bf_x) - bf_y_eval = sympy.simplify(a_y * bf + a * bf_y) - bf_z_eval = sympy.simplify(a_z * bf + a * bf_z) - - bf_eval_str = "eval[npts * {}] = {};".format(j, bf_eval) - bf_x_eval_str = "eval_x[npts * {}] = {};".format(j, bf_x_eval) - bf_y_eval_str = "eval_y[npts * {}] = {};".format(j, bf_y_eval) - bf_z_eval_str = "eval_z[npts * {}] = {};".format(j, bf_z_eval) - - if L >= 2: - for k in range(2, L + 1): - for X in ("x", "y", "z"): - pow_str = X + "**" + str(k) - repl_str = "" - for K in range(k - 1): - repl_str = repl_str + X + "*" - repl_str = repl_str + X - - bf_eval_str = bf_eval_str.replace(pow_str, repl_str) - bf_x_eval_str = bf_x_eval_str.replace(pow_str, repl_str) - bf_y_eval_str = bf_y_eval_str.replace(pow_str, repl_str) - bf_z_eval_str = bf_z_eval_str.replace(pow_str, repl_str) - - bf_eval_strs.append(bf_eval_str) - bf_x_eval_strs.append(bf_x_eval_str) - bf_y_eval_strs.append(bf_y_eval_str) - bf_z_eval_strs.append(bf_z_eval_str) - - return (bf_eval_strs, bf_x_eval_strs, 
bf_y_eval_strs, bf_z_eval_strs) diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/scripts/generate_collocation_angular_eval.py b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/scripts/generate_collocation_angular_eval.py deleted file mode 100644 index af14e58..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/scripts/generate_collocation_angular_eval.py +++ /dev/null @@ -1,163 +0,0 @@ -import math -import os -import re -import sys -from io import StringIO - -import pyexpander.lib as expander -from collocation_angular import ( - generate_cartesian_angular, - generate_cartesian_ls, - generate_eval_lines, - generate_spherical_angular, -) - -L_max = 6 -if len(sys.argv) > 1: - L_max = int(sys.argv[1]) - -# sphr_bf_body = [] -# sphr_bf_d1_body = [] - -sphr_unnorm_bf_body = [] -sphr_unnorm_bf_d1_body = [] - -cart_bf_body = [] -cart_bf_d1_body = [] - - -for L in range(L_max + 1): - print("Processing L = {} ...".format(L)) - # sphr_ang = generate_spherical_angular( L, False ) - sphr_unnorm_ang = generate_spherical_angular(L, True) - cart_ang = generate_cartesian_angular(generate_cartesian_ls(L)) - - # sa, sa_x, sa_y, sa_z = generate_eval_lines( L, sphr_ang ) - sna, sna_x, sna_y, sna_z = generate_eval_lines(L, sphr_unnorm_ang) - ca, ca_x, ca_y, ca_z = generate_eval_lines(L, cart_ang) - - # sphr_bf_body.append( "\n ".join(sa) ) - sphr_unnorm_bf_body.append("\n ".join(sna)) - cart_bf_body.append("\n ".join(ca)) - - # s_d1 = "\n\n ".join(["\n ".join( sa_x ), "\n ".join(sa_y), "\n ".join(sa_z)]) - sn_d1 = "\n\n ".join(["\n ".join(sna_x), "\n ".join(sna_y), "\n ".join(sna_z)]) - c_d1 = "\n\n ".join(["\n ".join(ca_x), "\n ".join(ca_y), "\n ".join(ca_z)]) - - # sphr_bf_d1_body.append( s_d1 ) - sphr_unnorm_bf_d1_body.append(sn_d1) - cart_bf_d1_body.append(c_d1) - - -template_fname = "templates/collocation_angular_template.hpp" - -# sphr_var_dict = { 'L_max' : L_max, 'body' : sphr_bf_body, 'body_d1' : sphr_bf_d1_body, 'name' : 'spherical' } -sphr_unnorm_var_dict = { - "L_max": L_max, - "body": sphr_unnorm_bf_body, - "body_d1": sphr_unnorm_bf_d1_body, - "name": "spherical_unnorm", -} -cart_var_dict = { - "L_max": L_max, - "body": cart_bf_body, - "body_d1": cart_bf_d1_body, - "name": "cartesian", -} - - -old_sys_out = sys.stdout - -sys.stdout = cart_expand = StringIO() -expander.expandFile( - template_fname, external_definitions=cart_var_dict, auto_indent=True -) -# sys.stdout = sphr_expand = StringIO() -# expander.expandFile( template_fname, external_definitions=sphr_var_dict, auto_indent=True ) -sys.stdout = sphr_unnorm_expand = StringIO() -expander.expandFile( - template_fname, external_definitions=sphr_unnorm_var_dict, auto_indent=True -) - -sys.stdout = old_sys_out - -cart_expand = cart_expand.getvalue() -# sphr_expand = sphr_expand.getvalue() -sphr_unnorm_expand = sphr_unnorm_expand.getvalue() - - -# Handle Constants -constant_lines = [] - -# Sqrts -sqrt_regex = "sqrt\([0-9]+\)" -# sqrt_finds = re.findall( sqrt_regex, "\n".join([cart_expand,sphr_expand,sphr_unnorm_expand]) ) -sqrt_finds = re.findall(sqrt_regex, "\n".join([cart_expand, sphr_unnorm_expand])) - -sqrt_finds = list(set(sqrt_finds)) - -for x in sqrt_finds: - arg = x.strip("sqrt(").strip(")") - new_str = "sqrt_" + arg - - cart_expand = cart_expand.replace(x, new_str) - # sphr_expand = sphr_expand.replace( x, new_str ) - sphr_unnorm_expand = sphr_unnorm_expand.replace(x, new_str) - - new_str = "constexpr double " + new_str + " = " + str(math.sqrt(int(arg))) + ";" - 
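    # collect the constexpr definition; constant_lines is expanded into collocation_device_constants.hpp below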
constant_lines.append(new_str) - -old_sys_out = sys.stdout - -sys.stdout = constant_expand = StringIO() -expander.expandFile( - "templates/collocation_device_constants_template.hpp", - external_definitions={"const_lines": constant_lines}, -) - -sys.stdout = old_sys_out - -constant_expand = constant_expand.getvalue() - - -cart_header_fname = "collocation_angular_cartesian.hpp" -# sphr_header_fname = "collocation_angular_spherical.hpp" -sphr_unnorm_header_fname = "collocation_angular_spherical_unnorm.hpp" -cons_header_fname = "collocation_device_constants.hpp" - -cart_header_file = open(cart_header_fname, "w") -# sphr_header_file = open( sphr_header_fname, 'w' ) -sphr_unnorm_header_file = open(sphr_unnorm_header_fname, "w") -cons_header_file = open(cons_header_fname, "w") - -cart_header_file.write(cart_expand) -# sphr_header_file.write( sphr_expand ) -sphr_unnorm_header_file.write(sphr_unnorm_expand) -cons_header_file.write(constant_expand) - - -# Generate Kernel Driver - -# old_sys_out = sys.stdout - -# sys.stdout = collocation_cartesian_kernel_expand = StringIO() -# expander.expandFile( 'collocation_kernels_template.cu', external_definitions={ 'ang_name' : 'cartesian' } ) -# -# sys.stdout = collocation_spherical_kernel_expand = StringIO() -# expander.expandFile( 'collocation_kernels_template.cu', external_definitions={ 'ang_name' : 'spherical' } ) -# -# sys.stdout = collocation_spherical_unnorm_kernel_expand = StringIO() -# expander.expandFile( 'collocation_kernels_template.cu', external_definitions={ 'ang_name' : 'spherical_unnorm' } ) -# -# sys.stdout = old_sys_out -# -# collocation_cartesian_kernel_expand = collocation_cartesian_kernel_expand.getvalue() -# collocation_spherical_kernel_expand = collocation_spherical_kernel_expand.getvalue() -# collocation_spherical_unnorm_kernel_expand = collocation_spherical_unnorm_kernel_expand.getvalue() -# -# with open( 'collocation_kernels_cartesian.cu', 'w' ) as f: -# f.write( collocation_cartesian_kernel_expand ) -# with open( 'collocation_kernels_spherical.cu', 'w' ) as f: -# f.write( collocation_spherical_kernel_expand ) -# with open( 'collocation_kernels_spherical_unnorm.cu', 'w' ) as f: -# f.write( collocation_spherical_unnorm_kernel_expand ) diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/templates/collocation_angular_template.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/templates/collocation_angular_template.hpp deleted file mode 100644 index f07b758..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/templates/collocation_angular_template.hpp +++ /dev/null @@ -1,114 +0,0 @@ -#pragma once -#include "collocation_device_constants.hpp" -#include - -#ifndef GPGAUEVAL_INLINE -# define GPGAUEVAL_INLINE __noinline__ -#endif - -namespace GauXC { -namespace integrator { -namespace cuda { - -$for( L in range(L_max + 1) )\ -template -GPGAUEVAL_INLINE __device__ void collocation_$(name)_angular_$(L)( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - $(body[L]) - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_$(name)_angular_$(L)_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - $(body_d1[L]) - -} - -$endfor\ - -template -GPGAUEVAL_INLINE __device__ void collocation_$(name)_angular( - const int32_t npts, - const int32_t l, - const T bf, - 
const T x, - const T y, - const T z, - T* __restrict__ eval -) { - -$for( L in range(L_max + 1) )\ - $if( L == 0 )\ - if( l == $(L) ) { - $else\ - } else if( l == $(L) ) { - $endif - collocation_$(name)_angular_$(L)( npts, bf, x, y, z, eval ); - -$endfor\ - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_$(name)_angular - - -template -GPGAUEVAL_INLINE __device__ void collocation_$(name)_angular_deriv1( - const int32_t npts, - const int32_t l, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - -$for( L in range(L_max + 1) )\ - $if( L == 0 )\ - if( l == $(L) ) { - $else\ - } else if( l == $(L) ) { - $endif - collocation_$(name)_angular_$(L)( npts, bf, x, y, z, eval ); - collocation_$(name)_angular_$(L)_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - -$endfor\ - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_$(name)_angular_deriv1 - - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/templates/collocation_device_constants_template.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/templates/collocation_device_constants_template.hpp deleted file mode 100644 index 5245913..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation/templates/collocation_device_constants_template.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -namespace GauXC { -namespace integrator { -namespace cuda { - -$for( x in const_lines )\ - $(x) -$endfor\ - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_device.cu b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_device.cu deleted file mode 100644 index 27fee96..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_device.cu +++ /dev/null @@ -1,382 +0,0 @@ -#include -#include -#include "exceptions/cuda_exception.hpp" -#include - -#include "device/cuda/collocation_petite_kernels.hpp" -#include "device/cuda/collocation_masked_kernels.hpp" -#include "device/cuda/collocation_petite_combined_kernels.hpp" -#include "device/cuda/collocation_masked_combined_kernels.hpp" - -#include "device/cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void eval_collocation_petite( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - cudaStream_t stream -) { - - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - collocation_device_petite_kernel - <<>> - ( nshells, nbf, npts, shells_device, offs_device, - pts_device, eval_device ); - -} - -template -void eval_collocation_petite( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - cudaStream_t stream -); - - - - - - - - - -template -void eval_collocation_masked( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - 
cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - collocation_device_masked_kernel - <<>> - ( nshells, nbf, npts, shells_device, mask_device, - offs_device, pts_device, eval_device ); - -} - -template -void eval_collocation_masked( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - cudaStream_t stream -); - - - - -template -void eval_collocation_petite_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - collocation_device_petite_combined_kernel - <<>> - ( ntasks, device_tasks ); - -} - -template -void eval_collocation_petite_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - - - - - - - - - - - - -template -void eval_collocation_masked_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - collocation_device_masked_combined_kernel - <<>> - ( ntasks, shells_device, device_tasks ); - -} - -template -void eval_collocation_masked_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - - - - - - - - - -template -void eval_collocation_petite_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - cudaStream_t stream -) { - - auto nmax_threads = util::cuda_kernel_max_threads_per_block( - collocation_device_petite_kernel_deriv1 - ); - - dim3 threads(warp_size, nmax_threads/warp_size, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - collocation_device_petite_kernel_deriv1 - <<>> - ( nshells, nbf, npts, shells_device, offs_device, - pts_device, eval_device, deval_device_x, deval_device_y, - deval_device_z ); - -} - -template -void eval_collocation_petite_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z, - cudaStream_t stream -); - - - - - - - - - - - - - - - - -template -void eval_collocation_masked_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - collocation_device_masked_kernel_deriv1 - <<>> - ( nshells, nbf, npts, shells_device, mask_device, offs_device, - pts_device, eval_device, 
deval_device_x, deval_device_y, - deval_device_z ); - -} - -template -void eval_collocation_masked_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z, - cudaStream_t stream -); - - - - -template -void eval_collocation_petite_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - collocation_device_petite_combined_kernel_deriv1 - <<>> - ( ntasks, device_tasks ); - -} - -template -void eval_collocation_petite_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - - - - - - - - - -template -void eval_collocation_masked_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -) { - - auto nmax_threads = util::cuda_kernel_max_threads_per_block( - collocation_device_masked_combined_kernel_deriv1 - ); - - dim3 threads(warp_size, nmax_threads/warp_size, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - collocation_device_masked_combined_kernel_deriv1 - <<>> - ( ntasks, shells_device, device_tasks ); - -} - -template -void eval_collocation_masked_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - - - - - - - - - - - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_device.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_device.hpp deleted file mode 100644 index 9a8957b..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_device.hpp +++ /dev/null @@ -1,109 +0,0 @@ -#pragma once -#include -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void eval_collocation_petite( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - cudaStream_t stream -); - -template -void eval_collocation_masked( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - cudaStream_t stream -); - -template -void eval_collocation_petite_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - cudaStream_t stream -); - -template -void eval_collocation_masked_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - cudaStream_t stream -); - -template -void eval_collocation_petite_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, 
- XCTaskDevice* device_tasks, - cudaStream_t stream -); - -template -void eval_collocation_masked_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - - - -template -void eval_collocation_petite_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - -template -void eval_collocation_masked_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - cudaStream_t stream -); - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_masked_combined_kernels.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_masked_combined_kernels.hpp deleted file mode 100644 index f7cdebd..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_masked_combined_kernels.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#include -#include - -#include -#include - -#include "device/cuda/collocation/collocation_angular_cartesian.hpp" -#include "device/cuda/collocation/collocation_angular_spherical_unnorm.hpp" -#include "device/cuda/cuda_alg_variant_control.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -__global__ -void collocation_device_masked_combined_kernel( - size_t ntasks, - Shell* __restrict__ shells_device, - XCTaskDevice* __restrict__ device_tasks -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( blockIdx.z < ntasks ) { - - auto& task = device_tasks[ blockIdx.z ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ mask_device = task.shell_list; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* __restrict__ eval_device = task.bf; - - if( tid_x < npts and tid_y < nshells ) { - - const uint32_t ipt = tid_x; - const uint32_t ish = tid_y; - const uint32_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - auto tmp = 0.; - for( uint32_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } // shell / point idx check - - } // Batch idx check - -} - - - - - - - - - - - - - - - -template -__global__ -void collocation_device_masked_combined_kernel_deriv1( - size_t ntasks, - Shell* __restrict__ shells_device, - XCTaskDevice* __restrict__ device_tasks -) { - - // DBWY: These are factored into the loop for this optimization - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( blockIdx.z < 
ntasks ) { - - auto& task = device_tasks[ blockIdx.z ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ mask_device = task.shell_list; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* __restrict__ eval_device = task.bf; - auto* __restrict__ deval_device_x = task.dbfx; - auto* __restrict__ deval_device_y = task.dbfy; - auto* __restrict__ deval_device_z = task.dbfz; - - if( tid_y < nshells and tid_x < npts ) { - - const uint32_t ish = tid_y; - const uint32_t ipt = tid_x; - const uint32_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( uint32_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, - tmp_z, xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } // shell / point idx check - } // Batch idx check - - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_masked_kernels.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_masked_kernels.hpp deleted file mode 100644 index 8db2bfe..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_masked_kernels.hpp +++ /dev/null @@ -1,155 +0,0 @@ -#include -#include - -#include - -#include "device/cuda/collocation/collocation_angular_cartesian.hpp" -#include "device/cuda/collocation/collocation_angular_spherical_unnorm.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - - -template -__global__ -void collocation_device_masked_kernel( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ mask_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; 
- const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } - -} - - - - - - - - -template -__global__ -void collocation_device_masked_kernel_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ mask_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device, - T* __restrict__ deval_device_x, - T* __restrict__ deval_device_y, - T* __restrict__ deval_device_z -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. 
* a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } - - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_petite_combined_kernels.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_petite_combined_kernels.hpp deleted file mode 100644 index a120613..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_petite_combined_kernels.hpp +++ /dev/null @@ -1,186 +0,0 @@ -#include -#include - -#include -#include - -#include "device/cuda/collocation/collocation_angular_cartesian.hpp" -#include "device/cuda/collocation/collocation_angular_spherical_unnorm.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -__global__ -void collocation_device_petite_combined_kernel( - size_t ntasks, - XCTaskDevice* __restrict__ device_tasks -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - const int batch_id = blockIdx.z; - - if( batch_id < ntasks ) { - - auto& task = device_tasks[ batch_id ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ shells_device = task.shells; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* eval_device = task.bf; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } // shell / point idx check - - } // Batch idx check - -} - - - - - - - - - - - - - - - -template -__global__ -void collocation_device_petite_combined_kernel_deriv1( - size_t ntasks, - XCTaskDevice* __restrict__ device_tasks -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - const int batch_id = blockIdx.z; - - if( batch_id < ntasks ) { - - auto& task = device_tasks[ batch_id ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const 
auto npts = task.npts; - const auto* __restrict__ shells_device = task.shells; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* __restrict__ eval_device = task.bf; - auto* __restrict__ deval_device_x = task.dbfx; - auto* __restrict__ deval_device_y = task.dbfy; - auto* __restrict__ deval_device_z = task.dbfz; - - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, - tmp_z, xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } // shell / point idx check - - } // Batch idx check - - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_petite_kernels.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_petite_kernels.hpp deleted file mode 100644 index 475a9df..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/collocation_petite_kernels.hpp +++ /dev/null @@ -1,160 +0,0 @@ -#include -#include - -#include - -#include "device/cuda/collocation/collocation_angular_cartesian.hpp" -#include "device/cuda/collocation/collocation_angular_spherical_unnorm.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - - - -template -__global__ -void collocation_device_petite_kernel( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - 
tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } - -} - - - - - - - - - - - - - - - -template -__global__ -void collocation_device_petite_kernel_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device, - T* __restrict__ deval_device_x, - T* __restrict__ deval_device_y, - T* __restrict__ deval_device_z -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } - - -} - -} // namespace cuda -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cublas_extensions.cu b/third_party/gauxc/attic/src/new_integrator/device/cuda/cublas_extensions.cu deleted file mode 100644 index 17c9cdb..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cublas_extensions.cu +++ /dev/null @@ -1,153 +0,0 @@ -#include "device/cuda/cublas_extensions.hpp" -#include -#include -#include "exceptions/cublas_exception.hpp" - -#include "device/cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace cuda { -namespace blas { - -using namespace GauXC::cuda; - -template -__global__ void increment_kernel( const T* X, T* Y ) { - const auto tid = blockIdx.x; - if( tid < 1 ) (*Y) += (*X); -} - -template -void increment( const T* X, T* Y, cudaStream_t stream ) { - increment_kernel<<<1,1,0,stream>>>(X,Y); -} - -template <> -void dot( cublasHandle_t handle, - int N, - const double* X, - int INCX, - const double* Y, - int INCY, - double* RES ) { - - auto stat = cublasDdot( handle, N, X, INCX, Y, INCY, RES ); - GAUXC_CUBLAS_ERROR("CUBLAS DDOT FAILED", stat ); - -} - -template -void gdot( cublasHandle_t handle, - int N, - const T* X, - int INCX, - const T* Y, - int INCY, - T* 
SCR, - T* RES ) { - - dot( handle, N, X, INCX, Y, INCY, SCR ); - auto stream = util::get_stream(handle); - increment( SCR, RES, stream ); - -} - -template -void gdot( cublasHandle_t handle, - int N, - const double* X, - int INCX, - const double* Y, - int INCY, - double* SCR, - double* RES ); - - - - - - - - - - -template -void __global__ hadamard_product_kernel( int M, - int N, - const T* A, - int LDA, - T* B, - int LDB ) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < M and tid_y < N ) { - B[ tid_x + tid_y*LDB ] *= A[ tid_x + tid_y*LDA ]; - } - -} - - - -template -void hadamard_product( cublasHandle_t handle, - int M, - int N, - const T* A, - int LDA, - T* B, - int LDB ) { - - auto stream = util::get_stream(handle); - dim3 threads(warp_size, max_warps_per_thread_block); - dim3 blocks( util::div_ceil( M, threads.x ), - util::div_ceil( N, threads.y ) ); - - hadamard_product_kernel<<< blocks, threads, 0, stream >>>( M, N, A, LDA, B, LDB ); - -} - -template -void hadamard_product( cublasHandle_t handle, - int M, - int N, - const double* A, - int LDA, - double* B, - int LDB ); - - - - -template <> -void gemm( cublasHandle_t handle, - cublasOperation_t TA, cublasOperation_t TB, - int M, int N, int K, double ALPHA, - const double* A, int LDA, const double* B, int LDB, - double BETA, double* C, int LDC ) { - - auto stat = cublasDgemm( handle, TA, TB, M, N, K, &ALPHA, A, LDA, - B, LDB, &BETA, C, LDC ); - GAUXC_CUBLAS_ERROR("CUBLAS DGEMM FAILED", stat); - -} - - -template <> -void syr2k( cublasHandle_t handle, - cublasFillMode_t UPLO, cublasOperation_t Trans, - int M, int K, double ALPHA, - const double* A, int LDA, const double* B, int LDB, - double BETA, double* C, int LDC ) { - - auto stat = cublasDsyr2k( handle, UPLO, Trans, M, K, &ALPHA, A, LDA, B, LDB, - &BETA, C, LDC ); - GAUXC_CUBLAS_ERROR("CUBLAS DSYR2K FAILED", stat); - -} - -} -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cublas_extensions.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/cublas_extensions.hpp deleted file mode 100644 index 81af06d..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cublas_extensions.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace cuda { -namespace blas { - -template -void dot( cublasHandle_t handle, - int N, - const T* X, - int INCX, - const T* Y, - int INCY, - T* RES ); - -template -void gdot( cublasHandle_t handle, - int N, - const T* X, - int INCX, - const T* Y, - int INCY, - T* SCR, - T* RES ); - - -template -void hadamard_product( cublasHandle_t handle, - int M, - int N, - const T* A, - int LDA, - T* B, - int LDB ); - - -template -void gemm( cublasHandle_t handle, - cublasOperation_t TA, cublasOperation_t TB, - int M, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, - T BETA, T* C, int LDC ); - -template -void syr2k( cublasHandle_t handle, - cublasFillMode_t UPLO, cublasOperation_t Trans, - int M, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, - T BETA, T* C, int LDC ); -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_alg_variant_control.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_alg_variant_control.hpp deleted file mode 100644 index e0d1f9b..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_alg_variant_control.hpp +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once - -//#define 
GAUXC_CUDA_ENABLE_COLLOCATION_SHMEM_COPY -//#define GAUXC_CUDA_ENABLE_COMPACT_COLLOCATION diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_device_properties.cxx b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_device_properties.cxx deleted file mode 100644 index af50a87..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_device_properties.cxx +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include - -#include "cuda_runtime.h" - -#include "device/cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace cuda { - - -uint32_t get_submat_cut_block(int32_t LDA, int32_t device) { - int l2_cache_size; - cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device); - - int l2_block_size = (int) sqrt(0.75 * ((double) l2_cache_size / 8)); - int min_block_size = LDA / max_submat_blocks; - - int block_size = std::max(l2_block_size, min_block_size); - block_size = std::min(block_size, LDA); - - return block_size; -} - -uint32_t get_device_sm_count(int32_t device) { - int num_sm; - cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, device); - - return num_sm; -} - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_device_properties.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_device_properties.hpp deleted file mode 100644 index 0b80a00..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_device_properties.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace cuda { - -static constexpr uint32_t warp_size = 32; -static constexpr uint32_t max_threads_per_thread_block = 1024; -static constexpr uint32_t max_warps_per_thread_block = - max_threads_per_thread_block / warp_size; - -static constexpr uint32_t max_submat_blocks = 10; - -// Properties for weight algorithm -static constexpr uint32_t weight_unroll = 4; -static_assert(weight_unroll == 4, "Weight unroll is only tested for value of 4"); -static constexpr uint32_t weight_thread_block = 640; -static constexpr uint32_t weight_thread_block_per_sm = 2; - -uint32_t get_submat_cut_block(int32_t LDA, int32_t device); -uint32_t get_device_sm_count(int32_t device); -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_eval_denvars.cu b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_eval_denvars.cu deleted file mode 100644 index 8136343..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_eval_denvars.cu +++ /dev/null @@ -1,254 +0,0 @@ -#include "device/cuda/cuda_eval_denvars.hpp" -#include "device/cuda/cuda_extensions.hpp" -#include - -#include "device/cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -__global__ void eval_uvars_lda_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.nbe; - - auto* den_eval_device = task.den; - - const auto* basis_eval_device = task.bf; - - const auto* den_basis_prod_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - register double den_reg = 0.; - - if( tid_x < nbf and tid_y < npts ) { - - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; - - den_reg = bf_col[ 
tid_y ] * db_col[ tid_y ]; - - } - - // Warp blocks are stored col major - den_reg = 2 * warpReduceSum( den_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - } - - -} - - - -#define GGA_KERNEL_SM_BLOCK_Y 32 - -template -__global__ void eval_uvars_gga_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.nbe; - - auto* den_eval_device = task.den; - auto* den_x_eval_device = task.ddenx; - auto* den_y_eval_device = task.ddeny; - auto* den_z_eval_device = task.ddenz; - - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - - const auto* den_basis_prod_device = task.zmat; - - __shared__ double den_shared[4][warp_size][GGA_KERNEL_SM_BLOCK_Y+1]; - - for ( int bid_x = blockIdx.x * blockDim.x; - bid_x < nbf; - bid_x += blockDim.x * gridDim.x ) { - - for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y; - bid_y < npts; - bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) { - - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { - den_shared[0][threadIdx.x][sm_y] = 0.; - den_shared[1][threadIdx.x][sm_y] = 0.; - den_shared[2][threadIdx.x][sm_y] = 0.; - den_shared[3][threadIdx.x][sm_y] = 0.; - - if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { - const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; - const double* bf_col = basis_eval_device + (bid_x + sm_y)*npts; - const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; - const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; - const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; - - den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - } - } - __syncthreads(); - - - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { - const int tid_y = bid_y + sm_y; - register double den_reg = den_shared[0][sm_y][threadIdx.x]; - register double dx_reg = den_shared[1][sm_y][threadIdx.x]; - register double dy_reg = den_shared[2][sm_y][threadIdx.x]; - register double dz_reg = den_shared[3][sm_y][threadIdx.x]; - - // Warp blocks are stored col major - den_reg = 2 * warpReduceSum( den_reg ); - dx_reg = 4 * warpReduceSum( dx_reg ); - dy_reg = 4 * warpReduceSum( dy_reg ); - dz_reg = 4 * warpReduceSum( dz_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - atomicAdd( den_x_eval_device + tid_y, dx_reg ); - atomicAdd( den_y_eval_device + tid_y, dy_reg ); - atomicAdd( den_z_eval_device + tid_y, dz_reg ); - } - } - __syncthreads(); - } - } -} - - -template -__global__ void eval_vvars_gga_kernel( - size_t npts, - const T* den_x_eval_device, - const T* den_y_eval_device, - const T* den_z_eval_device, - T* gamma_eval_device -) { - - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - if( tid < npts ) { - - const double dx = den_x_eval_device[ tid ]; - const double dy = den_y_eval_device[ tid ]; - const 
double dz = den_z_eval_device[ tid ]; - - gamma_eval_device[tid] = dx*dx + dy*dy + dz*dz; - - } - -} - - -template -void eval_uvars_lda_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( max_nbf , threads.x ), - util::div_ceil( max_npts , threads.y ), - ntasks ); - - eval_uvars_lda_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - -} - -template -void eval_uvars_gga_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ) { - - dim3 threads( warp_size, max_warps_per_thread_block / 2, 1 ); - dim3 blocks( std::min(int64_t(4), util::div_ceil( max_nbf, 4 )), - std::min(int64_t(16), util::div_ceil( max_nbf, 16 )), - ntasks ); - - eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - -} - - -template -void eval_vvars_gga_device( size_t npts, - const T* den_x_device, - const T* den_y_device, - const T* den_z_device, - T* gamma_device, - cudaStream_t stream ) { - - dim3 threads( max_threads_per_thread_block ); - dim3 blocks( util::div_ceil( npts, threads.x ) ); - - eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>( - npts, den_x_device, den_y_device, den_z_device, gamma_device - ); - -} - - - - - - - - - - - - - - - -template -void eval_uvars_lda_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -template -void eval_uvars_gga_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -template -void eval_vvars_gga_device( size_t npts, - const double* den_x_device, - const double* den_y_device, - const double* den_z_device, - double* gamma_device, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_eval_denvars.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_eval_denvars.hpp deleted file mode 100644 index e08874f..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_eval_denvars.hpp +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void eval_uvars_lda_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -template -void eval_uvars_gga_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - - -template -void eval_vvars_gga_device( size_t npts, - const T* den_x_device, - const T* den_y_device, - const T* den_z_device, - T* gamma_device, - cudaStream_t stream ); - - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_extensions.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_extensions.hpp deleted file mode 100644 index f3170f3..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_extensions.hpp +++ /dev/null @@ -1,109 +0,0 @@ -#pragma once -#include -#include -#include "device/cuda/cuda_device_properties.hpp" - -#define GAUXC_ENABLE_WARP_REDUCTIONS - -namespace GauXC { -namespace cuda { - -__inline__ __device__ -double warpReduceSum(double val) { - -#ifdef GAUXC_ENABLE_WARP_REDUCTIONS - - for(int i=(warp_size/2); i>=1; i/=2) - val += __shfl_xor_sync(0xffffffff, val, i, warp_size); - -#else - - using warp_reducer = 
cub::WarpReduce; - static __shared__ typename warp_reducer::TempStorage temp_storage[max_warps_per_thread_block]; - int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - int warp_lane = tid / warp_size; - val = warp_reducer( temp_storage[warp_lane] ).Sum( val ); - -#endif - - return val; -} - -__inline__ __device__ -double warpReduceProd(double val) { - for(int i=(warp_size/2); i>=1; i/=2) - val *= __shfl_xor_sync(0xffffffff, val, i, warp_size); - return val; -} - -#if 0 -__inline__ __device__ -double blockReduceSum( double val ) { - - static __shared__ double shared[32]; - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; - - val = warpReduceSum( val ); - - if( lane == 0 ) shared[wid] = val; - - __syncthreads(); - - val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0; - if( wid == 0 ) val = warpReduceSum( val ); - - return val; - -} - -template -__inline__ __device__ T warp_prod_reduce( T val ) { - - for( int i = warp_size / 2; i >= 1; i /= 2 ) - val *= __shfl_xor_sync( 0xffffffff, val, i, warp_size ); - - return val; - -} - -template -__inline__ __device__ T block_prod_reduce( T val ) { - - static __shared__ T shared[32]; - const int lane = threadIdx.x % 32; - const int wid = threadIdx.x / 32; - - val = warp_prod_reduce( val ); - - if( lane == 0 ) shared[ wid ] = val; - __syncthreads(); - - val = ( threadIdx.x < blockDim.x / 32 ) ? shared[ lane ] : 0; - if( wid == 0 ) val = warp_prod_reduce( val ); - - return val; - -} - -__inline__ __device__ double atomicMul(double* address, double val) -{ - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val * - __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_inc_potential.cu b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_inc_potential.cu deleted file mode 100644 index 2a50bdf..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_inc_potential.cu +++ /dev/null @@ -1,123 +0,0 @@ -#include "device/cuda/cuda_inc_potential.hpp" -#include "device/cuda/cuda_device_properties.hpp" -#include - -#include "device/cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - - -#define WARP_X 16 -#define WARP_Y 1 -#define UNROLL_FACTOR 4 -#define EFF_UNROLL 4 -#define CUT_X 8 -#define CUT_Y 8 - - -template -__global__ __launch_bounds__(1024, 1) -void inc_by_submat_combined_kernel( size_t ntasks, - XCTaskDevice* device_tasks, - T* A, - size_t LDA, - const int block_y, - const int block_x ) { - - const int batch_id = blockIdx.z; - auto& task = device_tasks[ batch_id ]; - - const auto* submat_cut_device = task.submat_cut; - const auto* submat_block_device = task.submat_block; - const auto LDAS = task.nbe; - auto* ASmall_device = task.nbe_scr; - - //if( LDAS == LDAB ) return; - const int tid_xx = threadIdx.x % WARP_X; - const int tid_xy = threadIdx.x / WARP_X; - - const int tid_yx = threadIdx.y % CUT_X; - const int tid_yy = threadIdx.y / CUT_X; - - const int start_cut_y = submat_block_device[block_y]; - const int end_cut_y = submat_block_device[block_y+1]; - const int start_cut_x = submat_block_device[block_x]; - 
const int end_cut_x = submat_block_device[block_x+1]; - - for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) { - const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut)); - const int i_cut_first = i_data.x; - const int delta_i = i_data.y; - const int i_cut_small = i_data.z; - - for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) { - const int3 j_data = *((int3*)(submat_cut_device + 3*j_cut)); - const int j_cut_first = j_data.x; - const int delta_j = j_data.y; - const int j_cut_small = j_data.z; - - auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS; - auto* ABig_begin = A + i_cut_first + j_cut_first*LDA; - - int J; - for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - - double val[UNROLL_FACTOR]; - double* address[UNROLL_FACTOR]; -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - val[k] = ASmall_begin[I + (J+k*WARP_Y)*LDAS]; - address[k] = ABig_begin + I + (J+k*WARP_Y)*LDA; - } -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - atomicAdd(address[k], val[k] ); - } - } - } - - for ( ; J < delta_j; J += WARP_Y) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - atomicAdd(ABig_begin + I + J*LDA, ASmall_begin[I + J*LDAS] ); - } - } - - } - } -} - - -template -void task_inc_potential( size_t ntasks, - XCTaskDevice* device_tasks, - T* V_device, - size_t LDV, - cudaStream_t stream ) { - dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks); - - const int submat_block_size = get_submat_cut_block(LDV, 0); - for (int i = 0; i < util::div_ceil(LDV, submat_block_size); i++) { - for (int j = 0; j < util::div_ceil(LDV, submat_block_size); j++) { - inc_by_submat_combined_kernel<<< blocks, threads, 0, stream >>>( - ntasks, device_tasks, V_device, LDV, i, j - ); - } - } -} - -template -void task_inc_potential( size_t ntasks, - XCTaskDevice* device_tasks, - double* V_device, - size_t LDV, - cudaStream_t stream ); - -} -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_inc_potential.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_inc_potential.hpp deleted file mode 100644 index 53d7b06..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_inc_potential.hpp +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void task_inc_potential( size_t ntasks, - XCTaskDevice* device_tasks, - T* V_device, - size_t LDV, - cudaStream_t stream ); - -} -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_pack_density.cu b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_pack_density.cu deleted file mode 100644 index 24b246b..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_pack_density.cu +++ /dev/null @@ -1,127 +0,0 @@ -#include "device/cuda/cuda_pack_density.hpp" -#include "device/cuda/cuda_device_properties.hpp" -#include - -#include "device/cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -#define WARP_X 16 -#define WARP_Y 1 -#define UNROLL_FACTOR 4 -#define EFF_UNROLL 4 -#define CUT_X 8 -#define CUT_Y 8 - -template -__global__ __launch_bounds__(1024, 1) -void submat_set_combined_kernel( size_t ntasks, - XCTaskDevice* device_tasks, - T* A, - size_t LDA, - const int block_y, - const int block_x) { - - - 
const int batch_id = blockIdx.z; - auto& task = device_tasks[ batch_id ]; - - const auto* submat_cut_device = task.submat_cut; - const auto* submat_block_device = task.submat_block; - const auto LDAS = task.nbe; - auto* ASmall_device = task.nbe_scr; - - //if( LDAS == LDAB ) return; - - const int tid_xx = threadIdx.x % WARP_X; - const int tid_xy = threadIdx.x / WARP_X; - - const int tid_yx = threadIdx.y % CUT_X; - const int tid_yy = threadIdx.y / CUT_X; - - const int start_cut_y = submat_block_device[block_y]; - const int end_cut_y = submat_block_device[block_y+1]; - const int start_cut_x = submat_block_device[block_x]; - const int end_cut_x = submat_block_device[block_x+1]; - - for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) { - const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut)); - const int i_cut_first = i_data.x; - const int delta_i = i_data.y; - const int i_cut_small = i_data.z; - - for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) { - const int3 j_data = *((int3*)(submat_cut_device + 3*j_cut)); - const int j_cut_first = j_data.x; - const int delta_j = j_data.y; - const int j_cut_small = j_data.z; - - auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS; - auto* ABig_begin = A + i_cut_first + j_cut_first*LDA; - - int J; - for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - - double val[UNROLL_FACTOR]; - double* address[UNROLL_FACTOR]; -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - val[k] = ABig_begin[I + (J + k*WARP_Y)*LDA]; - address[k] = ASmall_begin + I + (J + k*WARP_Y) * LDAS; - } -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - // Suggest that the result be evicted first. 
-#if (CUDART_VERSION >= 11000) - __stcs(address[k], val[k]); -#else - asm ("st.global.cs.f64 [%0], %1;" :: "l"(address[k]), "d"(val[k])); -#endif - } - } - } - - for ( ; J < delta_j; J += WARP_Y) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - ASmall_begin[I + J*LDAS] = ABig_begin[I + J*LDA]; - } - } - } - } -} - - -template -void task_pack_density_matrix( size_t ntasks, - XCTaskDevice* device_tasks, - T* P_device, - size_t LDP, - cudaStream_t stream ) { - - dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks); - - const int submat_block_size = get_submat_cut_block(LDP, 0); - for (int i = 0; i < util::div_ceil(LDP, submat_block_size); i++) { - for (int j = 0; j < util::div_ceil(LDP, submat_block_size); j++) { - submat_set_combined_kernel<<< blocks, threads, 0, stream >>>( - ntasks, device_tasks, P_device, LDP, i, j - ); - } - } -} - -template -void task_pack_density_matrix( size_t ntasks, - XCTaskDevice* device_tasks, - double* P_device, - size_t LDP, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_pack_density.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_pack_density.hpp deleted file mode 100644 index ae90ef3..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_pack_density.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void task_pack_density_matrix( size_t ntasks, - XCTaskDevice* device_tasks, - T* P_device, - size_t LDP, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_weights.cu b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_weights.cu deleted file mode 100644 index f8da49b..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_weights.cu +++ /dev/null @@ -1,641 +0,0 @@ -#include - -#include "device/cuda/cuda_weights.hpp" -#include "common/integrator_constants.hpp" -#include "device/cuda/cuda_extensions.hpp" -#include "device/cuda/cuda_device_properties.hpp" - -constexpr double eps_d = std::numeric_limits::epsilon(); - - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -__global__ void reciprocal_kernel(size_t length, double* vec) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < length; i += blockDim.x * gridDim.x) { - vec[i] = 1. 
/ vec[i]; - } -} - -__global__ void compute_point_center_dist( - size_t npts, - size_t LDatoms, - size_t natoms, - const double* coords, - const double* points, - double* dist -) { - - __shared__ double3 point_buffer[warp_size]; - register double3 coord_reg; - - const int natoms_block = (natoms + warp_size-1) / warp_size; - const int coords_block = (npts + warp_size-1) / warp_size; - - const double3* coords_vec = (double3*) coords; - const double3* points_vec = (double3*) points; - - for (int j = blockIdx.x; j < natoms_block; j += gridDim.x) { - const int iAtom = j * warp_size + threadIdx.x; - // Load blocks into registers/shared memory - if (iAtom < natoms) { - coord_reg = coords_vec[iAtom]; - } - for (int i = blockIdx.y; i < coords_block; i += gridDim.y) { - const int iPt_load = i * warp_size + threadIdx.x; - if (iPt_load < npts) { - point_buffer[threadIdx.x] = points_vec[iPt_load]; - } - __syncthreads(); - - // do the computation - #pragma unroll 2 - for (int k = threadIdx.y; k < warp_size; k+=warp_size/2) { - const int iPt_sm = k; - const int iPt = i * warp_size + iPt_sm; - const double rx = point_buffer[iPt_sm].x - coord_reg.x; - const double ry = point_buffer[iPt_sm].y - coord_reg.y; - const double rz = point_buffer[iPt_sm].z - coord_reg.z; - - if (iAtom < natoms and iPt < npts) { - dist[ iAtom + iPt * LDatoms ] = std::sqrt( rx*rx + ry*ry + rz*rz ); - } - } - __syncthreads(); - } - } -} - -#if 0 -__global__ void modify_weights_becke_kernel( - size_t npts, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - double* weights_device -) { - - // Becke partition functions - auto hBecke = [](double x) {return 1.5 * x - 0.5 * x * x * x;}; // Eq. 19 - auto gBecke = [&](double x) {return hBecke(hBecke(hBecke(x)));}; // Eq. 20 f_3 - - - __shared__ double shared[2048]; - for( int ipt = blockIdx.x; ipt < npts; ipt += gridDim.x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * natoms; - for( int iCenter = threadIdx.y; iCenter < natoms; iCenter += blockDim.y ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * natoms; - - double ps = 1.; - for( int jCenter = threadIdx.x; jCenter < natoms; jCenter += blockDim.x ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - const double s = 0.5 * ( 1. - gBecke( mu ) ); - - ps *= (iCenter == jCenter) ? 1. 
: s ; - - } - - ps = warp_prod_reduce( ps ); // XXX: Assumes blockDim.x == 32 - - if( iCenter == iParent ) parent_weight = ps; - - sum += ps; - - } - - // XXX: Assumes blockDim.x == blockDim.y == 32 - if( threadIdx.x == 0 ) { - shared[ threadIdx.y ] = sum; - shared[ threadIdx.y + 1024] = parent_weight; - } - - __syncthreads(); - sum = shared[ threadIdx.x ]; - sum = warpReduceSum( sum ); - - __syncthreads(); - parent_weight = shared[ threadIdx.x + 1024]; - parent_weight = __shfl_sync(0xffffffff, parent_weight, iParent % 32, 32 ); - - if( threadIdx.x == 0 and threadIdx.y == 0 ) - weights_device[ipt] *= parent_weight / sum; - - - } - - -} - - - -__global__ void modify_weights_ssf_kernel( - size_t npts, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - const double* dist_nearest_device, - double* weights_device -) { - - // Frisch partition functions - auto gFrisch = [](double x) { - - const double s_x = x / magic_ssf_factor<>; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; - }; - - auto sFrisch = [&] (double x) { - const double g = 0.5 * (1. - gFrisch(x)); - return (x >= magic_ssf_factor<>) ? 0. : (x <= -magic_ssf_factor<>) ? 1. : g; - }; - - constexpr double weight_tol = 1e-10; - - __shared__ double shared[2048]; - for( int ipt = blockIdx.x; ipt < npts; ipt += gridDim.x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * natoms; - const double dist_cutoff = 0.5 * (1 - magic_ssf_factor<> ) * - dist_nearest_device[ipt]; - if( local_dist_scratch[iParent] < dist_cutoff ) continue; - - for( int iCenter = threadIdx.y; iCenter < natoms; iCenter += blockDim.y ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * natoms; - - double ps = 1.; - for( int jCenter = threadIdx.x; jCenter < natoms; jCenter += blockDim.x ) - if( fabs(ps) > weight_tol ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - const double s = sFrisch( mu ); - ps *= (iCenter == jCenter) ? 1. 
: s ; - - } - - ps = warp_prod_reduce( ps ); // XXX: Assumes blockDim.x == 32 - - if( iCenter == iParent ) parent_weight = ps; - - sum += ps; - - } - - // XXX: Assumes blockDim.x == blockDim.y == 32 - if( threadIdx.x == 0 ) { - shared[ threadIdx.y ] = sum; - shared[ threadIdx.y + 1024] = parent_weight; - } - - __syncthreads(); - sum = shared[ threadIdx.x ]; - sum = warpReduceSum( sum ); - - __syncthreads(); - parent_weight = shared[ threadIdx.x + 1024]; - parent_weight = __shfl_sync(0xffffffff, parent_weight, iParent % 32, 32 ); - - if( threadIdx.x == 0 and threadIdx.y == 0 ) - weights_device[ipt] *= parent_weight / sum; - - - } - - -} -#endif - -// SIMT over points: 1D kernel -__global__ void modify_weights_ssf_kernel_1d( - size_t npts, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - const double* dist_nearest_device, - double* weights_device -) { - - // Frisch partition functions - auto gFrisch = [](double x) { - - const double s_x = x / magic_ssf_factor<>; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; - }; - -#if 0 - auto sFrisch = [&] (double x) { - const double g = 0.5 * (1. - gFrisch(x)); - return (x >= magic_ssf_factor<>) ? 0. : (x <= -magic_ssf_factor<>) ? 1. : g; - }; -#else - auto sFrisch = [&] (double x) { - if( fabs(x) < magic_ssf_factor<> ) return 0.5 * (1. - gFrisch(x)); - else if( x >= magic_ssf_factor<> ) return 0.; - else return 1.; - }; -#endif - - constexpr double weight_tol = 1e-10; - - const int tid_x = threadIdx.x + blockIdx.x * blockDim.x; - const int nt_x = blockDim.x * gridDim.x; - - //__shared__ double shared[2048]; - for( int ipt = tid_x; ipt < npts; ipt += nt_x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * natoms; - const double dist_cutoff = 0.5 * (1 - magic_ssf_factor<> ) * - dist_nearest_device[ipt]; - if( local_dist_scratch[iParent] < dist_cutoff ) continue; - -#if 0 - for( int iCenter = 0; iCenter < natoms; iCenter++ ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * natoms; - - double ps = 1.; - for( int jCenter = 0; jCenter < natoms; jCenter++ ) - if( fabs(ps) > weight_tol ) { - if( iCenter != jCenter ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - ps *= sFrisch( mu ); - - } - } else break; - - //__syncwarp(); - - if( iCenter == iParent ) parent_weight = ps; - - sum += ps; - - } -#else - - // Do iParent First - { - - const double ri = local_dist_scratch[ iParent ]; - const double* const local_rab = RAB + iParent * natoms; - - parent_weight = 1.; - for( int jCenter = 0; jCenter < natoms; jCenter++ ) - if( parent_weight > weight_tol ) { - if( iParent != jCenter ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - parent_weight *= sFrisch( mu ); - - } - } else break; - - //__syncwarp(); - sum += parent_weight; - - } - - if( parent_weight < eps_d ) { - weights_device[ipt] = 0.; - continue; - } - - for( int iCenter = 0; iCenter < natoms; iCenter++ ) - if( iParent != iCenter ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * 
natoms; - - double ps = 1.; - for( int jCenter = 0; jCenter < natoms; jCenter++ ) - if( ps > weight_tol ) { - if( iCenter != jCenter ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - ps *= sFrisch( mu ); - - } - } else break; - - //__syncwarp(); - sum += ps; - - } - -#endif - - weights_device[ipt] *= parent_weight / sum; - - - } - - -} - -__device__ __inline__ double gFrisch(double x) { - // Frisch partition functions -// const double s_x = x / magic_ssf_factor<>; - const double s_x = x * 1.5625; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return ((35.) *(s_x - s_x3) + (21.) *s_x5 - (5.) *s_x7); -} - - -__device__ __inline__ double sFrisch(double x) { - //double frisch_val = (0.5 - (0.5/ 16.0) * gFrisch(x)); - - if( fabs(x) < magic_ssf_factor<> ) return (0.5 - (0.5/ 16.0) * gFrisch(x)); - else if( x >= magic_ssf_factor<> ) return 0.; - else return 1.; -} - -__global__ __launch_bounds__(weight_thread_block, weight_thread_block_per_sm) -void modify_weights_ssf_kernel_2d( - size_t npts, - size_t LDatoms, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - const double* dist_nearest_device, - double* weights_device -) { - constexpr double weight_tol = 1e-10; - int natom_block = ((natoms + blockDim.x - 1) / blockDim.x) * blockDim.x; - - const int tid_x = threadIdx.y + blockIdx.y * blockDim.y; - const int nt_x = blockDim.y * gridDim.y; - - __shared__ int jCounter_sm[max_warps_per_thread_block]; - int* jCounter = reinterpret_cast(jCounter_sm) + threadIdx.y; - - // Each warp will work together on a point - for( int ipt = tid_x; ipt < npts; ipt += nt_x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * LDatoms; - const double dist_cutoff = 0.5 * (1 - magic_ssf_factor<> ) * - dist_nearest_device[ipt]; - if( local_dist_scratch[iParent] < dist_cutoff ) continue; - - // Do iParent First - { - - const double ri = local_dist_scratch[ iParent ]; - const double* const local_rab = RAB + iParent * LDatoms; - - parent_weight = 1.; - for( int jCenter = threadIdx.x; jCenter < natom_block; jCenter+=blockDim.x ) { - double contribution = 1.0; - if (jCenter < natoms && iParent != jCenter) { - const double rj = local_dist_scratch[ jCenter ]; - const double mu = (ri - rj) * local_rab[ jCenter ]; // XXX: RAB is symmetric - contribution = sFrisch( mu ); - } - contribution = warpReduceProd(contribution); - parent_weight *= contribution; - - if (parent_weight < weight_tol) break; - } - } - - if( parent_weight < eps_d ) { - if (threadIdx.x == 0) - weights_device[ipt] = 0.; - __syncwarp(); - continue; - } - - // Initialize each counter to 0 - if (threadIdx.x == 0) { - jCounter[0] = 0; - } - __syncwarp(); - - // Each thread will process an iCenter. Atomic operations are used to assign - // an iCenter value to each thread. 
- int iCenter = atomicAdd(jCounter, 1); - if (iCenter >= iParent) iCenter++; // iCenter == iParent is skipped - - // The entire warp processes the same jCenter value at the same time - int jCenter = 0; - - const double* local_rab = RAB + iCenter * LDatoms; - double ri = local_dist_scratch[ iCenter ]; - double ps = 1.; - int iCount = 0; - int cont = (iCenter < natoms); - - // We will continue iterating until all of the threads have cont set to 0 - while (__any_sync(0xffffffff, cont)) { - if (cont) { - double2 rj[weight_unroll/2]; - double2 rab_val[weight_unroll/2]; - double mu[weight_unroll]; - iCount += weight_unroll; - - #pragma unroll - for (int k = 0; k < weight_unroll/2; k++) { - rj[k] = *((double2*)(local_dist_scratch + jCenter) + k); - rab_val[k] = *((double2*)(local_rab + jCenter) + k); - } - - #pragma unroll - for (int k = 0; k < weight_unroll/2; k++) { - mu[2*k+0] = (ri - rj[k].x) * rab_val[k].x; // XXX: RAB is symmetric - mu[2*k+1] = (ri - rj[k].y) * rab_val[k].y; - } - - #pragma unroll - for (int k = 0; k < weight_unroll; k++) { - if((iCenter != jCenter + k) && (jCenter + k < natoms)) { - mu[k] = sFrisch( mu[k] ); - ps *= mu[k]; - } - } - - // A thread is done with a iCenter based on 2 conditions. Weight tolerance - // Or if it has seen all of the jCenters - if( !(ps > weight_tol && iCount < LDatoms )) { - // In the case were the thread is done, it begins processing another iCenter - sum += ps; - iCenter = atomicAdd(jCounter, 1); - if (iCenter >= iParent) iCenter++; - - // If there are no more iCenters left to process, it signals it is ready to exit - cont = (iCenter < natoms); - ri = local_dist_scratch[ iCenter ]; - local_rab = RAB + iCenter * LDatoms; - ps = 1.; - iCount = 0; - } - } - // Wraps jCenter around. This was faster than modulo - jCenter += weight_unroll; - jCenter = (jCenter < LDatoms) ? jCenter : 0; - } - - // All of the threads then sum their contributions. Only thread 0 needs to add the parent - // contribution. 
- __syncwarp(); - sum = warpReduceSum(sum); - if (threadIdx.x == 0) { - sum += parent_weight; - weights_device[ipt] *= parent_weight / sum; - } - - __syncwarp(); - - } -} - - -void cuda_reciprocal(size_t length, double* vec, cudaStream_t stream) { - dim3 threads(max_threads_per_thread_block); - dim3 blocks( get_device_sm_count(0) ); - reciprocal_kernel<<>>(length, vec); -} - - -template -void partition_weights_cuda_SoA( XCWeightAlg weight_alg, - size_t npts, - size_t LDatoms, - size_t natoms, - const F* points_device, - const int32_t* iparent_device, - const F* dist_nearest_device, - const F* rab_device, - const F* atomic_coords_device, - F* weights_device, - F* dist_scratch_device, - cudaStream_t stream ) { - - - - // Evaluate point-to-atom collocation - { - const int distance_thread_y = max_warps_per_thread_block / 2; - dim3 threads( warp_size, distance_thread_y ); - dim3 blocks( util::div_ceil( natoms, threads.x), - util::div_ceil( npts, threads.y * distance_thread_y) ); - - compute_point_center_dist<<< blocks, threads, 0, stream>>>( - npts, LDatoms, natoms, atomic_coords_device, points_device, dist_scratch_device - ); - - } - const bool partition_weights_1d_kernel = true; - - if( partition_weights_1d_kernel ) { - - dim3 threads( warp_size, weight_thread_block / warp_size ); - dim3 blocks( 1, get_device_sm_count(0) * weight_thread_block_per_sm); - modify_weights_ssf_kernel_2d<<< blocks, threads, 0, stream >>>( - npts, LDatoms, natoms, rab_device, atomic_coords_device, dist_scratch_device, - iparent_device, dist_nearest_device, weights_device - ); - - } else { - -#if 0 - dim3 threads( 32, 32 ); - dim3 blocks ( npts, 1 ); - - if( weight_alg == XCWeightAlg::SSF ) - modify_weights_ssf_kernel<<< blocks, threads, 0, stream >>>( - npts, natoms, rab_device, atomic_coords_device, dist_scratch_device, - iparent_device, dist_nearest_device, weights_device - ); - else - modify_weights_becke_kernel<<< blocks, threads, 0, stream >>>( - npts, natoms, rab_device, atomic_coords_device, dist_scratch_device, - iparent_device, weights_device - ); -#endif - - } - - -} - -template -void partition_weights_cuda_SoA( XCWeightAlg weight_alg, - size_t npts, - size_t LDatoms, - size_t natoms, - const double* points_device, - const int32_t* iparent_device, - const double* dist_nearest_device, - const double* rab_device, - const double* atomic_coords_device, - double* weights_device, - double* dist_scratch_device, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_weights.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_weights.hpp deleted file mode 100644 index 2418cfc..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_weights.hpp +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once -#include -#include -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - - -void cuda_reciprocal(size_t length, double* vec, cudaStream_t stream); - -template -void partition_weights_cuda_SoA( XCWeightAlg weight_alg, - size_t npts, - size_t LDatoms, - size_t natoms, - const F* points_device, - const int32_t* iparent_device, - const F* dist_nearest_device, - const F* rab_device, - const F* atomic_coords_device, - F* weights_device, - F* dist_scratch_device, - cudaStream_t stream ); - - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_zmat.cu b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_zmat.cu deleted file mode 100644 index df0cbe3..0000000 --- 
a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_zmat.cu +++ /dev/null @@ -1,140 +0,0 @@ -#include "device/cuda/cuda_zmat.hpp" -#include -#include "device/cuda/cuda_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - - -template -__global__ void zmat_lda_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - const auto nbf = task.nbe; - const auto* vrho_device = task.vrho; - - const auto* basis_eval_device = task.bf; - - auto* z_matrix_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nbf ) { - - const size_t ibfoff = tid_y * npts + tid_x; - const double fact = 0.5 * vrho_device[tid_x]; - - z_matrix_device[ ibfoff ] = fact * basis_eval_device[ ibfoff ]; - - } - -} - - - - -template -void zmat_lda_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ) { - - - dim3 threads(warp_size,max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - - zmat_lda_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - -} - -template -void zmat_lda_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - - - - -template -__global__ void zmat_gga_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - const auto nbf = task.nbe; - const auto* vrho_device = task.vrho; - const auto* vgamma_device = task.vgamma; - const auto* den_x_eval_device = task.ddenx; - const auto* den_y_eval_device = task.ddeny; - const auto* den_z_eval_device = task.ddenz; - - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - - auto* z_matrix_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nbf ) { - - const size_t ibfoff = tid_y * npts + tid_x; - const double fact_1 = 0.5 * vrho_device[tid_x] ; - const double fact_2 = 2.0 * vgamma_device[tid_x]; - - const double dx = den_x_eval_device[ tid_x ] * dbasis_x_eval_device[ ibfoff ]; - const double dy = den_y_eval_device[ tid_x ] * dbasis_y_eval_device[ ibfoff ]; - const double dz = den_z_eval_device[ tid_x ] * dbasis_z_eval_device[ ibfoff ]; - - z_matrix_device[ ibfoff ] = - fact_1 * basis_eval_device[ ibfoff ] + fact_2 * ( dx + dy + dz ); - - } -} - -template -void zmat_gga_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ) { - - - dim3 threads(warp_size,max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - - zmat_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - -} -template -void zmat_gga_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -} -} -} - diff --git 
a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_zmat.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_zmat.hpp deleted file mode 100644 index 58769d8..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/cuda_zmat.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace cuda { - -using namespace GauXC::cuda; - -template -void zmat_lda_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -template -void zmat_gga_cuda( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - cudaStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/gauxc-cuda.cmake b/third_party/gauxc/attic/src/new_integrator/device/cuda/gauxc-cuda.cmake deleted file mode 100644 index e95000a..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/gauxc-cuda.cmake +++ /dev/null @@ -1,49 +0,0 @@ -if( NOT TARGET CUDA::cublas ) - find_package( CUDAToolkit REQUIRED ) -endif() -include( gauxc-cub ) - -target_sources( gauxc PRIVATE - # Common CUDA Utilities - device/cuda/collocation_device.cu - device/cuda/xc_cuda_data.cxx - device/cuda/cuda_weights.cu - device/cuda/cuda_pack_density.cu - device/cuda/cuda_eval_denvars.cu - device/cuda/cublas_extensions.cu - device/cuda/cuda_inc_potential.cu - device/cuda/cuda_device_properties.cxx - - # XC Specific - device/cuda/cuda_zmat.cu - - # Drivers - device/cuda/local_work_replicated_incore_exc_vxc.cxx - -) - -target_compile_features( gauxc PRIVATE cuda_std_14 ) -target_compile_options( gauxc - PRIVATE - $<$: -Xcudafe --diag_suppress=partial_override -Xptxas -v > -) - - -if( GAUXC_ENABLE_MAGMA ) - - message( STATUS "MAGMA Has Been Enabled" ) - find_package( MAGMA REQUIRED ) - target_link_libraries( gauxc PUBLIC MAGMA::magma ) - -else() - - message( STATUS "MAGMA Has Been Explicitly Disabled" ) - -endif() - -if(NOT GAUXC_LINK_CUDA_STATIC) - target_link_libraries( gauxc PUBLIC CUDA::cublas ) -else() - target_link_libraries( gauxc PUBLIC CUDA::cublas_static ) -endif() -target_link_libraries( gauxc PRIVATE $ ) diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/local_work_replicated_incore_exc_vxc.cxx b/third_party/gauxc/attic/src/new_integrator/device/cuda/local_work_replicated_incore_exc_vxc.cxx deleted file mode 100644 index 4dc2826..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/local_work_replicated_incore_exc_vxc.cxx +++ /dev/null @@ -1,422 +0,0 @@ -#include -#include -#include - -#include "device/cuda/cuda_weights.hpp" -#include "device/cuda/collocation_device.hpp" -#include "device/cuda/cuda_pack_density.hpp" -#include "device/cuda/cuda_inc_potential.hpp" -#include "device/cuda/cuda_eval_denvars.hpp" -#include "device/cuda/cuda_zmat.hpp" -#include "common/integrator_common.hpp" - -#include "device/cuda/cublas_extensions.hpp" -#include "device/cuda/local_work_replicated_incore_exc_vxc.hpp" - -#include "device/cuda/xc_cuda_data.hpp" - -namespace GauXC { - -namespace integrator::cuda { - -using namespace GauXC::cuda::blas; - - -template -using cuda_task_iterator = typename std::vector>::iterator; - -template -void local_work_replicated_density_incore_exc_vxc( - XCWeightAlg weight_alg, - const functional_type& func, - XCCudaData& cuda_data, - cuda_task_iterator task_begin, - cuda_task_iterator task_end -) { - - const auto ntasks = std::distance( task_begin, task_end ); - const auto nbf = cuda_data.nbf; 
- - // Get batch statistics for batches to process - auto nbe_comparator = - []( const auto& a, const auto& b ){ return a.nbe < b.nbe; }; - auto npts_comparator = - []( const auto& a, const auto& b ){ return a.npts < b.npts; }; - auto nshells_comparator = - []( const auto& a, const auto& b ){ return a.nshells < b.nshells; }; - - auto [min_nbe_it, max_nbe_it] = - std::minmax_element( task_begin, task_end, nbe_comparator ); - auto [min_npts_it, max_npts_it] = - std::minmax_element( task_begin, task_end, npts_comparator ); - auto [min_nshells_it, max_nshells_it] = - std::minmax_element( task_begin, task_end, nshells_comparator ); - - const auto min_nbe = min_nbe_it->nbe; - const auto max_nbe = max_nbe_it->nbe; - const auto min_npts = min_npts_it->npts; - const auto max_npts = max_npts_it->npts; - const auto min_nshells = min_nshells_it->nshells; - const auto max_nshells = max_nshells_it->nshells; - - util::unused( min_nbe, min_npts, min_nshells ); - - const size_t total_npts = - std::accumulate( task_begin, task_end, 0ul, - []( const auto& a, const auto& b ) { return a + b.npts; } ); - - - // Aliases - cudaStream_t master_stream = *cuda_data.master_stream; - cublasHandle_t master_handle = *cuda_data.master_handle; - -#ifdef GAUXC_ENABLE_MAGMA - magma_queue_t master_queue = *cuda_data.master_magma_queue; -#endif - - auto* dmat_device = cuda_data.dmat_device; - - auto* shells_device = cuda_data.shells_device; - auto* tasks_device = cuda_data.device_tasks; - auto* dmat_array_device = cuda_data.dmat_array_device; - auto* zmat_array_device = cuda_data.zmat_array_device; - auto* bf_array_device = cuda_data.bf_array_device; - auto* weights_device = cuda_data.weights_device_buffer; - auto* dist_scratch_device = cuda_data.dist_scratch_device; - - auto* den_eval_device = cuda_data.den_eval_device; - auto* dden_x_eval_device = cuda_data.den_x_eval_device; - auto* dden_y_eval_device = cuda_data.den_y_eval_device; - auto* dden_z_eval_device = cuda_data.den_z_eval_device; - - auto* eps_eval_device = cuda_data.eps_eval_device; - auto* gamma_eval_device = cuda_data.gamma_eval_device; - auto* vrho_eval_device = cuda_data.vrho_eval_device; - auto* vgamma_eval_device = cuda_data.vgamma_eval_device; - - - auto* exc_device = cuda_data.exc_device; - auto* vxc_device = cuda_data.vxc_device; - auto* nel_device = cuda_data.nel_device; - auto* acc_scr_device = cuda_data.acc_scr_device; - - auto* m_array_device = cuda_data.m_array_device; - auto* n_array_device = cuda_data.n_array_device; - auto* k_array_device = cuda_data.k_array_device; - auto* lda_array_device = cuda_data.lda_array_device; - auto* ldb_array_device = cuda_data.ldb_array_device; - auto* ldc_array_device = cuda_data.ldc_array_device; - - - const auto* rab_device = cuda_data.rab_device; - const auto* coords_device = cuda_data.coords_device; - const auto* points_device = cuda_data.points_device_buffer; - const auto* iparent_device = cuda_data.iparent_device_buffer; - const auto* dist_nearest_device = cuda_data.dist_nearest_buffer; - - - - - // Evaluate Partition Weights - partition_weights_cuda_SoA( weight_alg, total_npts, cuda_data.LDatoms, cuda_data.natoms, - points_device, iparent_device, dist_nearest_device, - rab_device, coords_device, weights_device, - dist_scratch_device, master_stream ); - - - // Evaluate Collocation - if constexpr ( n_deriv == 1 ) - eval_collocation_masked_combined_deriv1( ntasks, max_npts, max_nshells, - shells_device, tasks_device, - master_stream ); - else - eval_collocation_masked_combined( ntasks, max_npts, max_nshells, 
shells_device, - tasks_device, master_stream ); - - // Pack Density Submatrices - task_pack_density_matrix( ntasks, tasks_device, dmat_device, nbf, master_stream ); - - - // Form Z = P * X - if( cuda_data.batch_l3_blas ) { - -#ifdef GAUXC_ENABLE_MAGMA - - magmablas_dgemm_vbatched( MagmaNoTrans, MagmaNoTrans, - m_array_device, n_array_device, k_array_device, - 1., bf_array_device, ldb_array_device, - dmat_array_device, lda_array_device, - 0., zmat_array_device, ldc_array_device, - ntasks, master_queue ); - -#else - - throw std::runtime_error("BATCHED BLAS API NOT SUPPORTED"); - -#endif - - } else { - - int nstream = cuda_data.blas_streams.size(); - - // Wait for collocation etc - util::cuda_event master_event; - master_event.record( master_stream ); - for( int iS = 0; iS < nstream; ++iS ) - cuda_data.blas_streams[iS].wait( master_event ); - - // Do GEMM in round-robin - for( auto iT = 0; iT < ntasks; ++iT ) { - auto& task = *(task_begin + iT); - gemm( cuda_data.blas_handles[iT % nstream], CUBLAS_OP_N, CUBLAS_OP_N, - task.npts, task.nbe, task.nbe, 1., task.bf, task.npts, - task.nbe_scr, task.nbe, 0., task.zmat, task.npts ); - } - - // Record completion of BLAS ops - std::vector< util::cuda_event > blas_events( nstream ); - for( int iS = 0; iS < nstream; ++iS ) - blas_events[iS].record( cuda_data.blas_streams[iS] ); - - // Wait on master stream for all BLAS ops to complete - for( int iS = 0; iS < nstream; ++iS ) - cuda_data.master_stream->wait( blas_events[iS] ); - - } - - - - // Zero UVars - util::cuda_set_zero_async( total_npts, den_eval_device, master_stream, "DenZero" ); - if( func.is_gga() ) { - util::cuda_set_zero_async( total_npts, dden_x_eval_device, master_stream, - "DenXZero" ); - util::cuda_set_zero_async( total_npts, dden_y_eval_device, master_stream, - "DenYZero" ); - util::cuda_set_zero_async( total_npts, dden_z_eval_device, master_stream, - "DenZZero" ); - } - - // Evaluate UVars - if( func.is_gga() ) { - eval_uvars_gga_device( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - eval_vvars_gga_device( total_npts, dden_x_eval_device, dden_y_eval_device, - dden_z_eval_device, gamma_eval_device, master_stream ); - } else { - eval_uvars_lda_device( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - } - - // Evaluate XC Functional - if( func.is_gga() ) - func.eval_exc_vxc_device( total_npts, den_eval_device, gamma_eval_device, - eps_eval_device, vrho_eval_device, - vgamma_eval_device, master_stream ); - else - func.eval_exc_vxc_device( total_npts, den_eval_device, eps_eval_device, - vrho_eval_device, master_stream ); - - - // Factor weights into XC output - hadamard_product( master_handle, total_npts, 1, weights_device, 1, - eps_eval_device, 1 ); - hadamard_product( master_handle, total_npts, 1, weights_device, 1, - vrho_eval_device, 1 ); - if( func.is_gga() ) - hadamard_product( master_handle, total_npts, 1, weights_device, 1, - vgamma_eval_device, 1 ); - - // Accumulate EXC / NEL - gdot( master_handle, total_npts, weights_device, 1, - den_eval_device, 1, acc_scr_device, nel_device ); - gdot( master_handle, total_npts, eps_eval_device, 1, - den_eval_device, 1, acc_scr_device, exc_device ); - - // Evaluate Z Matrix - if( func.is_gga() ) - zmat_gga_cuda( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - else - zmat_lda_cuda( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - - - - // Accumulate packed VXC = X * Z**T + Z * X**T - - - if( cuda_data.batch_l3_blas ) { - -#ifdef GAUXC_ENABLE_MAGMA - - // XXX: Only updates LT - magmablas_dsyr2k_vbatched( 
MagmaLower, MagmaTrans, - n_array_device, m_array_device, - 1., bf_array_device, ldb_array_device, - zmat_array_device, ldc_array_device, - 0., dmat_array_device, lda_array_device, - ntasks, master_queue ); - -#else - - throw std::runtime_error("BATCHED BLAS API NOT SUPPORTED"); - -#endif - } else { - - int nstream = cuda_data.blas_streams.size(); - - // Wait for zmat, etc - util::cuda_event master_event; - master_event.record( master_stream ); - for( int iS = 0; iS < nstream; ++iS ) - cuda_data.blas_streams[iS].wait( master_event ); - - // Do SYR2K in round-robin - for( auto iT = 0; iT < ntasks; ++iT ) { - auto& task = *(task_begin + iT); - syr2k( cuda_data.blas_handles[iT % nstream], CUBLAS_FILL_MODE_LOWER, - CUBLAS_OP_T, task.nbe, task.npts, 1., task.bf, task.npts, - task.zmat, task.npts, 0., task.nbe_scr, task.nbe ); - } - - // Record completion of BLAS ops - std::vector< util::cuda_event > blas_events( nstream ); - for( int iS = 0; iS < nstream; ++iS ) - blas_events[iS].record( cuda_data.blas_streams[iS] ); - - // Wait on master stream for all BLAS ops to complete - for( int iS = 0; iS < nstream; ++iS ) - cuda_data.master_stream->wait( blas_events[iS] ); - } - - // Increment global VXC - task_inc_potential( ntasks, tasks_device, vxc_device, nbf, master_stream ); - - - // Synchronize on master stream - // XXX: There's no lifetime issues in this driver, should look into - // avoid this sync to allow for overlap with the host packing - cudaStreamSynchronize( master_stream ); - -} - - -template -void local_work_replicated_incore_exc_vxc_impl( - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCDeviceData & device_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* EXC, - F* NEL -) { - - auto& cuda_data = dynamic_cast< XCCudaData& >( device_data ); - - auto task_comparator = []( const XCTask& a, const XCTask& b ) { - return (a.points.size() * a.nbe) > (b.points.size() * b.nbe); - }; - std::sort( local_work_begin, local_work_end, task_comparator ); - - - const auto nbf = basis.nbf(); - const auto natoms = meta.natoms(); - const auto LDatoms = cuda_data.LDatoms; - - // Send static data to the device - - // Density - util::cuda_copy( nbf * nbf, cuda_data.dmat_device, P, "P H2D" ); - - // Shells: TODO avoid host copy? 
- std::vector> shells( basis ); - util::cuda_copy( shells.size(), cuda_data.shells_device, shells.data(), - "Shells H2D" ); - - // RAB - util::cuda_copy_2d( cuda_data.rab_device, LDatoms * sizeof(F), - meta.rab().data(), natoms * sizeof(F), - natoms * sizeof(F), natoms, "RAB H2D"); - // This could probably happen on the host - cuda_reciprocal(natoms * LDatoms, cuda_data.rab_device, 0); - - // Atomic coordinates - std::vector coords( 3*natoms ); - for( auto i = 0ul; i < natoms; ++i ) { - coords[ 3*i + 0 ] = mol[i].x; - coords[ 3*i + 1 ] = mol[i].y; - coords[ 3*i + 2 ] = mol[i].z; - } - util::cuda_copy( 3 * natoms, cuda_data.coords_device, coords.data(), - "Coords H2D" ); - - - // Zero out XC quantities - util::cuda_set_zero( nbf * nbf, cuda_data.vxc_device, "VXC Zero" ); - util::cuda_set_zero( 1 , cuda_data.exc_device, "EXC Zero" ); - util::cuda_set_zero( 1 , cuda_data.nel_device, "NEL Zero" ); - - - - // Processes batches in groups that saturadate available device memory - auto task_it = local_work_begin; - while( task_it != local_work_end ) { - - // Determine next task batch, send relevant data to device - auto [it, tasks_device] = - cuda_data.generate_buffers( basis, task_it, local_work_end ); - - - // Process the batches - local_work_replicated_density_incore_exc_vxc( - weight_alg, func, cuda_data, tasks_device.begin(), tasks_device.end() - ); - - task_it = it; - - } - - // Receive XC terms from host - util::cuda_copy( nbf * nbf, VXC, cuda_data.vxc_device, "VXC D2H" ); - - util::cuda_copy( 1, EXC, cuda_data.exc_device, "EXC D2H" ); - util::cuda_copy( 1, NEL, cuda_data.nel_device, "NEL D2H" ); - - // Symmetrize VXC - for( int32_t j = 0; j < nbf; ++j ) - for( int32_t i = j+1; i < nbf; ++i ) - VXC[ j + i*nbf ] = VXC[ i + j*nbf ]; - -} - - -#define CUDA_IMPL( F, ND ) \ -template \ -void local_work_replicated_incore_exc_vxc_impl(\ - XCWeightAlg weight_alg,\ - XCIntegratorState state,\ - const functional_type& func,\ - const BasisSet& basis,\ - const Molecule & mol,\ - const MolMeta & meta,\ - XCDeviceData & device_data,\ - host_task_iterator local_work_begin,\ - host_task_iterator local_work_end,\ - const F* P,\ - F* VXC,\ - F* exc,\ - F* n_el\ -) - -CUDA_IMPL( double, 0 ); -CUDA_IMPL( double, 1 ); - -} -} - - diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/local_work_replicated_incore_exc_vxc.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/local_work_replicated_incore_exc_vxc.hpp deleted file mode 100644 index dfc65a8..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/local_work_replicated_incore_exc_vxc.hpp +++ /dev/null @@ -1,51 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include - -#include - -#include "device/xc_device_data.hpp" - -namespace GauXC { - -namespace integrator::cuda { - -using host_task_iterator = std::vector::iterator; - -template -void local_work_replicated_incore_exc_vxc_impl( - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCDeviceData & device_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* exc, - F* n_el -); - - -template -inline void local_work_replicated_incore_exc_vxc( size_t n_deriv, Args&&... args ) { - if( n_deriv == 0 ) - local_work_replicated_incore_exc_vxc_impl( std::forward(args)... ); - else if( n_deriv == 1 ) - local_work_replicated_incore_exc_vxc_impl( std::forward(args)... 
); - else - throw std::runtime_error("MGGA NYI"); -} - - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/xc_cuda_data.cxx b/third_party/gauxc/attic/src/new_integrator/device/cuda/xc_cuda_data.cxx deleted file mode 100644 index 5a6df34..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/xc_cuda_data.cxx +++ /dev/null @@ -1,552 +0,0 @@ -#include "device/cuda/xc_cuda_data.hpp" -#include - -#include "device/buffer_adaptor.hpp" -#include "common/integrator_common.hpp" -#include "device/cuda/cuda_device_properties.hpp" - -namespace GauXC { - - -namespace integrator::device { - -template -std::shared_ptr< XCDeviceData > make_device_data() { - return std::make_shared< XCCudaData >(); -} - -template std::shared_ptr> make_device_data(); - -} - - - - - - - - -template -XCCudaData::XCCudaData( bool _batch_l3_blas ): -#ifdef GAUXC_ENABLE_MAGMA - batch_l3_blas(_batch_l3_blas) -#else - batch_l3_blas(false) -#endif -{ - - // TODO: Expose this - double fill_fraction = 0.9; - - cudaError_t stat; - - // Get Total Available Memory - size_t cuda_avail, cuda_total; - stat = cudaMemGetInfo( &cuda_avail, &cuda_total ); - GAUXC_CUDA_ERROR( "MemInfo Failed", stat ); - - // Allocate up to fill_fraction - devmem_sz = fill_fraction * cuda_avail; - stat = cudaMalloc( &device_ptr, devmem_sz ); - GAUXC_CUDA_ERROR( "CUDA Malloc Failed", stat ); - - // Create CUDA Stream and CUBLAS Handles and make them talk to eachother - master_stream = std::make_unique< util::cuda_stream >(); - master_handle = std::make_unique< util::cublas_handle >(); - - cublasSetStream( *master_handle, *master_stream ); - -#ifdef GAUXC_ENABLE_MAGMA - // Create MAGMA Queue from CUDA Stream and CUBLAS Handle - master_magma_queue = - std::make_unique< util::magma_queue >( 0, *master_stream, *master_handle ); -#endif - - if( not batch_l3_blas ) { - - // Create BLAS streams - blas_streams.resize(4); - blas_handles.resize(4); - for( auto i = 0; i < 4; ++i ) - cublasSetStream( blas_handles[i], blas_streams[i] ); - - } - -} - - - -template -XCCudaData::~XCCudaData() noexcept { - if( device_ptr ) util::cuda_free( device_ptr ); -} - - - - - - - -template -void XCCudaData::allocate_static_data( size_t _natoms, - size_t _n_deriv, - size_t _nbf, - size_t _nshells ) { - - - // Save state - nshells = _nshells; - nbf = _nbf; - n_deriv = _n_deriv; - natoms = _natoms; - - LDatoms = util::div_ceil( natoms, cuda::weight_unroll ) * cuda::weight_unroll; - - // Allocate static memory with proper alignment - buffer_adaptor mem( device_ptr, devmem_sz ); - - shells_device = mem.aligned_alloc>( nshells ); - exc_device = mem.aligned_alloc( 1 ); - nel_device = mem.aligned_alloc( 1 ); - acc_scr_device = mem.aligned_alloc( 1 ); - rab_device = mem.aligned_alloc( LDatoms * natoms, sizeof(double2)); - coords_device = mem.aligned_alloc( 3 * natoms ); - - vxc_device = mem.aligned_alloc( nbf * nbf ); - dmat_device = mem.aligned_alloc( nbf * nbf ); - - // Get current stack location - dynmem_ptr = mem.stack(); - dynmem_sz = mem.nleft(); - -} - - - - -using task_iterator = std::vector< XCTask >::iterator; -template -using device_task_container = std::vector< cuda::XCTaskDevice >; - -template -std::tuple< task_iterator, device_task_container > - XCCudaData::generate_buffers( const BasisSet& basis, - task_iterator task_begin, - task_iterator task_end ) { - - // Host data packing arrays - std::vector< std::array > points_pack; - std::vector< double > weights_pack; - std::vector< size_t > shell_list_pack; - std::vector< size_t > shell_offs_pack; - 
std::vector< std::array > submat_cut_pack; - std::vector< int32_t > submat_block_pack; - std::vector< int32_t > iparent_pack; - std::vector< double > dist_nearest_pack; - - // Host copies for batched GEMM/SYRK arrays - std::vector< double* > dmat_array, bf_array, zmat_array; - std::vector< int > m_array, n_array, k_array, lda_array, ldb_array, ldc_array; - - device_task_container tasks_device; - - - auto concat_iterable = []( auto& a, const auto& b ) { - a.insert( a.end(), b.begin(), b.end() ); - }; - - - size_t ntask = 0; - size_t total_npts = 0; - size_t total_nbe_nbe = 0; - size_t total_nbe_npts = 0; - size_t total_nshells = 0; - size_t total_ncut = 0; - size_t total_nblock = 0; - size_t memleft = dynmem_sz; - - uint32_t submat_chunk_size = cuda::get_submat_cut_block(nbf, 0); - - // Offset memory by the static requirement of an extra pointer element - // for each of the size batch arrays in MAGMA - memleft -= 6 * sizeof(int); //M,N,K,LDA,LDB,LDC - - auto task_it = task_begin; - while( task_it != task_end ) { - - auto iAtom = task_it->iParent; - auto points = task_it->points ; - auto weights = task_it->weights ; - auto shell_list = task_it->shell_list; - auto nbe = task_it->nbe; - auto dist_nearest = task_it->dist_nearest; - - // Generate map from compressed to non-compressed matrices - auto [submat_cut, submat_block] = integrator::gen_compressed_submat_map( basis, shell_list, nbf, submat_chunk_size ); - size_t ncut = submat_cut.size(); - size_t nblock = submat_block.size(); - size_t nshells = shell_list.size(); - size_t npts = points.size(); - - - size_t mem_points = 3 * npts; - size_t mem_weights = npts; - - size_t mem_shells = nshells; - size_t mem_shell_list = nshells; - size_t mem_shell_offs = nshells; - size_t mem_submat_cut = 3 * ncut; - size_t mem_submat_block = nblock; - - size_t mem_nbe_scr = nbe * nbe; - size_t mem_zmat = nbe * npts; - - size_t mem_bf = nbe * npts; - size_t mem_dbfx = mem_bf; - size_t mem_dbfy = mem_bf; - size_t mem_dbfz = mem_bf; - - size_t mem_den = npts; - size_t mem_denx = npts; - size_t mem_deny = npts; - size_t mem_denz = npts; - - size_t mem_eps = npts; - size_t mem_gamma = npts; - size_t mem_vrho = npts; - size_t mem_vgamma = npts; - - //size_t mem_partition_scr = natoms * npts; - size_t mem_dist_scr = LDatoms * npts; - size_t mem_iparent = npts; - size_t mem_dist_nearest = npts; - - size_t mem_batch_mat_arr = 3; // dmat/zmat/bf - size_t mem_batch_sz_arr = 6; // M/N/K/LDA/LDB/LDC - size_t mem_task = 1; - - - size_t mem_req_batch = - mem_points * sizeof(double) + - mem_weights * sizeof(double) + - mem_shells * sizeof(Shell) + - mem_shell_list * sizeof(size_t) + - mem_shell_offs * sizeof(size_t) + - mem_submat_cut * sizeof(int32_t) + - mem_submat_block * sizeof(int32_t) + - mem_nbe_scr * sizeof(double) + - mem_zmat * sizeof(double) + - mem_bf * sizeof(double) + - mem_dbfx * sizeof(double) + - mem_dbfy * sizeof(double) + - mem_dbfz * sizeof(double) + - mem_den * sizeof(double) + - mem_denx * sizeof(double) + - mem_deny * sizeof(double) + - mem_denz * sizeof(double) + - mem_eps * sizeof(double) + - mem_gamma * sizeof(double) + - mem_vrho * sizeof(double) + - mem_vgamma * sizeof(double) + - //mem_partition_scr * sizeof(double) + - mem_dist_scr * sizeof(double) + - mem_iparent * sizeof(int32_t) + - mem_dist_nearest * sizeof(double) + - mem_batch_mat_arr * sizeof(double*) + - mem_batch_sz_arr * sizeof(int32_t) + - mem_task * sizeof(cuda::XCTaskDevice); - - //std::cout << "Memory requirement for task " << ntask+1 << " " << mem_req_batch << " memleft " << 
memleft << std::endl; - - if( mem_req_batch > memleft ) break; - - // Update memory and increment task iterator - memleft -= mem_req_batch; - ntask++; - task_it++; - - // Update counters - total_npts += npts; - total_nbe_nbe += nbe*nbe; - total_nbe_npts += nbe*npts; - total_nshells += nshells; - total_ncut += ncut; - total_nblock += nblock; - - // Compute offsets - std::vector< size_t > shell_offs( nshells ); - shell_offs.at(0) = 0; - for( auto i = 1ul; i < nshells; ++i ) - shell_offs.at(i) = shell_offs.at(i-1) + - basis.at( shell_list.at(i-1) ).size(); - - - // Pack the data on host - concat_iterable( points_pack, points ); - concat_iterable( weights_pack, weights ); - concat_iterable( shell_list_pack, shell_list ); - concat_iterable( shell_offs_pack, shell_offs ); - concat_iterable( submat_cut_pack, submat_cut ); - concat_iterable( submat_block_pack, submat_block ); - - m_array.emplace_back( npts ); - n_array.emplace_back( nbe ); - k_array.emplace_back( nbe ); - - lda_array.emplace_back( nbe ); - ldb_array.emplace_back( npts ); - ldc_array.emplace_back( npts ); - - iparent_pack.insert( iparent_pack.end(), npts, iAtom ); - dist_nearest_pack.insert( dist_nearest_pack.end(), npts, dist_nearest ); - - // Add task - tasks_device.emplace_back(); - - tasks_device.back().nbe = nbe; - tasks_device.back().npts = npts; - tasks_device.back().ncut = ncut; - tasks_device.back().nblock = nblock; - tasks_device.back().nshells = nshells; - tasks_device.back().iParent = iAtom; - tasks_device.back().dist_nearest = dist_nearest; - } - - - std::cout << "XCDeviceData will stack allocate for " << tasks_device.size() << " tasks"; - std::cout << " Using chunk size of " << submat_chunk_size << std::endl; - - // Allocate out of dynamic memory - buffer_adaptor mem( dynmem_ptr, dynmem_sz ); - - // (possibly) Large types - important_shells_device = mem.aligned_alloc>( total_nshells ); - device_tasks = mem.aligned_alloc>( ntask ); - - // 64-bit types - nbe_scr_device = mem.aligned_alloc( total_nbe_nbe ); - zmat_device = mem.aligned_alloc( total_nbe_npts ); - bf_eval_device = mem.aligned_alloc( total_nbe_npts ); - dbf_x_eval_device = mem.aligned_alloc( total_nbe_npts ); - dbf_y_eval_device = mem.aligned_alloc( total_nbe_npts ); - dbf_z_eval_device = mem.aligned_alloc( total_nbe_npts ); - - den_eval_device = mem.aligned_alloc( total_npts ); - eps_eval_device = mem.aligned_alloc( total_npts ); - vrho_eval_device = mem.aligned_alloc( total_npts ); - - den_x_eval_device = mem.aligned_alloc( total_npts ); - den_y_eval_device = mem.aligned_alloc( total_npts ); - den_z_eval_device = mem.aligned_alloc( total_npts ); - gamma_eval_device = mem.aligned_alloc( total_npts ); - vgamma_eval_device = mem.aligned_alloc( total_npts ); - - points_device_buffer = mem.aligned_alloc( 3 * total_npts ); - weights_device_buffer = mem.aligned_alloc( total_npts ); - shell_list_device_buffer = mem.aligned_alloc( total_nshells ); - shell_offs_device_buffer = mem.aligned_alloc( total_nshells ); - submat_cut_device_buffer = mem.aligned_alloc( 3 * total_ncut ); - submat_block_device_buffer = mem.aligned_alloc( total_nblock ); - - dist_scratch_device = mem.aligned_alloc( LDatoms * total_npts, 2 * sizeof(double) ); - dist_nearest_buffer = mem.aligned_alloc( total_npts ); - - dmat_array_device = mem.aligned_alloc( ntask ); - zmat_array_device = mem.aligned_alloc( ntask ); - bf_array_device = mem.aligned_alloc( ntask ); - - // 32-bit types - m_array_device = mem.aligned_alloc( ntask + 1 ); - n_array_device = mem.aligned_alloc( ntask + 1 ); - 
k_array_device = mem.aligned_alloc( ntask + 1 ); - lda_array_device = mem.aligned_alloc( ntask + 1 ); - ldb_array_device = mem.aligned_alloc( ntask + 1 ); - ldc_array_device = mem.aligned_alloc( ntask + 1 ); - - iparent_device_buffer = mem.aligned_alloc( total_npts ); - - - // Update tasks with allocated pointers - { - double* points_ptr = points_device_buffer; - double* weights_ptr = weights_device_buffer; - - size_t* shell_list_ptr = shell_list_device_buffer; - size_t* shell_offs_ptr = shell_offs_device_buffer; - int32_t* submat_cut_ptr = submat_cut_device_buffer; - int32_t* submat_block_ptr = submat_block_device_buffer; - Shell * shells_ptr = important_shells_device; - double* nbe_ptr = nbe_scr_device; - double* zmat_ptr = zmat_device; - - double* bf_ptr = bf_eval_device; - double* dbfx_ptr = dbf_x_eval_device; - double* dbfy_ptr = dbf_y_eval_device; - double* dbfz_ptr = dbf_z_eval_device; - - double* den_ptr = den_eval_device; - double* ddenx_ptr = den_x_eval_device; - double* ddeny_ptr = den_y_eval_device; - double* ddenz_ptr = den_z_eval_device; - - double* eps_ptr = eps_eval_device; - double* gamma_ptr = gamma_eval_device; - double* vrho_ptr = vrho_eval_device; - double* vgamma_ptr = vgamma_eval_device; - - - double* dist_scratch_ptr = dist_scratch_device; - - for( auto& task : tasks_device ) { - - task.points = points_ptr; - task.weights = weights_ptr; - task.shell_list = shell_list_ptr; - task.shell_offs = shell_offs_ptr; - task.submat_cut = submat_cut_ptr; - task.submat_block = submat_block_ptr; - - task.shells = shells_ptr; - task.nbe_scr = nbe_ptr; - task.zmat = zmat_ptr; - task.bf = bf_ptr; - task.dbfx = dbfx_ptr; - task.dbfy = dbfy_ptr; - task.dbfz = dbfz_ptr; - task.den = den_ptr; - task.ddenx = ddenx_ptr; - task.ddeny = ddeny_ptr; - task.ddenz = ddenz_ptr; - - task.eps = eps_ptr; - task.gamma = gamma_ptr; - task.vrho = vrho_ptr; - task.vgamma = vgamma_ptr; - - task.dist_scratch = dist_scratch_ptr; - - auto npts = task.npts; - auto nbe = task.nbe; - auto nshells = task.nshells; - auto ncut = task.ncut; - auto nblock = task.nblock; - - points_ptr += 3 * npts; - weights_ptr += npts; - shell_list_ptr += nshells; - shell_offs_ptr += nshells; - submat_cut_ptr += 3 * ncut; - submat_block_ptr += nblock; - - shells_ptr += nshells; - nbe_ptr += nbe * nbe; - zmat_ptr += nbe * npts; - - bf_ptr += nbe * npts; - dbfx_ptr += nbe * npts; - dbfy_ptr += nbe * npts; - dbfz_ptr += nbe * npts; - - den_ptr += npts; - ddenx_ptr += npts; - ddeny_ptr += npts; - ddenz_ptr += npts; - - eps_ptr += npts; - gamma_ptr += npts; - vrho_ptr += npts; - vgamma_ptr += npts; - - dist_scratch_ptr += LDatoms * npts; - - - - // Batched LA - dmat_array.emplace_back( task.nbe_scr ); - bf_array.emplace_back( task.bf ); - zmat_array.emplace_back( task.zmat ); - } - - } // End task setup - - - - - auto copy_rev = [&]( size_t n, const auto* src, auto* dest, cudaStream_t stream, - std::string m ) { - util::cuda_copy_async( n, dest, src, stream, m ); - }; - - - - try { - - // Send the data to the device - copy_rev( 3*points_pack.size(), points_pack.data()->data(), - points_device_buffer, *master_stream, - "send points buffer" ); - copy_rev( weights_pack.size(), weights_pack.data(), - weights_device_buffer, *master_stream, - "send weights buffer" ); - - copy_rev( shell_list_pack.size(), shell_list_pack.data(), - shell_list_device_buffer, *master_stream, - "send_shell_list_buffer" ); - copy_rev( shell_offs_pack.size(), shell_offs_pack.data(), - shell_offs_device_buffer, *master_stream, - "send_shell_offs_buffer" ); -// 
std::cout << "Element size " << sizeof(std::get<0>(submat_cut_pack[0]) << std::endl; - copy_rev( 3 * submat_cut_pack.size(), submat_cut_pack.data()->data(), - submat_cut_device_buffer, *master_stream, - "send_submat_cut_buffer" ); - copy_rev( submat_block_pack.size(), submat_block_pack.data(), - submat_block_device_buffer, *master_stream, - "send_submat_block_buffer" ); - - copy_rev( tasks_device.size(), tasks_device.data(), device_tasks, - *master_stream, "send_tasks_device" ); - - - copy_rev( dmat_array.size(), dmat_array.data(), dmat_array_device, - *master_stream, "send dmat_array" ); - copy_rev( zmat_array.size(), zmat_array.data(), zmat_array_device, - *master_stream, "send zmat_array" ); - copy_rev( bf_array.size(), bf_array.data(), bf_array_device, - *master_stream, "send bf_array" ); - - copy_rev( m_array.size(), m_array.data(), m_array_device, - *master_stream, "send m_array" ); - copy_rev( n_array.size(), n_array.data(), n_array_device, - *master_stream, "send n_array" ); - copy_rev( k_array.size(), k_array.data(), k_array_device, - *master_stream, "send k_array" ); - - copy_rev( lda_array.size(), lda_array.data(), lda_array_device, - *master_stream, "send lda_array" ); - copy_rev( ldb_array.size(), ldb_array.data(), ldb_array_device, - *master_stream, "send ldb_array" ); - copy_rev( ldc_array.size(), ldc_array.data(), ldc_array_device, - *master_stream, "send ldc_array" ); - - copy_rev( iparent_pack.size(), iparent_pack.data(), - iparent_device_buffer, *master_stream, "send iparent" ); - copy_rev( dist_nearest_pack.size(), dist_nearest_pack.data(), - dist_nearest_buffer, *master_stream, "send dist_nearest" ); - - } catch(...) { - //teardown_(); throw; - throw; - } - - - // To avoid packed vectors going out of scope - cudaStreamSynchronize( *master_stream ); - - return std::make_tuple(task_it, tasks_device); -} - - -// Explicit Instantiations -template class XCCudaData; - -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/cuda/xc_cuda_data.hpp b/third_party/gauxc/attic/src/new_integrator/device/cuda/xc_cuda_data.hpp deleted file mode 100644 index 8f717c5..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/cuda/xc_cuda_data.hpp +++ /dev/null @@ -1,129 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "device/xc_device_data.hpp" - -#ifdef GAUXC_ENABLE_CUDA - -namespace GauXC { - -template -class XCCudaData : public XCDeviceData { -public: - - size_t nshells = 0; - size_t nbf = 0; - size_t n_deriv = 0; - size_t natoms = 0; - size_t LDatoms = 0; - - bool batch_l3_blas = true; - - void* device_ptr = nullptr; - void* dynmem_ptr = nullptr; - size_t devmem_sz = 0; - size_t dynmem_sz = 0; - - Shell* shells_device = nullptr; - Shell* important_shells_device = nullptr; - - F* vxc_device = nullptr; - F* nbe_scr_device = nullptr; - F* dmat_device = nullptr; - F* zmat_device = nullptr; - F* bf_eval_device = nullptr; - - F* dbf_x_eval_device = nullptr; - F* dbf_y_eval_device = nullptr; - F* dbf_z_eval_device = nullptr; - - F* den_eval_device = nullptr; - F* den_x_eval_device = nullptr; - F* den_y_eval_device = nullptr; - F* den_z_eval_device = nullptr; - F* eps_eval_device = nullptr; - F* gamma_eval_device = nullptr; - - F* vrho_eval_device = nullptr; - F* vgamma_eval_device = nullptr; - - - F* exc_device = nullptr; - F* nel_device = nullptr; - F* acc_scr_device = nullptr; - - F* rab_device = nullptr; - F* coords_device = nullptr; - - F** dmat_array_device = nullptr; - F** zmat_array_device 
= nullptr; - F** bf_array_device = nullptr; - - int* m_array_device = nullptr; - int* n_array_device = nullptr; - int* k_array_device = nullptr; - int* lda_array_device = nullptr; - int* ldb_array_device = nullptr; - int* ldc_array_device = nullptr; - - F* dist_scratch_device = nullptr; - - // Buffer Vars - F* points_device_buffer = nullptr; - F* weights_device_buffer = nullptr; - size_t* shell_list_device_buffer = nullptr; - size_t* shell_offs_device_buffer = nullptr; - int32_t* submat_cut_device_buffer = nullptr; - int32_t* submat_block_device_buffer = nullptr; - int32_t* iparent_device_buffer = nullptr; - F* dist_nearest_buffer = nullptr; - - cuda::XCTaskDevice* device_tasks = nullptr; - - // Execution management - std::unique_ptr master_stream = nullptr; - std::unique_ptr master_handle = nullptr; - -#ifdef GAUXC_ENABLE_MAGMA - std::unique_ptr master_magma_queue = nullptr; -#endif - - std::vector blas_streams; - std::vector blas_handles; - - XCCudaData( bool _batch_l3_blas = true ); - - ~XCCudaData() noexcept; - XCCudaData( const XCCudaData& ) = delete; - XCCudaData( XCCudaData&& ) noexcept = delete; - - - using task_iterator = std::vector< XCTask >::iterator; - using device_task_container = std::vector< cuda::XCTaskDevice >; - - - void allocate_static_data( size_t _natoms, - size_t _n_deriv, - size_t _nbf, - size_t _nshells ) override; - - - std::tuple< task_iterator, device_task_container > - generate_buffers( const BasisSet& basis, - task_iterator task_begin, - task_iterator task_end ); - -}; - -} - -#endif diff --git a/third_party/gauxc/attic/src/new_integrator/device/gauxc-device.cmake b/third_party/gauxc/attic/src/new_integrator/device/gauxc-device.cmake deleted file mode 100644 index fd0219e..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/gauxc-device.cmake +++ /dev/null @@ -1,17 +0,0 @@ -target_sources( gauxc PRIVATE - # Drivers - device/local_work_replicated_shellbatched_exc_vxc.cxx - - # Interfaces - device/incore_xc_device_integrator.cxx - device/shellbatched_xc_device_integrator.cxx -) - -if( GAUXC_ENABLE_CUDA ) - include( device/cuda/gauxc-cuda.cmake ) -endif() - - -if( GAUXC_ENABLE_HIP ) - include( device/hip/gauxc-hip.cmake ) -endif() diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation/collocation_angular_cartesian.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation/collocation_angular_cartesian.hpp deleted file mode 100644 index 5411d7b..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation/collocation_angular_cartesian.hpp +++ /dev/null @@ -1,308 +0,0 @@ -#pragma once -#include "collocation_device_constants.hpp" -#include - -#ifndef GPGAUEVAL_INLINE -# define GPGAUEVAL_INLINE __noinline__ -#endif - -namespace GauXC { -namespace integrator { -namespace hip { - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_0( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_0_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf_x; - - eval_y[npts * 0] = bf_y; - - eval_z[npts * 0] = bf_z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_1( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - 
T* __restrict__ eval -) { - - eval[npts * 0] = bf*x; - eval[npts * 1] = bf*y; - eval[npts * 2] = bf*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_1_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf + bf_x*x; - eval_x[npts * 1] = bf_x*y; - eval_x[npts * 2] = bf_x*z; - - eval_y[npts * 0] = bf_y*x; - eval_y[npts * 1] = bf + bf_y*y; - eval_y[npts * 2] = bf_y*z; - - eval_z[npts * 0] = bf_z*x; - eval_z[npts * 1] = bf_z*y; - eval_z[npts * 2] = bf + bf_z*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_2( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*x*x; - eval[npts * 1] = bf*x*y; - eval[npts * 2] = bf*x*z; - eval[npts * 3] = bf*y*y; - eval[npts * 4] = bf*y*z; - eval[npts * 5] = bf*z*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_2_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = x*(2*bf + bf_x*x); - eval_x[npts * 1] = y*(bf + bf_x*x); - eval_x[npts * 2] = z*(bf + bf_x*x); - eval_x[npts * 3] = bf_x*y*y; - eval_x[npts * 4] = bf_x*y*z; - eval_x[npts * 5] = bf_x*z*z; - - eval_y[npts * 0] = bf_y*x*x; - eval_y[npts * 1] = x*(bf + bf_y*y); - eval_y[npts * 2] = bf_y*x*z; - eval_y[npts * 3] = y*(2*bf + bf_y*y); - eval_y[npts * 4] = z*(bf + bf_y*y); - eval_y[npts * 5] = bf_y*z*z; - - eval_z[npts * 0] = bf_z*x*x; - eval_z[npts * 1] = bf_z*x*y; - eval_z[npts * 2] = x*(bf + bf_z*z); - eval_z[npts * 3] = bf_z*y*y; - eval_z[npts * 4] = y*(bf + bf_z*z); - eval_z[npts * 5] = z*(2*bf + bf_z*z); - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_3( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*x*x*x; - eval[npts * 1] = bf*x*x*y; - eval[npts * 2] = bf*x*x*z; - eval[npts * 3] = bf*x*y*y; - eval[npts * 4] = bf*x*y*z; - eval[npts * 5] = bf*x*z*z; - eval[npts * 6] = bf*y*y*y; - eval[npts * 7] = bf*y*y*z; - eval[npts * 8] = bf*y*z*z; - eval[npts * 9] = bf*z*z*z; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_3_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = x*x*(3*bf + bf_x*x); - eval_x[npts * 1] = x*y*(2*bf + bf_x*x); - eval_x[npts * 2] = x*z*(2*bf + bf_x*x); - eval_x[npts * 3] = y*y*(bf + bf_x*x); - eval_x[npts * 4] = y*z*(bf + bf_x*x); - eval_x[npts * 5] = z*z*(bf + bf_x*x); - eval_x[npts * 6] = bf_x*y*y*y; - eval_x[npts * 7] = bf_x*y*y*z; - eval_x[npts * 8] = bf_x*y*z*z; - eval_x[npts * 9] = bf_x*z*z*z; - - eval_y[npts * 0] = bf_y*x*x*x; - eval_y[npts * 1] = x*x*(bf + bf_y*y); - eval_y[npts * 2] = bf_y*x*x*z; - eval_y[npts * 3] = x*y*(2*bf + bf_y*y); - eval_y[npts * 4] = x*z*(bf + bf_y*y); - eval_y[npts * 5] = bf_y*x*z*z; - eval_y[npts * 6] = y*y*(3*bf + bf_y*y); - eval_y[npts * 7] = y*z*(2*bf + bf_y*y); - eval_y[npts * 8] = z*z*(bf + bf_y*y); - eval_y[npts * 9] = bf_y*z*z*z; - - eval_z[npts * 0] = bf_z*x*x*x; - eval_z[npts * 1] = bf_z*x*x*y; - 
eval_z[npts * 2] = x*x*(bf + bf_z*z); - eval_z[npts * 3] = bf_z*x*y*y; - eval_z[npts * 4] = x*y*(bf + bf_z*z); - eval_z[npts * 5] = x*z*(2*bf + bf_z*z); - eval_z[npts * 6] = bf_z*y*y*y; - eval_z[npts * 7] = y*y*(bf + bf_z*z); - eval_z[npts * 8] = y*z*(2*bf + bf_z*z); - eval_z[npts * 9] = z*z*(3*bf + bf_z*z); - -} - - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular( - const int32_t npts, - const int32_t l, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - if( l == 0 ) { - - collocation_cartesian_angular_0( npts, bf, x, y, z, eval ); - - } else if( l == 1 ) { - - collocation_cartesian_angular_1( npts, bf, x, y, z, eval ); - - } else if( l == 2 ) { - - collocation_cartesian_angular_2( npts, bf, x, y, z, eval ); - - } else if( l == 3 ) { - - collocation_cartesian_angular_3( npts, bf, x, y, z, eval ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_cartesian_angular - - -template -GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_deriv1( - const int32_t npts, - const int32_t l, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - - if( l == 0 ) { - - collocation_cartesian_angular_0( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_0_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 1 ) { - - collocation_cartesian_angular_1( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_1_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 2 ) { - - collocation_cartesian_angular_2( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_2_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 3 ) { - - collocation_cartesian_angular_3( npts, bf, x, y, z, eval ); - collocation_cartesian_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_cartesian_angular_deriv1 - - -} // namespace hip -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation/collocation_angular_spherical_unnorm.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation/collocation_angular_spherical_unnorm.hpp deleted file mode 100644 index 0c0c286..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation/collocation_angular_spherical_unnorm.hpp +++ /dev/null @@ -1,292 +0,0 @@ -#pragma once -#include "collocation_device_constants.hpp" -#include - -#ifndef GPGAUEVAL_INLINE -# define GPGAUEVAL_INLINE __noinline__ -#endif - -namespace GauXC { -namespace integrator { -namespace hip { - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_0( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_0_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf_x; - - eval_y[npts * 0] = bf_y; - - eval_z[npts * 0] = bf_z; - -} - -template -GPGAUEVAL_INLINE __device__ void 
collocation_spherical_unnorm_angular_1( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = bf*y; - eval[npts * 1] = bf*z; - eval[npts * 2] = bf*x; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_1_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = bf_x*y; - eval_x[npts * 1] = bf_x*z; - eval_x[npts * 2] = bf + bf_x*x; - - eval_y[npts * 0] = bf + bf_y*y; - eval_y[npts * 1] = bf_y*z; - eval_y[npts * 2] = bf_y*x; - - eval_z[npts * 0] = bf_z*y; - eval_z[npts * 1] = bf + bf_z*z; - eval_z[npts * 2] = bf_z*x; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_2( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = sqrt_3*bf*x*y; - eval[npts * 1] = sqrt_3*bf*y*z; - eval[npts * 2] = bf*(-x*x - y*y + 2*z*z)/2; - eval[npts * 3] = sqrt_3*bf*x*z; - eval[npts * 4] = sqrt_3*bf*(x*x - y*y)/2; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_2_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = sqrt_3*y*(bf + bf_x*x); - eval_x[npts * 1] = sqrt_3*bf_x*y*z; - eval_x[npts * 2] = -bf*x - bf_x*(x*x + y*y - 2*z*z)/2; - eval_x[npts * 3] = sqrt_3*z*(bf + bf_x*x); - eval_x[npts * 4] = sqrt_3*(bf*x + bf_x*(x*x - y*y)/2); - - eval_y[npts * 0] = sqrt_3*x*(bf + bf_y*y); - eval_y[npts * 1] = sqrt_3*z*(bf + bf_y*y); - eval_y[npts * 2] = -bf*y - bf_y*(x*x + y*y - 2*z*z)/2; - eval_y[npts * 3] = sqrt_3*bf_y*x*z; - eval_y[npts * 4] = sqrt_3*(-bf*y + bf_y*(x*x - y*y)/2); - - eval_z[npts * 0] = sqrt_3*bf_z*x*y; - eval_z[npts * 1] = sqrt_3*y*(bf + bf_z*z); - eval_z[npts * 2] = 2*bf*z - bf_z*(x*x + y*y - 2*z*z)/2; - eval_z[npts * 3] = sqrt_3*x*(bf + bf_z*z); - eval_z[npts * 4] = sqrt_3*bf_z*(x*x - y*y)/2; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3( - int32_t npts, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - eval[npts * 0] = sqrt_10*bf*y*(3*x*x - y*y)/4; - eval[npts * 1] = sqrt_15*bf*x*y*z; - eval[npts * 2] = sqrt_6*bf*y*(-x*x - y*y + 4*z*z)/4; - eval[npts * 3] = bf*z*(-3*x*x - 3*y*y + 2*z*z)/2; - eval[npts * 4] = sqrt_6*bf*x*(-x*x - y*y + 4*z*z)/4; - eval[npts * 5] = sqrt_15*bf*z*(x*x - y*y)/2; - eval[npts * 6] = sqrt_10*bf*x*(x*x - 3*y*y)/4; - -} - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3_deriv1( - const int32_t npts, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - eval_x[npts * 0] = sqrt_10*y*(6*bf*x + bf_x*(3*x*x - y*y))/4; - eval_x[npts * 1] = sqrt_15*y*z*(bf + bf_x*x); - eval_x[npts * 2] = -sqrt_6*y*(2*bf*x + bf_x*(x*x + y*y - 4*z*z))/4; - eval_x[npts * 3] = -z*(6*bf*x + bf_x*(3*x*x + 3*y*y - 2*z*z))/2; - eval_x[npts * 4] = -sqrt_6*(bf*(3*x*x + y*y - 4*z*z) + bf_x*x*(x*x + y*y - 4*z*z))/4; - eval_x[npts * 5] = sqrt_15*z*(2*bf*x + bf_x*(x*x - y*y))/2; - eval_x[npts * 6] = sqrt_10*(3*bf*(x*x - y*y) + bf_x*x*(x*x - 3*y*y))/4; - - eval_y[npts * 0] = 
sqrt_10*(-3*bf*(-x*x + y*y) + bf_y*y*(3*x*x - y*y))/4; - eval_y[npts * 1] = sqrt_15*x*z*(bf + bf_y*y); - eval_y[npts * 2] = -sqrt_6*(bf*(x*x + 3*y*y - 4*z*z) + bf_y*y*(x*x + y*y - 4*z*z))/4; - eval_y[npts * 3] = -z*(6*bf*y + bf_y*(3*x*x + 3*y*y - 2*z*z))/2; - eval_y[npts * 4] = -sqrt_6*x*(2*bf*y + bf_y*(x*x + y*y - 4*z*z))/4; - eval_y[npts * 5] = sqrt_15*z*(-2*bf*y + bf_y*(x*x - y*y))/2; - eval_y[npts * 6] = sqrt_10*x*(-6*bf*y + bf_y*(x*x - 3*y*y))/4; - - eval_z[npts * 0] = sqrt_10*bf_z*y*(3*x*x - y*y)/4; - eval_z[npts * 1] = sqrt_15*x*y*(bf + bf_z*z); - eval_z[npts * 2] = sqrt_6*y*(8*bf*z - bf_z*(x*x + y*y - 4*z*z))/4; - eval_z[npts * 3] = -3*bf*(x*x + y*y - 2*z*z)/2 - bf_z*z*(3*x*x + 3*y*y - 2*z*z)/2; - eval_z[npts * 4] = sqrt_6*x*(8*bf*z - bf_z*(x*x + y*y - 4*z*z))/4; - eval_z[npts * 5] = sqrt_15*(bf + bf_z*z)*(x*x - y*y)/2; - eval_z[npts * 6] = sqrt_10*bf_z*x*(x*x - 3*y*y)/4; - -} - - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular( - const int32_t npts, - const int32_t l, - const T bf, - const T x, - const T y, - const T z, - T* __restrict__ eval -) { - - if( l == 0 ) { - - collocation_spherical_unnorm_angular_0( npts, bf, x, y, z, eval ); - - } else if( l == 1 ) { - - collocation_spherical_unnorm_angular_1( npts, bf, x, y, z, eval ); - - } else if( l == 2 ) { - - collocation_spherical_unnorm_angular_2( npts, bf, x, y, z, eval ); - - } else if( l == 3 ) { - - collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_spherical_unnorm_angular - - -template -GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_deriv1( - const int32_t npts, - const int32_t l, - const T bf, - const T bf_x, - const T bf_y, - const T bf_z, - const T x, - const T y, - const T z, - T* __restrict__ eval, - T* __restrict__ eval_x, - T* __restrict__ eval_y, - T* __restrict__ eval_z -) { - - - if( l == 0 ) { - - collocation_spherical_unnorm_angular_0( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_0_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 1 ) { - - collocation_spherical_unnorm_angular_1( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_1_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 2 ) { - - collocation_spherical_unnorm_angular_2( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_2_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else if( l == 3 ) { - - collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); - collocation_spherical_unnorm_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); - - } else { - assert( false && "L < L_MAX" ); - } - -} // collocation_spherical_unnorm_angular_deriv1 - - -} // namespace hip -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation/collocation_device_constants.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation/collocation_device_constants.hpp deleted file mode 100644 index a8e43f9..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation/collocation_device_constants.hpp +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -namespace GauXC { -namespace integrator { -namespace hip { - - constexpr double sqrt_15 = 3.872983346207417; - constexpr double sqrt_3 = 1.7320508075688772; - constexpr double sqrt_6 = 2.449489742783178; - 
constexpr double sqrt_10 = 3.1622776601683795;
-
-} // namespace hip
-} // namespace integrator
-} // namespace GauXC
diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_radial.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_radial.hpp
deleted file mode 100644
index 4ed152c..0000000
--- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_radial.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-#include
-#include
-
-#include
-
-
-namespace GauXC {
-namespace integrator {
-namespace hip {
-
-__inline__ __device__ void collocation_device_radial_eval(
-  const Shell& shell,
-  const double* pt,
-  double* x,
-  double* y,
-  double* z,
-  double* eval_device
-) {
-
-  const auto* O = shell.O_data();
-  const auto* alpha = shell.alpha_data();
-  const auto* coeff = shell.coeff_data();
-
-  const double xc = pt[0] - O[0];
-  const double yc = pt[1] - O[1];
-  const double zc = pt[2] - O[2];
-  *x = xc;
-  *y = yc;
-  *z = zc;
-
-  const double rsq = xc*xc + yc*yc + zc*zc;
-
-  const uint32_t nprim = shell.nprim();
-  double tmp = 0.;
-  for( uint32_t i = 0; i < nprim; ++i )
-    tmp += coeff[i] * std::exp( - alpha[i] * rsq );
-
-  *eval_device = tmp;
-
-}
-
-
-
-__inline__ __device__ void collocation_device_radial_eval_deriv1(
-  const Shell& shell,
-  const double* pt,
-  double* x,
-  double* y,
-  double* z,
-  double* eval_device,
-  double* deval_device_x,
-  double* deval_device_y,
-  double* deval_device_z
-) {
-
-  const auto* O = shell.O_data();
-  const auto* alpha = shell.alpha_data();
-  const auto* coeff = shell.coeff_data();
-
-  const double xc = pt[0] - O[0];
-  const double yc = pt[1] - O[1];
-  const double zc = pt[2] - O[2];
-  *x = xc;
-  *y = yc;
-  *z = zc;
-
-  const double rsq = xc*xc + yc*yc + zc*zc;
-
-  const uint32_t nprim = shell.nprim();
-  double tmp = 0.;
-  double tmp_x = 0., tmp_y = 0., tmp_z = 0.;
-  for( uint32_t i = 0; i < nprim; ++i ) {
-
-    const double a = alpha[i];
-    const double e = coeff[i] * std::exp( - a * rsq );
-
-    const double ae = 2.
* a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - *eval_device = tmp; - *deval_device_x = tmp_x; - *deval_device_y = tmp_y; - *deval_device_z = tmp_z; - -} - -} // namespace hip -} // namespace integrator -} // namespace GauXC - - diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_device.hip b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_device.hip deleted file mode 100644 index 43d210a..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_device.hip +++ /dev/null @@ -1,367 +0,0 @@ -#include "hip/hip_runtime.h" -#include -#include -#include "exceptions/hip_exception.hpp" -#include - -#include "collocation_petite_kernels.hpp" -#include "collocation_masked_kernels.hpp" -#include "collocation_petite_combined_kernels.hpp" -#include "collocation_masked_combined_kernels.hpp" - -#include "device/hip/hip_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -template -void eval_collocation_petite( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - hipStream_t stream -) { - - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(collocation_device_petite_kernel), dim3(blocks), dim3(threads), 0, stream, nshells, nbf, npts, shells_device, offs_device, - pts_device, eval_device ); - -} - -template -void eval_collocation_petite( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - hipStream_t stream -); - - - - - - - - - -template -void eval_collocation_masked( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - hipStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(collocation_device_masked_kernel), dim3(blocks), dim3(threads), 0, stream, nshells, nbf, npts, shells_device, mask_device, - offs_device, pts_device, eval_device ); - -} - -template -void eval_collocation_masked( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - hipStream_t stream -); - - - - -template -void eval_collocation_petite_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - hipStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(collocation_device_petite_combined_kernel), dim3(blocks), dim3(threads), 0, stream, ntasks, device_tasks ); - -} - -template -void eval_collocation_petite_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - hipStream_t stream -); - - - - - - - - - - - - - - -template -void eval_collocation_masked_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* 
device_tasks, - hipStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(collocation_device_masked_combined_kernel), dim3(blocks), dim3(threads), 0, stream, ntasks, shells_device, device_tasks ); - -} - -template -void eval_collocation_masked_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - hipStream_t stream -); - - - - - - - - - - - -template -void eval_collocation_petite_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - hipStream_t stream -) { - - auto nmax_threads = util::hip_kernel_max_threads_per_block( - collocation_device_petite_kernel_deriv1 - ); - - dim3 threads(warp_size, nmax_threads/warp_size, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(collocation_device_petite_kernel_deriv1), dim3(blocks), dim3(threads), 0, stream, nshells, nbf, npts, shells_device, offs_device, - pts_device, eval_device, deval_device_x, deval_device_y, - deval_device_z ); - -} - -template -void eval_collocation_petite_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z, - hipStream_t stream -); - - - - - - - - - - - - - - - - -template -void eval_collocation_masked_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - hipStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts, threads.x ), - util::div_ceil( nshells, threads.y ) ); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(collocation_device_masked_kernel_deriv1), dim3(blocks), dim3(threads), 0, stream, nshells, nbf, npts, shells_device, mask_device, offs_device, - pts_device, eval_device, deval_device_x, deval_device_y, - deval_device_z ); - -} - -template -void eval_collocation_masked_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const double* pts_device, - double* eval_device, - double* deval_device_x, - double* deval_device_y, - double* deval_device_z, - hipStream_t stream -); - - - - -template -void eval_collocation_petite_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - hipStream_t stream -) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(collocation_device_petite_combined_kernel_deriv1), dim3(blocks), dim3(threads), 0, stream, ntasks, device_tasks ); - -} - -template -void eval_collocation_petite_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - hipStream_t stream -); - - - - - - - - - - - -template -void eval_collocation_masked_combined_deriv1( 
- size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - hipStream_t stream -) { - - auto nmax_threads = util::hip_kernel_max_threads_per_block( - collocation_device_masked_combined_kernel_deriv1 - ); - - dim3 threads(warp_size, 4, 1); - dim3 blocks( util::div_ceil( npts_max, threads.x ), - util::div_ceil( nshells_max, threads.y ), - ntasks ); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(collocation_device_masked_combined_kernel_deriv1), dim3(blocks), dim3(threads), 0, stream, ntasks, shells_device, device_tasks ); - -} - -template -void eval_collocation_masked_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - hipStream_t stream -); - - - - - - - - - - - - - -} // namespace hip -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_device.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_device.hpp deleted file mode 100644 index f599c5d..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_device.hpp +++ /dev/null @@ -1,109 +0,0 @@ -#pragma once -#include -#include - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -template -void eval_collocation_petite( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - hipStream_t stream -); - -template -void eval_collocation_masked( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - hipStream_t stream -); - -template -void eval_collocation_petite_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - hipStream_t stream -); - -template -void eval_collocation_masked_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* shells_device, - const size_t* mask_device, - const size_t* offs_device, - const T* pts_device, - T* eval_device, - T* deval_device_x, - T* deval_device_y, - T* deval_device_z, - hipStream_t stream -); - -template -void eval_collocation_petite_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - hipStream_t stream -); - -template -void eval_collocation_masked_combined( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - hipStream_t stream -); - - - -template -void eval_collocation_petite_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - XCTaskDevice* device_tasks, - hipStream_t stream -); - -template -void eval_collocation_masked_combined_deriv1( - size_t ntasks, - size_t npts_max, - size_t nshells_max, - Shell* shells_device, - XCTaskDevice* device_tasks, - hipStream_t stream -); - -} // namespace hip -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_masked_combined_kernels.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_masked_combined_kernels.hpp deleted file mode 100644 index ff0e3a0..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_masked_combined_kernels.hpp +++ /dev/null @@ 
-1,186 +0,0 @@ -#include "hip/hip_runtime.h" -#include -#include - -#include -#include - -#include "device/hip/collocation/collocation_angular_cartesian.hpp" -#include "device/hip/collocation/collocation_angular_spherical_unnorm.hpp" -#include "device/hip/hip_alg_variant_control.hpp" - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -template -__global__ -__launch_bounds__(1024,1) -void collocation_device_masked_combined_kernel( - size_t ntasks, - Shell* __restrict__ shells_device, - XCTaskDevice* __restrict__ device_tasks -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( blockIdx.z < ntasks ) { - - auto& task = device_tasks[ blockIdx.z ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ mask_device = task.shell_list; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* __restrict__ eval_device = task.bf; - - if( tid_x < npts and tid_y < nshells ) { - - const uint32_t ipt = tid_x; - const uint32_t ish = tid_y; - const uint32_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - auto tmp = 0.; - for( uint32_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } // shell / point idx check - - } // Batch idx check - -} - - - - - - - - - - - - - - - -template -__global__ -__launch_bounds__(1024,1) -void collocation_device_masked_combined_kernel_deriv1( - size_t ntasks, - Shell* __restrict__ shells_device, - XCTaskDevice* __restrict__ device_tasks -) { - - // DBWY: These are factored into the loop for this optimization - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( blockIdx.z < ntasks ) { - - auto& task = device_tasks[ blockIdx.z ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ mask_device = task.shell_list; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* __restrict__ eval_device = task.bf; - auto* __restrict__ deval_device_x = task.dbfx; - auto* __restrict__ deval_device_y = task.dbfy; - auto* __restrict__ deval_device_z = task.dbfz; - - if( tid_y < nshells and tid_x < npts ) { - - const uint32_t ish = tid_y; - const uint32_t ipt = tid_x; - const uint32_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - 
O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const uint32_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( uint32_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, - tmp_z, xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } // shell / point idx check - } // Batch idx check - - -} - -} // namespace hip -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_masked_kernels.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_masked_kernels.hpp deleted file mode 100644 index 0105571..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_masked_kernels.hpp +++ /dev/null @@ -1,158 +0,0 @@ -#include "hip/hip_runtime.h" -#include -#include - -#include - -#include "device/hip/collocation/collocation_angular_cartesian.hpp" -#include "device/hip/collocation/collocation_angular_spherical_unnorm.hpp" - -namespace GauXC { -namespace integrator { -namespace hip { - - -template -__global__ -__launch_bounds__(1024,1) -void collocation_device_masked_kernel( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ mask_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } - -} - - - - - - - - -template -__global__ -__launch_bounds__(1024,1) -void collocation_device_masked_kernel_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ mask_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device, - T* __restrict__ deval_device_x, - T* 
__restrict__ deval_device_y, - T* __restrict__ deval_device_z -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[mask_device[ish]]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } - - -} - -} // namespace hip -} // namespace integrator -} // namespace GauXC - diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_petite_combined_kernels.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_petite_combined_kernels.hpp deleted file mode 100644 index bcf2d25..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_petite_combined_kernels.hpp +++ /dev/null @@ -1,189 +0,0 @@ -#include "hip/hip_runtime.h" -#include -#include - -#include -#include - -#include "device/hip/collocation/collocation_angular_cartesian.hpp" -#include "device/hip/collocation/collocation_angular_spherical_unnorm.hpp" - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -template -__global__ -__launch_bounds__(1024,1) -void collocation_device_petite_combined_kernel( - size_t ntasks, - XCTaskDevice* __restrict__ device_tasks -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - const int batch_id = blockIdx.z; - - if( batch_id < ntasks ) { - - auto& task = device_tasks[ batch_id ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ shells_device = task.shells; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* eval_device = task.bf; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc 
= pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } // shell / point idx check - - } // Batch idx check - -} - - - - - - - - - - - - - - - -template -__global__ -__launch_bounds__(1024,1) -void collocation_device_petite_combined_kernel_deriv1( - size_t ntasks, - XCTaskDevice* __restrict__ device_tasks -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - const int batch_id = blockIdx.z; - - if( batch_id < ntasks ) { - - auto& task = device_tasks[ batch_id ]; - - const auto nshells = task.nshells; - const auto nbf = task.nbe; - const auto npts = task.npts; - const auto* __restrict__ shells_device = task.shells; - const auto* __restrict__ pts_device = task.points; - const auto* __restrict__ offs_device = task.shell_offs; - - auto* __restrict__ eval_device = task.bf; - auto* __restrict__ deval_device_x = task.dbfx; - auto* __restrict__ deval_device_y = task.dbfy; - auto* __restrict__ deval_device_z = task.dbfz; - - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. 
* a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, - tmp_z, xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } // shell / point idx check - - } // Batch idx check - - -} - -} // namespace hip -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_petite_kernels.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_petite_kernels.hpp deleted file mode 100644 index bd3bb80..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/collocation_petite_kernels.hpp +++ /dev/null @@ -1,163 +0,0 @@ -#include "hip/hip_runtime.h" -#include -#include - -#include - -#include "device/hip/collocation/collocation_angular_cartesian.hpp" -#include "device/hip/collocation/collocation_angular_spherical_unnorm.hpp" - -namespace GauXC { -namespace integrator { -namespace hip { - - - -template -__global__ -__launch_bounds__(1024,1) -void collocation_device_petite_kernel( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - for( size_t i = 0; i < nprim; ++i ) - tmp += coeff[i] * std::exp( - alpha[i] * rsq ); - - auto * bf_eval = eval_device + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - else - collocation_cartesian_angular( npts, shell.l(), tmp, xc, yc, zc, bf_eval ); - - } - -} - - - - - - - - - - - - - - - -template -__global__ -__launch_bounds__(1024,1) -void collocation_device_petite_kernel_deriv1( - size_t nshells, - size_t nbf, - size_t npts, - const Shell* __restrict__ shells_device, - const size_t* __restrict__ offs_device, - const T* __restrict__ pts_device, - T* __restrict__ eval_device, - T* __restrict__ deval_device_x, - T* __restrict__ deval_device_y, - T* __restrict__ deval_device_z -) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nshells ) { - - const size_t ipt = tid_x; - const size_t ish = tid_y; - - const size_t ibf = offs_device[ish]; - - const auto& shell = shells_device[ish]; - const auto* pt = pts_device + 
3*ipt; - - - const auto* __restrict__ O = shell.O_data(); - const auto* __restrict__ alpha = shell.alpha_data(); - const auto* __restrict__ coeff = shell.coeff_data(); - - const auto xc = pt[0] - O[0]; - const auto yc = pt[1] - O[1]; - const auto zc = pt[2] - O[2]; - - const auto rsq = xc*xc + yc*yc + zc*zc; - - const size_t nprim = shell.nprim(); - auto tmp = 0.; - auto tmp_x = 0., tmp_y = 0., tmp_z = 0.; - for( size_t i = 0; i < nprim; ++i ) { - - const auto a = alpha[i]; - const auto e = coeff[i] * std::exp( - a * rsq ); - - const auto ae = 2. * a * e; - - tmp += e; - tmp_x -= ae * xc; - tmp_y -= ae * yc; - tmp_z -= ae * zc; - - } - - auto * bf_eval = eval_device + ibf*npts + ipt; - auto * dx_eval = deval_device_x + ibf*npts + ipt; - auto * dy_eval = deval_device_y + ibf*npts + ipt; - auto * dz_eval = deval_device_z + ibf*npts + ipt; - - const bool do_sph = shell.pure(); - if( do_sph ) - collocation_spherical_unnorm_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - else - collocation_cartesian_angular_deriv1( npts, shell.l(), tmp, tmp_x, tmp_y, tmp_z, - xc, yc, zc, bf_eval, dx_eval, - dy_eval, dz_eval ); - - } - - -} - -} // namespace hip -} // namespace integrator -} // namespace GauXC diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/gauxc-hip.cmake b/third_party/gauxc/attic/src/new_integrator/device/hip/gauxc-hip.cmake deleted file mode 100644 index 04ed400..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/gauxc-hip.cmake +++ /dev/null @@ -1,43 +0,0 @@ -find_package( hipblas REQUIRED ) -#include( gauxc-cub ) - -target_sources( gauxc PRIVATE - # Common HIP Utilities - device/hip/collocation_device.hip - device/hip/xc_hip_data.cxx - device/hip/hip_weights.hip - device/hip/hip_pack_density.hip - device/hip/hip_eval_denvars.hip - device/hip/hipblas_extensions.hip - device/hip/hip_inc_potential.hip - device/hip/hip_device_properties.cxx - - # XC Specific - device/hip/hip_zmat.hip - - # Drivers - device/hip/local_work_replicated_incore_exc_vxc.cxx - -) - -#target_compile_features( gauxc PRIVATE hip_std_14 ) -#target_compile_options( gauxc -# PRIVATE -# $<$: -Xhipfe --diag_suppress=partial_override -Xptxas -v > -#) - - -if( GAUXC_ENABLE_MAGMA ) - - message( STATUS "MAGMA Has Been Enabled" ) - find_package( MAGMA REQUIRED ) - target_link_libraries( gauxc PUBLIC MAGMA::magma ) - -else() - - message( STATUS "MAGMA Has Been Explicitly Disabled" ) - -endif() - -target_link_libraries( gauxc PUBLIC roc::hipblas ) -#target_link_libraries( gauxc PRIVATE $ ) diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_alg_variant_control.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_alg_variant_control.hpp deleted file mode 100644 index 6b97465..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_alg_variant_control.hpp +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once - -//#define GAUXC_HIP_ENABLE_COLLOCATION_SHMEM_COPY -//#define GAUXC_HIP_ENABLE_COMPACT_COLLOCATION diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_device_properties.cxx b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_device_properties.cxx deleted file mode 100644 index 9789e70..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_device_properties.cxx +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include - -#include "hip_runtime.h" - -#include "device/hip/hip_device_properties.hpp" - -namespace GauXC { -namespace hip { - - -uint32_t 
get_submat_cut_block(int32_t LDA, int32_t device) { - int l2_cache_size; - hipDeviceGetAttribute(&l2_cache_size, hipDevAttrL2CacheSize, device); - - int l2_block_size = (int) sqrt(0.75 * ((double) l2_cache_size / 8)); - int min_block_size = LDA / max_submat_blocks; - - int block_size = std::max(l2_block_size, min_block_size); - block_size = std::min(block_size, LDA); - - return block_size; -} - -uint32_t get_device_sm_count(int32_t device) { - int num_sm; - hipDeviceGetAttribute(&num_sm, hipDevAttrMultiProcessorCount, device); - - return num_sm; -} - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_device_properties.hip b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_device_properties.hip deleted file mode 100644 index 3cb9caf..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_device_properties.hip +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include - -#include "hip/hip_runtime.h" - -#include "device/hip/hip_device_properties.hpp" - -namespace GauXC { -namespace hip { - - -uint32_t get_submat_cut_block(int32_t LDA, int32_t device) { - int l2_cache_size; - hipDeviceGetAttribute(&l2_cache_size, hipDeviceAttributeL2CacheSize, device); - - int l2_block_size = (int) sqrt(0.75 * ((double) l2_cache_size / 8)); - int min_block_size = LDA / max_submat_blocks; - - int block_size = std::max(l2_block_size, min_block_size); - block_size = std::min(block_size, LDA); - - return block_size; -} - -uint32_t get_device_sm_count(int32_t device) { - int num_sm; - hipDeviceGetAttribute(&num_sm, hipDeviceAttributeMultiprocessorCount, device); - - return num_sm; -} - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_eval_denvars.hip b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_eval_denvars.hip deleted file mode 100644 index 968e922..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_eval_denvars.hip +++ /dev/null @@ -1,283 +0,0 @@ -#include "hip/hip_runtime.h" -#include "hip_eval_denvars.hpp" -#include "hip_extensions.hpp" -#include - -#include "hip_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace hip { -using namespace GauXC::hip; - -template -__global__ void eval_uvars_lda_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.nbe; - - auto* den_eval_device = task.den; - - const auto* basis_eval_device = task.bf; - - const auto* den_basis_prod_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - double den_reg = 0.; - - if( tid_x < nbf and tid_y < npts ) { - - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; - - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; - - } - - // Warp blocks are stored col major - den_reg = 2 * warpReduceSum( den_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - } - - -} - -template -__global__ void eval_uvars_gga_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.nbe; - - auto* den_eval_device = task.den; - auto* den_x_eval_device = task.ddenx; - auto* den_y_eval_device = 
task.ddeny; - auto* den_z_eval_device = task.ddenz; - - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - - const auto* den_basis_prod_device = task.zmat; - - - for( int ipt = blockIdx.y * blockDim.y + threadIdx.y; - ipt < npts; - ipt += blockDim.y * gridDim.y ) { - - double den = 0.; - double dx = 0.; - double dy = 0.; - double dz = 0.; - - for( int ibf_st = 0; ibf_st < nbf; ibf_st += warp_size ) { - - double den_reg = 0.; - double dx_reg = 0.; - double dy_reg = 0.; - double dz_reg = 0.; - - int ibf = ibf_st + threadIdx.x; - if( ibf < nbf ) { - const double* bf_col = basis_eval_device + ibf*npts; - const double* bf_x_col = dbasis_x_eval_device + ibf*npts; - const double* bf_y_col = dbasis_y_eval_device + ibf*npts; - const double* bf_z_col = dbasis_z_eval_device + ibf*npts; - const double* db_col = den_basis_prod_device + ibf*npts; - - den_reg = bf_col[ ipt ] * db_col[ ipt ]; - dx_reg = bf_x_col[ ipt ] * db_col[ ipt ]; - dy_reg = bf_y_col[ ipt ] * db_col[ ipt ]; - dz_reg = bf_z_col[ ipt ] * db_col[ ipt ]; - } - - den += 2 * warpReduceSum( den_reg ); - dx += 4 * warpReduceSum( dx_reg ); - dy += 4 * warpReduceSum( dy_reg ); - dz += 4 * warpReduceSum( dz_reg ); - - } - - if( threadIdx.x == 0 ) { - den_eval_device [ipt] = den; - den_x_eval_device [ipt] = dx ; - den_y_eval_device [ipt] = dy ; - den_z_eval_device [ipt] = dz ; - } - //__sync_warp(); - - } - - -/* - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - double den_reg = 0.; - double dx_reg = 0.; - double dy_reg = 0.; - double dz_reg = 0.; - - if( tid_x < nbf and tid_y < npts ) { - - const double* bf_col = basis_eval_device + tid_x*npts; - const double* bf_x_col = dbasis_x_eval_device + tid_x*npts; - const double* bf_y_col = dbasis_y_eval_device + tid_x*npts; - const double* bf_z_col = dbasis_z_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; - - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; - dx_reg = bf_x_col[ tid_y ] * db_col[ tid_y ]; - dy_reg = bf_y_col[ tid_y ] * db_col[ tid_y ]; - dz_reg = bf_z_col[ tid_y ] * db_col[ tid_y ]; - - } - - // Warp blocks are stored col major - den_reg = 2 * warpReduceSum( den_reg ); - dx_reg = 4 * warpReduceSum( dx_reg ); - dy_reg = 4 * warpReduceSum( dy_reg ); - dz_reg = 4 * warpReduceSum( dz_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - atomicAdd( den_x_eval_device + tid_y, dx_reg ); - atomicAdd( den_y_eval_device + tid_y, dy_reg ); - atomicAdd( den_z_eval_device + tid_y, dz_reg ); - } -*/ - - -} - - -template -__global__ void eval_vvars_gga_kernel( - size_t npts, - const T* den_x_eval_device, - const T* den_y_eval_device, - const T* den_z_eval_device, - T* gamma_eval_device -) { - - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - if( tid < npts ) { - - const double dx = den_x_eval_device[ tid ]; - const double dy = den_y_eval_device[ tid ]; - const double dz = den_z_eval_device[ tid ]; - - gamma_eval_device[tid] = dx*dx + dy*dy + dz*dz; - - } - -} - - - - -template -void eval_uvars_lda_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( max_nbf , threads.x ), - util::div_ceil( max_npts , threads.y ), - ntasks ); - - 
hipLaunchKernelGGL(eval_uvars_lda_kernel, dim3(blocks), dim3(threads), 0, stream , ntasks, tasks_device ); - -} - -template -void eval_uvars_gga_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ) { - - dim3 threads(warp_size, max_warps_per_thread_block, 1); - dim3 blocks( 1, 8, ntasks ); - - hipLaunchKernelGGL(eval_uvars_gga_kernel, dim3(blocks), dim3(threads), 0, stream , ntasks, tasks_device ); - -} - - -template -void eval_vvars_gga_device( size_t npts, - const T* den_x_device, - const T* den_y_device, - const T* den_z_device, - T* gamma_device, - hipStream_t stream ) { - - dim3 threads( max_threads_per_thread_block ); - dim3 blocks( util::div_ceil( npts, threads.x ) ); - - hipLaunchKernelGGL(eval_vvars_gga_kernel, dim3(blocks), dim3(threads), 0, stream , - npts, den_x_device, den_y_device, den_z_device, gamma_device - ); - -} - - - - - - - - - - - - - - - -template -void eval_uvars_lda_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ); - -template -void eval_uvars_gga_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ); - -template -void eval_vvars_gga_device( size_t npts, - const double* den_x_device, - const double* den_y_device, - const double* den_z_device, - double* gamma_device, - hipStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_eval_denvars.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_eval_denvars.hpp deleted file mode 100644 index f5e6634..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_eval_denvars.hpp +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -template -void eval_uvars_lda_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ); - -template -void eval_uvars_gga_device( size_t ntasks, - size_t max_nbf, - size_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ); - - -template -void eval_vvars_gga_device( size_t npts, - const T* den_x_device, - const T* den_y_device, - const T* den_z_device, - T* gamma_device, - hipStream_t stream ); - - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_extensions.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_extensions.hpp deleted file mode 100644 index d45ce94..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_extensions.hpp +++ /dev/null @@ -1,110 +0,0 @@ -#include "hip/hip_runtime.h" -#pragma once -#include -#include -#include "device/hip/hip_device_properties.hpp" - -#define GAUXC_ENABLE_WARP_REDUCTIONS - -namespace GauXC { -namespace hip { - -__inline__ __device__ -double warpReduceSum(double val) { - -#ifdef GAUXC_ENABLE_WARP_REDUCTIONS - - for(int i=(warp_size/2); i>=1; i/=2) - val += __shfl_xor_sync(0xffffffff, val, i, warp_size); - -#else - - using warp_reducer = hipcub::WarpReduce; - static __shared__ typename warp_reducer::TempStorage temp_storage[max_warps_per_thread_block]; - int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - int warp_lane = tid / warp_size; - val = warp_reducer( temp_storage[warp_lane] ).Sum( val ); - -#endif - - return val; -} - -__inline__ __device__ -double warpReduceProd(double val) { - for(int i=(warp_size/2); i>=1; i/=2) - val *= 
__shfl_xor_sync(0xffffffff, val, i, warp_size); - return val; -} - -#if 0 -__inline__ __device__ -double blockReduceSum( double val ) { - - static __shared__ double shared[32]; - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; - - val = warpReduceSum( val ); - - if( lane == 0 ) shared[wid] = val; - - __syncthreads(); - - val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0; - if( wid == 0 ) val = warpReduceSum( val ); - - return val; - -} - -template -__inline__ __device__ T warp_prod_reduce( T val ) { - - for( int i = warp_size / 2; i >= 1; i /= 2 ) - val *= __shfl_xor_sync( 0xffffffff, val, i, warp_size ); - - return val; - -} - -template -__inline__ __device__ T block_prod_reduce( T val ) { - - static __shared__ T shared[32]; - const int lane = threadIdx.x % 32; - const int wid = threadIdx.x / 32; - - val = warp_prod_reduce( val ); - - if( lane == 0 ) shared[ wid ] = val; - __syncthreads(); - - val = ( threadIdx.x < blockDim.x / 32 ) ? shared[ lane ] : 0; - if( wid == 0 ) val = warp_prod_reduce( val ); - - return val; - -} - -__inline__ __device__ double atomicMul(double* address, double val) -{ - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val * - __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_inc_potential.hip b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_inc_potential.hip deleted file mode 100644 index 51e5ca7..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_inc_potential.hip +++ /dev/null @@ -1,124 +0,0 @@ -#include "hip/hip_runtime.h" -#include "device/hip/hip_inc_potential.hpp" -#include "device/hip/hip_device_properties.hpp" -#include - -#include "device/hip/hip_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - - -#define WARP_X 16 -#define WARP_Y 1 -#define UNROLL_FACTOR 4 -#define EFF_UNROLL 4 -#define CUT_X 8 -#define CUT_Y 8 - - -template -__global__ __launch_bounds__(1024, 1) -void inc_by_submat_combined_kernel( size_t ntasks, - XCTaskDevice* device_tasks, - T* A, - size_t LDA, - const int block_y, - const int block_x ) { - - const int batch_id = blockIdx.z; - auto& task = device_tasks[ batch_id ]; - - const auto* submat_cut_device = task.submat_cut; - const auto* submat_block_device = task.submat_block; - const auto LDAS = task.nbe; - auto* ASmall_device = task.nbe_scr; - - //if( LDAS == LDAB ) return; - const int tid_xx = threadIdx.x % WARP_X; - const int tid_xy = threadIdx.x / WARP_X; - - const int tid_yx = threadIdx.y % CUT_X; - const int tid_yy = threadIdx.y / CUT_X; - - const int start_cut_y = submat_block_device[block_y]; - const int end_cut_y = submat_block_device[block_y+1]; - const int start_cut_x = submat_block_device[block_x]; - const int end_cut_x = submat_block_device[block_x+1]; - - for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) { - const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut)); - const int i_cut_first = i_data.x; - const int delta_i = i_data.y; - const int i_cut_small = i_data.z; - - for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) { - const int3 j_data = 
*((int3*)(submat_cut_device + 3*j_cut)); - const int j_cut_first = j_data.x; - const int delta_j = j_data.y; - const int j_cut_small = j_data.z; - - auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS; - auto* ABig_begin = A + i_cut_first + j_cut_first*LDA; - - int J; - for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - - double val[UNROLL_FACTOR]; - double* address[UNROLL_FACTOR]; -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - val[k] = ASmall_begin[I + (J+k*WARP_Y)*LDAS]; - address[k] = ABig_begin + I + (J+k*WARP_Y)*LDA; - } -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - atomicAdd(address[k], val[k] ); - } - } - } - - for ( ; J < delta_j; J += WARP_Y) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - atomicAdd(ABig_begin + I + J*LDA, ASmall_begin[I + J*LDAS] ); - } - } - - } - } -} - - -template -void task_inc_potential( size_t ntasks, - XCTaskDevice* device_tasks, - T* V_device, - size_t LDV, - hipStream_t stream ) { - dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks); - - const int submat_block_size = get_submat_cut_block(LDV, 0); - for (int i = 0; i < util::div_ceil(LDV, submat_block_size); i++) { - for (int j = 0; j < util::div_ceil(LDV, submat_block_size); j++) { - hipLaunchKernelGGL(inc_by_submat_combined_kernel, dim3(blocks), dim3(threads), 0, stream , - ntasks, device_tasks, V_device, LDV, i, j - ); - } - } -} - -template -void task_inc_potential( size_t ntasks, - XCTaskDevice* device_tasks, - double* V_device, - size_t LDV, - hipStream_t stream ); - -} -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_inc_potential.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_inc_potential.hpp deleted file mode 100644 index 508d727..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_inc_potential.hpp +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -template -void task_inc_potential( size_t ntasks, - XCTaskDevice* device_tasks, - T* V_device, - size_t LDV, - hipStream_t stream ); - -} -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_pack_density.hip b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_pack_density.hip deleted file mode 100644 index 70fbf05..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_pack_density.hip +++ /dev/null @@ -1,128 +0,0 @@ -#include "hip/hip_runtime.h" -#include "device/hip/hip_pack_density.hpp" -#include "device/hip/hip_device_properties.hpp" -#include - -#include "device/hip/hip_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -#define WARP_X 16 -#define WARP_Y 1 -#define UNROLL_FACTOR 4 -#define EFF_UNROLL 4 -#define CUT_X 8 -#define CUT_Y 8 - -template -__global__ __launch_bounds__(1024, 1) -void submat_set_combined_kernel( size_t ntasks, - XCTaskDevice* device_tasks, - T* A, - size_t LDA, - const int block_y, - const int block_x) { - - - const int batch_id = blockIdx.z; - auto& task = device_tasks[ batch_id ]; - - const auto* submat_cut_device = task.submat_cut; - const auto* submat_block_device = task.submat_block; - const auto LDAS = task.nbe; - auto* ASmall_device = task.nbe_scr; - - //if( LDAS == LDAB ) return; - - const int tid_xx = threadIdx.x % WARP_X; - const int tid_xy = threadIdx.x / WARP_X; - - 
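For readers of this hunk: submat_set_combined_kernel here and inc_by_submat_combined_kernel above both walk the per-task submat_cut list, where each triple appears to hold (offset in the full matrix, block extent, offset in the packed per-task matrix); the submat_block array only chunks that list for the GPU tiling. A simplified serial sketch of the same gather and scatter-add, written for this note and not taken from the GauXC sources, is:

```cpp
#include <vector>

// Serial reference for the "pack" and "inc" kernels in this hunk.
// Each cut is (first, delta, small): offset in the full column-major matrix,
// block extent, and offset in the packed per-task matrix, respectively.
struct Cut { int first, delta, small; };

// Pack: gather blocks of the full matrix A (leading dim LDA) into the
// compressed per-task matrix ASmall (leading dim LDAS).
void pack_submat(const std::vector<Cut>& cuts, const double* A, int LDA,
                 double* ASmall, int LDAS) {
  for (const auto& ci : cuts)
    for (const auto& cj : cuts)
      for (int J = 0; J < cj.delta; ++J)
        for (int I = 0; I < ci.delta; ++I)
          ASmall[(ci.small + I) + (cj.small + J) * LDAS] =
              A[(ci.first + I) + (cj.first + J) * LDA];
}

// Unpack/accumulate: scatter-add the packed result back into the full matrix.
void inc_submat(const std::vector<Cut>& cuts, const double* ASmall, int LDAS,
                double* A, int LDA) {
  for (const auto& ci : cuts)
    for (const auto& cj : cuts)
      for (int J = 0; J < cj.delta; ++J)
        for (int I = 0; I < ci.delta; ++I)
          A[(ci.first + I) + (cj.first + J) * LDA] +=
              ASmall[(ci.small + I) + (cj.small + J) * LDAS];
}
```

The GPU versions add the WARP_X/CUT_X tiling and, on the increment path, atomicAdd so that concurrent tasks can update the same global matrix safely.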
const int tid_yx = threadIdx.y % CUT_X; - const int tid_yy = threadIdx.y / CUT_X; - - const int start_cut_y = submat_block_device[block_y]; - const int end_cut_y = submat_block_device[block_y+1]; - const int start_cut_x = submat_block_device[block_x]; - const int end_cut_x = submat_block_device[block_x+1]; - - for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) { - const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut)); - const int i_cut_first = i_data.x; - const int delta_i = i_data.y; - const int i_cut_small = i_data.z; - - for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) { - const int3 j_data = *((int3*)(submat_cut_device + 3*j_cut)); - const int j_cut_first = j_data.x; - const int delta_j = j_data.y; - const int j_cut_small = j_data.z; - - auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS; - auto* ABig_begin = A + i_cut_first + j_cut_first*LDA; - - int J; - for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - - double val[UNROLL_FACTOR]; - double* address[UNROLL_FACTOR]; -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - val[k] = ABig_begin[I + (J + k*WARP_Y)*LDA]; - address[k] = ASmall_begin + I + (J + k*WARP_Y) * LDAS; - } -#pragma unroll - for (int k = 0; k < UNROLL_FACTOR; k++) { - // Suggest that the result be evicted first. -#if (HIPRT_VERSION >= 11000) - __stcs(address[k], val[k]); -#else - asm ("st.global.cs.f64 [%0], %1;" :: "l"(address[k]), "d"(val[k])); -#endif - } - } - } - - for ( ; J < delta_j; J += WARP_Y) { - for( int I = tid_xx; I < delta_i; I += WARP_X ) { - ASmall_begin[I + J*LDAS] = ABig_begin[I + J*LDA]; - } - } - } - } -} - - -template -void task_pack_density_matrix( size_t ntasks, - XCTaskDevice* device_tasks, - T* P_device, - size_t LDP, - hipStream_t stream ) { - - dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks); - - const int submat_block_size = get_submat_cut_block(LDP, 0); - for (int i = 0; i < util::div_ceil(LDP, submat_block_size); i++) { - for (int j = 0; j < util::div_ceil(LDP, submat_block_size); j++) { - hipLaunchKernelGGL(submat_set_combined_kernel, dim3(blocks), dim3(threads), 0, stream , - ntasks, device_tasks, P_device, LDP, i, j - ); - } - } -} - -template -void task_pack_density_matrix( size_t ntasks, - XCTaskDevice* device_tasks, - double* P_device, - size_t LDP, - hipStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_pack_density.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_pack_density.hpp deleted file mode 100644 index a3466a8..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_pack_density.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -template -void task_pack_density_matrix( size_t ntasks, - XCTaskDevice* device_tasks, - T* P_device, - size_t LDP, - hipStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_weights.hip b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_weights.hip deleted file mode 100644 index 9a47d69..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_weights.hip +++ /dev/null @@ -1,642 +0,0 @@ -#include "hip/hip_runtime.h" -#include - -#include "device/hip/hip_weights.hpp" -#include "common/integrator_constants.hpp" -#include "device/hip/hip_extensions.hpp" 
-#include "device/hip/hip_device_properties.hpp" - -constexpr double eps_d = std::numeric_limits::epsilon(); - - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -__global__ void reciprocal_kernel(size_t length, double* vec) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < length; i += blockDim.x * gridDim.x) { - vec[i] = 1. / vec[i]; - } -} - -__global__ void compute_point_center_dist( - size_t npts, - size_t LDatoms, - size_t natoms, - const double* coords, - const double* points, - double* dist -) { - - __shared__ double3 point_buffer[warp_size]; - double3 coord_reg; - - const int natoms_block = (natoms + warp_size-1) / warp_size; - const int coords_block = (npts + warp_size-1) / warp_size; - - const double3* coords_vec = (double3*) coords; - const double3* points_vec = (double3*) points; - - for (int j = blockIdx.x; j < natoms_block; j += gridDim.x) { - const int iAtom = j * warp_size + threadIdx.x; - // Load blocks into registers/shared memory - if (iAtom < natoms) { - coord_reg = coords_vec[iAtom]; - } - for (int i = blockIdx.y; i < coords_block; i += gridDim.y) { - const int iPt_load = i * warp_size + threadIdx.x; - if (iPt_load < npts) { - point_buffer[threadIdx.x] = points_vec[iPt_load]; - } - __syncthreads(); - - // do the computation - #pragma unroll 2 - for (int k = threadIdx.y; k < warp_size; k+=warp_size/2) { - const int iPt_sm = k; - const int iPt = i * warp_size + iPt_sm; - const double rx = point_buffer[iPt_sm].x - coord_reg.x; - const double ry = point_buffer[iPt_sm].y - coord_reg.y; - const double rz = point_buffer[iPt_sm].z - coord_reg.z; - - if (iAtom < natoms and iPt < npts) { - dist[ iAtom + iPt * LDatoms ] = std::sqrt( rx*rx + ry*ry + rz*rz ); - } - } - __syncthreads(); - } - } -} - -#if 0 -__global__ void modify_weights_becke_kernel( - size_t npts, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - double* weights_device -) { - - // Becke partition functions - auto hBecke = [](double x) {return 1.5 * x - 0.5 * x * x * x;}; // Eq. 19 - auto gBecke = [&](double x) {return hBecke(hBecke(hBecke(x)));}; // Eq. 20 f_3 - - - __shared__ double shared[2048]; - for( int ipt = blockIdx.x; ipt < npts; ipt += gridDim.x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * natoms; - for( int iCenter = threadIdx.y; iCenter < natoms; iCenter += blockDim.y ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * natoms; - - double ps = 1.; - for( int jCenter = threadIdx.x; jCenter < natoms; jCenter += blockDim.x ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - const double s = 0.5 * ( 1. - gBecke( mu ) ); - - ps *= (iCenter == jCenter) ? 1. 
: s ; - - } - - ps = warp_prod_reduce( ps ); // XXX: Assumes blockDim.x == 32 - - if( iCenter == iParent ) parent_weight = ps; - - sum += ps; - - } - - // XXX: Assumes blockDim.x == blockDim.y == 32 - if( threadIdx.x == 0 ) { - shared[ threadIdx.y ] = sum; - shared[ threadIdx.y + 1024] = parent_weight; - } - - __syncthreads(); - sum = shared[ threadIdx.x ]; - sum = warpReduceSum( sum ); - - __syncthreads(); - parent_weight = shared[ threadIdx.x + 1024]; - parent_weight = __shfl_sync(0xffffffff, parent_weight, iParent % 32, 32 ); - - if( threadIdx.x == 0 and threadIdx.y == 0 ) - weights_device[ipt] *= parent_weight / sum; - - - } - - -} - - - -__global__ void modify_weights_ssf_kernel( - size_t npts, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - const double* dist_nearest_device, - double* weights_device -) { - - // Frisch partition functions - auto gFrisch = [](double x) { - - const double s_x = x / magic_ssf_factor<>; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; - }; - - auto sFrisch = [&] (double x) { - const double g = 0.5 * (1. - gFrisch(x)); - return (x >= magic_ssf_factor<>) ? 0. : (x <= -magic_ssf_factor<>) ? 1. : g; - }; - - constexpr double weight_tol = 1e-10; - - __shared__ double shared[2048]; - for( int ipt = blockIdx.x; ipt < npts; ipt += gridDim.x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * natoms; - const double dist_cutoff = 0.5 * (1 - magic_ssf_factor<> ) * - dist_nearest_device[ipt]; - if( local_dist_scratch[iParent] < dist_cutoff ) continue; - - for( int iCenter = threadIdx.y; iCenter < natoms; iCenter += blockDim.y ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * natoms; - - double ps = 1.; - for( int jCenter = threadIdx.x; jCenter < natoms; jCenter += blockDim.x ) - if( fabs(ps) > weight_tol ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - const double s = sFrisch( mu ); - ps *= (iCenter == jCenter) ? 1. 
: s ; - - } - - ps = warp_prod_reduce( ps ); // XXX: Assumes blockDim.x == 32 - - if( iCenter == iParent ) parent_weight = ps; - - sum += ps; - - } - - // XXX: Assumes blockDim.x == blockDim.y == 32 - if( threadIdx.x == 0 ) { - shared[ threadIdx.y ] = sum; - shared[ threadIdx.y + 1024] = parent_weight; - } - - __syncthreads(); - sum = shared[ threadIdx.x ]; - sum = warpReduceSum( sum ); - - __syncthreads(); - parent_weight = shared[ threadIdx.x + 1024]; - parent_weight = __shfl_sync(0xffffffff, parent_weight, iParent % 32, 32 ); - - if( threadIdx.x == 0 and threadIdx.y == 0 ) - weights_device[ipt] *= parent_weight / sum; - - - } - - -} -#endif - -// SIMT over points: 1D kernel -__global__ void modify_weights_ssf_kernel_1d( - size_t npts, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - const double* dist_nearest_device, - double* weights_device -) { - - // Frisch partition functions - auto gFrisch = [](double x) { - - const double s_x = x / magic_ssf_factor<>; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; - }; - -#if 0 - auto sFrisch = [&] (double x) { - const double g = 0.5 * (1. - gFrisch(x)); - return (x >= magic_ssf_factor<>) ? 0. : (x <= -magic_ssf_factor<>) ? 1. : g; - }; -#else - auto sFrisch = [&] (double x) { - if( fabs(x) < magic_ssf_factor<> ) return 0.5 * (1. - gFrisch(x)); - else if( x >= magic_ssf_factor<> ) return 0.; - else return 1.; - }; -#endif - - constexpr double weight_tol = 1e-10; - - const int tid_x = threadIdx.x + blockIdx.x * blockDim.x; - const int nt_x = blockDim.x * gridDim.x; - - //__shared__ double shared[2048]; - for( int ipt = tid_x; ipt < npts; ipt += nt_x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * natoms; - const double dist_cutoff = 0.5 * (1 - magic_ssf_factor<> ) * - dist_nearest_device[ipt]; - if( local_dist_scratch[iParent] < dist_cutoff ) continue; - -#if 0 - for( int iCenter = 0; iCenter < natoms; iCenter++ ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * natoms; - - double ps = 1.; - for( int jCenter = 0; jCenter < natoms; jCenter++ ) - if( fabs(ps) > weight_tol ) { - if( iCenter != jCenter ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - ps *= sFrisch( mu ); - - } - } else break; - - //__syncwarp(); - - if( iCenter == iParent ) parent_weight = ps; - - sum += ps; - - } -#else - - // Do iParent First - { - - const double ri = local_dist_scratch[ iParent ]; - const double* const local_rab = RAB + iParent * natoms; - - parent_weight = 1.; - for( int jCenter = 0; jCenter < natoms; jCenter++ ) - if( parent_weight > weight_tol ) { - if( iParent != jCenter ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - parent_weight *= sFrisch( mu ); - - } - } else break; - - //__syncwarp(); - sum += parent_weight; - - } - - if( parent_weight < eps_d ) { - weights_device[ipt] = 0.; - continue; - } - - for( int iCenter = 0; iCenter < natoms; iCenter++ ) - if( iParent != iCenter ) { - - const double ri = local_dist_scratch[ iCenter ]; - - const double* const local_rab = RAB + iCenter * 
natoms; - - double ps = 1.; - for( int jCenter = 0; jCenter < natoms; jCenter++ ) - if( ps > weight_tol ) { - if( iCenter != jCenter ) { - - const double rj = local_dist_scratch[ jCenter ]; - - const double mu = (ri - rj) / local_rab[ jCenter ]; // XXX: RAB is symmetric - ps *= sFrisch( mu ); - - } - } else break; - - //__syncwarp(); - sum += ps; - - } - -#endif - - weights_device[ipt] *= parent_weight / sum; - - - } - - -} - -__device__ __inline__ double gFrisch(double x) { - // Frisch partition functions -// const double s_x = x / magic_ssf_factor<>; - const double s_x = x * 1.5625; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return ((35.) *(s_x - s_x3) + (21.) *s_x5 - (5.) *s_x7); -} - - -__device__ __inline__ double sFrisch(double x) { - //double frisch_val = (0.5 - (0.5/ 16.0) * gFrisch(x)); - - if( fabs(x) < magic_ssf_factor<> ) return (0.5 - (0.5/ 16.0) * gFrisch(x)); - else if( x >= magic_ssf_factor<> ) return 0.; - else return 1.; -} - -__global__ __launch_bounds__(weight_thread_block, weight_thread_block_per_sm) -void modify_weights_ssf_kernel_2d( - size_t npts, - size_t LDatoms, - size_t natoms, - const double* RAB, - const double* coords, - const double* dist_scratch, - const int32_t* iparent_device, - const double* dist_nearest_device, - double* weights_device -) { - constexpr double weight_tol = 1e-10; - int natom_block = ((natoms + blockDim.x - 1) / blockDim.x) * blockDim.x; - - const int tid_x = threadIdx.y + blockIdx.y * blockDim.y; - const int nt_x = blockDim.y * gridDim.y; - - __shared__ int jCounter_sm[max_warps_per_thread_block]; - int* jCounter = reinterpret_cast(jCounter_sm) + threadIdx.y; - - // Each warp will work together on a point - for( int ipt = tid_x; ipt < npts; ipt += nt_x ) { - - const auto iParent = iparent_device[ipt]; - - double sum = 0.; - double parent_weight = 0.; - - const double* const local_dist_scratch = dist_scratch + ipt * LDatoms; - const double dist_cutoff = 0.5 * (1 - magic_ssf_factor<> ) * - dist_nearest_device[ipt]; - if( local_dist_scratch[iParent] < dist_cutoff ) continue; - - // Do iParent First - { - - const double ri = local_dist_scratch[ iParent ]; - const double* const local_rab = RAB + iParent * LDatoms; - - parent_weight = 1.; - for( int jCenter = threadIdx.x; jCenter < natom_block; jCenter+=blockDim.x ) { - double contribution = 1.0; - if (jCenter < natoms && iParent != jCenter) { - const double rj = local_dist_scratch[ jCenter ]; - const double mu = (ri - rj) * local_rab[ jCenter ]; // XXX: RAB is symmetric - contribution = sFrisch( mu ); - } - contribution = warpReduceProd(contribution); - parent_weight *= contribution; - - if (parent_weight < weight_tol) break; - } - } - - if( parent_weight < eps_d ) { - if (threadIdx.x == 0) - weights_device[ipt] = 0.; - __syncwarp(); - continue; - } - - // Initialize each counter to 0 - if (threadIdx.x == 0) { - jCounter[0] = 0; - } - __syncwarp(); - - // Each thread will process an iCenter. Atomic operations are used to assign - // an iCenter value to each thread. 
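The dynamic assignment mentioned in the comment above amounts to a shared-counter work queue. The following host-side analogue, using std::atomic in place of the shared-memory atomicAdd and written only for illustration, captures the assignment rule (parent atom skipped, -1 when no centers remain):

```cpp
#include <atomic>

// Host-side analogue of the per-warp work queue used in this kernel: each
// thread pulls the next atom index from a shared counter and skips the
// parent atom. In the HIP kernel the counter lives in shared memory and
// atomicAdd plays this role.
int next_center(std::atomic<int>& counter, int iParent, int natoms) {
  int iCenter = counter.fetch_add(1);        // grab the next unprocessed center
  if (iCenter >= iParent) ++iCenter;         // iCenter == iParent is skipped
  return (iCenter < natoms) ? iCenter : -1;  // -1 signals "no work left"
}
```

In the kernel, a thread that finishes its current center (weight below tolerance, or all jCenter contributions seen) immediately pulls the next one, which keeps the warp busy even when centers screen out at very different rates.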
- int iCenter = atomicAdd(jCounter, 1); - if (iCenter >= iParent) iCenter++; // iCenter == iParent is skipped - - // The entire warp processes the same jCenter value at the same time - int jCenter = 0; - - const double* local_rab = RAB + iCenter * LDatoms; - double ri = local_dist_scratch[ iCenter ]; - double ps = 1.; - int iCount = 0; - int cont = (iCenter < natoms); - - // We will continue iterating until all of the threads have cont set to 0 - while (__any_sync(0xffffffff, cont)) { - if (cont) { - double2 rj[weight_unroll/2]; - double2 rab_val[weight_unroll/2]; - double mu[weight_unroll]; - iCount += weight_unroll; - - #pragma unroll - for (int k = 0; k < weight_unroll/2; k++) { - rj[k] = *((double2*)(local_dist_scratch + jCenter) + k); - rab_val[k] = *((double2*)(local_rab + jCenter) + k); - } - - #pragma unroll - for (int k = 0; k < weight_unroll/2; k++) { - mu[2*k+0] = (ri - rj[k].x) * rab_val[k].x; // XXX: RAB is symmetric - mu[2*k+1] = (ri - rj[k].y) * rab_val[k].y; - } - - #pragma unroll - for (int k = 0; k < weight_unroll; k++) { - if((iCenter != jCenter + k) && (jCenter + k < natoms)) { - mu[k] = sFrisch( mu[k] ); - ps *= mu[k]; - } - } - - // A thread is done with a iCenter based on 2 conditions. Weight tolerance - // Or if it has seen all of the jCenters - if( !(ps > weight_tol && iCount < LDatoms )) { - // In the case were the thread is done, it begins processing another iCenter - sum += ps; - iCenter = atomicAdd(jCounter, 1); - if (iCenter >= iParent) iCenter++; - - // If there are no more iCenters left to process, it signals it is ready to exit - cont = (iCenter < natoms); - ri = local_dist_scratch[ iCenter ]; - local_rab = RAB + iCenter * LDatoms; - ps = 1.; - iCount = 0; - } - } - // Wraps jCenter around. This was faster than modulo - jCenter += weight_unroll; - jCenter = (jCenter < LDatoms) ? jCenter : 0; - } - - // All of the threads then sum their contributions. Only thread 0 needs to add the parent - // contribution. 
- __syncwarp(); - sum = warpReduceSum(sum); - if (threadIdx.x == 0) { - sum += parent_weight; - weights_device[ipt] *= parent_weight / sum; - } - - __syncwarp(); - - } -} - - -void hip_reciprocal(size_t length, double* vec, hipStream_t stream) { - dim3 threads(max_threads_per_thread_block); - dim3 blocks( get_device_sm_count(0) ); - hipLaunchKernelGGL(reciprocal_kernel, dim3(threads), dim3(blocks), 0, stream, length, vec); -} - - -template -void partition_weights_hip_SoA( XCWeightAlg weight_alg, - size_t npts, - size_t LDatoms, - size_t natoms, - const F* points_device, - const int32_t* iparent_device, - const F* dist_nearest_device, - const F* rab_device, - const F* atomic_coords_device, - F* weights_device, - F* dist_scratch_device, - hipStream_t stream ) { - - - - // Evaluate point-to-atom collocation - { - const int distance_thread_y = max_warps_per_thread_block / 2; - dim3 threads( warp_size, distance_thread_y ); - dim3 blocks( util::div_ceil( natoms, threads.x), - util::div_ceil( npts, threads.y * distance_thread_y) ); - - hipLaunchKernelGGL(compute_point_center_dist, dim3(blocks), dim3(threads), 0, stream, - npts, LDatoms, natoms, atomic_coords_device, points_device, dist_scratch_device - ); - - } - const bool partition_weights_1d_kernel = true; - - if( partition_weights_1d_kernel ) { - - dim3 threads( warp_size, weight_thread_block / warp_size ); - dim3 blocks( 1, get_device_sm_count(0) * weight_thread_block_per_sm); - hipLaunchKernelGGL(modify_weights_ssf_kernel_2d, dim3(blocks), dim3(threads), 0, stream , - npts, LDatoms, natoms, rab_device, atomic_coords_device, dist_scratch_device, - iparent_device, dist_nearest_device, weights_device - ); - - } else { - -#if 0 - dim3 threads( 32, 32 ); - dim3 blocks ( npts, 1 ); - - if( weight_alg == XCWeightAlg::SSF ) - hipLaunchKernelGGL(modify_weights_ssf_kernel, dim3(blocks), dim3(threads), 0, stream , - npts, natoms, rab_device, atomic_coords_device, dist_scratch_device, - iparent_device, dist_nearest_device, weights_device - ); - else - hipLaunchKernelGGL(modify_weights_becke_kernel, dim3(blocks), dim3(threads), 0, stream , - npts, natoms, rab_device, atomic_coords_device, dist_scratch_device, - iparent_device, weights_device - ); -#endif - - } - - -} - -template -void partition_weights_hip_SoA( XCWeightAlg weight_alg, - size_t npts, - size_t LDatoms, - size_t natoms, - const double* points_device, - const int32_t* iparent_device, - const double* dist_nearest_device, - const double* rab_device, - const double* atomic_coords_device, - double* weights_device, - double* dist_scratch_device, - hipStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_weights.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_weights.hpp deleted file mode 100644 index 33fd9f4..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_weights.hpp +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once -#include -#include -#include - -namespace GauXC { -namespace integrator { -namespace hip { - - -void hip_reciprocal(size_t length, double* vec, hipStream_t stream); - -template -void partition_weights_hip_SoA( XCWeightAlg weight_alg, - size_t npts, - size_t LDatoms, - size_t natoms, - const F* points_device, - const int32_t* iparent_device, - const F* dist_nearest_device, - const F* rab_device, - const F* atomic_coords_device, - F* weights_device, - F* dist_scratch_device, - hipStream_t stream ); - - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_zmat.hip 
b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_zmat.hip deleted file mode 100644 index cd279f3..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_zmat.hip +++ /dev/null @@ -1,141 +0,0 @@ -#include "hip/hip_runtime.h" -#include "device/hip/hip_zmat.hpp" -#include -#include "device/hip/hip_device_properties.hpp" - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - - -template -__global__ void zmat_lda_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - const auto nbf = task.nbe; - const auto* vrho_device = task.vrho; - - const auto* basis_eval_device = task.bf; - - auto* z_matrix_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nbf ) { - - const size_t ibfoff = tid_y * npts + tid_x; - const double fact = 0.5 * vrho_device[tid_x]; - - z_matrix_device[ ibfoff ] = fact * basis_eval_device[ ibfoff ]; - - } - -} - - - - -template -void zmat_lda_hip( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ) { - - - dim3 threads(warp_size,max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - - hipLaunchKernelGGL(zmat_lda_kernel, dim3(blocks), dim3(threads), 0, stream , ntasks, tasks_device ); - -} - -template -void zmat_lda_hip( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ); - - - - -template -__global__ void zmat_gga_kernel( size_t ntasks, - XCTaskDevice* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - const auto nbf = task.nbe; - const auto* vrho_device = task.vrho; - const auto* vgamma_device = task.vgamma; - const auto* den_x_eval_device = task.ddenx; - const auto* den_y_eval_device = task.ddeny; - const auto* den_z_eval_device = task.ddenz; - - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - - auto* z_matrix_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < npts and tid_y < nbf ) { - - const size_t ibfoff = tid_y * npts + tid_x; - const double fact_1 = 0.5 * vrho_device[tid_x] ; - const double fact_2 = 2.0 * vgamma_device[tid_x]; - - const double dx = den_x_eval_device[ tid_x ] * dbasis_x_eval_device[ ibfoff ]; - const double dy = den_y_eval_device[ tid_x ] * dbasis_y_eval_device[ ibfoff ]; - const double dz = den_z_eval_device[ tid_x ] * dbasis_z_eval_device[ ibfoff ]; - - z_matrix_device[ ibfoff ] = - fact_1 * basis_eval_device[ ibfoff ] + fact_2 * ( dx + dy + dz ); - - } -} - -template -void zmat_gga_hip( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ) { - - - dim3 threads(warp_size,max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - - hipLaunchKernelGGL(zmat_gga_kernel, dim3(blocks), dim3(threads), 0, stream , ntasks, tasks_device ); - -} -template 
-void zmat_gga_hip( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ); - -} -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_zmat.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/hip_zmat.hpp deleted file mode 100644 index 1e6b3f5..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hip_zmat.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator { -namespace hip { - -using namespace GauXC::hip; - -template -void zmat_lda_hip( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ); - -template -void zmat_gga_hip( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCTaskDevice* tasks_device, - hipStream_t stream ); - -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hipblas_extensions.hip b/third_party/gauxc/attic/src/new_integrator/device/hip/hipblas_extensions.hip deleted file mode 100644 index 5a31680..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hipblas_extensions.hip +++ /dev/null @@ -1,154 +0,0 @@ -#include "hip/hip_runtime.h" -#include "device/hip/hipblas_extensions.hpp" -#include -#include -#include "exceptions/hipblas_exception.hpp" - -#include "device/hip/hip_device_properties.hpp" - -namespace GauXC { -namespace hip { -namespace blas { - -using namespace GauXC::hip; - -template -__global__ void increment_kernel( const T* X, T* Y ) { - const auto tid = blockIdx.x; - if( tid < 1 ) (*Y) += (*X); -} - -template -void increment( const T* X, T* Y, hipStream_t stream ) { - hipLaunchKernelGGL(increment_kernel, dim3(1), dim3(1), 0, stream, X,Y); -} - -template <> -void dot( hipblasHandle_t handle, - int N, - const double* X, - int INCX, - const double* Y, - int INCY, - double* RES ) { - - auto stat = hipblasDdot( handle, N, X, INCX, Y, INCY, RES ); - GAUXC_HIPBLAS_ERROR("HIPBLAS DDOT FAILED", stat ); - -} - -template -void gdot( hipblasHandle_t handle, - int N, - const T* X, - int INCX, - const T* Y, - int INCY, - T* SCR, - T* RES ) { - - dot( handle, N, X, INCX, Y, INCY, SCR ); - auto stream = util::get_stream(handle); - increment( SCR, RES, stream ); - -} - -template -void gdot( hipblasHandle_t handle, - int N, - const double* X, - int INCX, - const double* Y, - int INCY, - double* SCR, - double* RES ); - - - - - - - - - - -template -void __global__ hadamard_product_kernel( int M, - int N, - const T* A, - int LDA, - T* B, - int LDB ) { - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - if( tid_x < M and tid_y < N ) { - B[ tid_x + tid_y*LDB ] *= A[ tid_x + tid_y*LDA ]; - } - -} - - - -template -void hadamard_product( hipblasHandle_t handle, - int M, - int N, - const T* A, - int LDA, - T* B, - int LDB ) { - - auto stream = util::get_stream(handle); - dim3 threads(warp_size, max_warps_per_thread_block); - dim3 blocks( util::div_ceil( M, threads.x ), - util::div_ceil( N, threads.y ) ); - - hipLaunchKernelGGL(hadamard_product_kernel, dim3(blocks), dim3(threads), 0, stream , M, N, A, LDA, B, LDB ); - -} - -template -void hadamard_product( hipblasHandle_t handle, - int M, - int N, - const double* A, - int LDA, - double* B, - int LDB ); - - - - -template <> -void gemm( hipblasHandle_t handle, - hipblasOperation_t TA, hipblasOperation_t TB, - int M, int N, int K, double ALPHA, - const double* A, int LDA, const double* B, int LDB, - double BETA, double* C, int LDC 
) { - - auto stat = hipblasDgemm( handle, TA, TB, M, N, K, &ALPHA, A, LDA, - B, LDB, &BETA, C, LDC ); - GAUXC_HIPBLAS_ERROR("HIPBLAS DGEMM FAILED", stat); - -} - - -template <> -void syr2k( hipblasHandle_t handle, - hipblasFillMode_t UPLO, hipblasOperation_t Trans, - int M, int K, double ALPHA, - const double* A, int LDA, const double* B, int LDB, - double BETA, double* C, int LDC ) { - - auto stat = hipblasDsyr2k( handle, UPLO, Trans, M, K, &ALPHA, A, LDA, B, LDB, - &BETA, C, LDC ); - GAUXC_HIPBLAS_ERROR("HIPBLAS DSYR2K FAILED", stat); - -} - -} -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hipblas_extensions.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/hipblas_extensions.hpp deleted file mode 100644 index 77ca77f..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hipblas_extensions.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace hip { -namespace blas { - -template -void dot( hipblasHandle_t handle, - int N, - const T* X, - int INCX, - const T* Y, - int INCY, - T* RES ); - -template -void gdot( hipblasHandle_t handle, - int N, - const T* X, - int INCX, - const T* Y, - int INCY, - T* SCR, - T* RES ); - - -template -void hadamard_product( hipblasHandle_t handle, - int M, - int N, - const T* A, - int LDA, - T* B, - int LDB ); - - -template -void gemm( hipblasHandle_t handle, - hipblasOperation_t TA, hipblasOperation_t TB, - int M, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, - T BETA, T* C, int LDC ); - -template -void syr2k( hipblasHandle_t handle, - hipblasFillMode_t UPLO, hipblasOperation_t Trans, - int M, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, - T BETA, T* C, int LDC ); -} -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/hipify-integrator.sh b/third_party/gauxc/attic/src/new_integrator/device/hip/hipify-integrator.sh deleted file mode 100644 index d7a95b0..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/hipify-integrator.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/sh -hipify-perl ../cuda/collocation/collocation_angular_cartesian.hpp > collocation/collocation_angular_cartesian.hpp -hipify-perl ../cuda/collocation/collocation_angular_spherical_unnorm.hpp > collocation/collocation_angular_spherical_unnorm.hpp -hipify-perl ../cuda/collocation/collocation_device_constants.hpp > collocation/collocation_device_constants.hpp -hipify-perl ../cuda/collocation/collocation_radial.hpp > collocation/collocation_radial.hpp - -#hipify-perl ../cuda/collocation_device.cu > collocation_device.hip -hipify-perl ../cuda/collocation_device.hpp > collocation_device.hpp -hipify-perl ../cuda/collocation_masked_combined_kernels.hpp > collocation_masked_combined_kernels.hpp -hipify-perl ../cuda/collocation_masked_kernels.hpp > collocation_masked_kernels.hpp -hipify-perl ../cuda/collocation_petite_combined_kernels.hpp > collocation_petite_combined_kernels.hpp -hipify-perl ../cuda/collocation_petite_kernels.hpp > collocation_petite_kernels.hpp -hipify-perl ../cuda/cublas_extensions.cu > hipblas_extensions.hip -hipify-perl ../cuda/cublas_extensions.hpp > hipblas_extensions.hpp -#hipify-perl ../cuda/cuda_eval_denvars.cu > hip_eval_denvars.hip -hipify-perl ../cuda/cuda_eval_denvars.hpp > hip_eval_denvars.hpp -hipify-perl ../cuda/cuda_extensions.hpp > hip_extensions.hpp -hipify-perl ../cuda/cuda_alg_variant_control.hpp > hip_alg_variant_control.hpp -hipify-perl ../cuda/cuda_inc_potential.cu > hip_inc_potential.hip -hipify-perl 
../cuda/cuda_inc_potential.hpp > hip_inc_potential.hpp -hipify-perl ../cuda/cuda_pack_density.cu > hip_pack_density.hip -hipify-perl ../cuda/cuda_pack_density.hpp > hip_pack_density.hpp -hipify-perl ../cuda/cuda_weights.cu > hip_weights.hip -hipify-perl ../cuda/cuda_weights.hpp > hip_weights.hpp -hipify-perl ../cuda/cuda_zmat.cu > hip_zmat.hip -hipify-perl ../cuda/cuda_zmat.hpp > hip_zmat.hpp - - -hipify-perl ../cuda/cuda_device_properties.cxx > hip_device_properties.hip -hipify-perl ../cuda/local_work_replicated_incore_exc_vxc.cxx > local_work_replicated_incore_exc_vxc.cxx -hipify-perl ../cuda/xc_cuda_data.cxx > xc_hip_data.cxx - -sed -i -e "s/cuda/hip/g" *.cxx *.hip *.hpp collocation/*.hpp -sed -i -e "s/CUDA/HIP/g" *.cxx *.hip *.hpp collocation/*.hpp -sed -i -e "s/Cuda/Hip/g" *.cxx *.hip *.hpp collocation/*.hpp -sed -i -e "s/cublas/hipblas/g" *.cxx *.hip *.hpp collocation/*.hpp -sed -i -e "s/CUBLAS/HIPBLAS/g" *.cxx *.hip *.hpp collocation/*.hpp - -sed -i -e "s/__global__/__global__\n__launch_bounds__(1024,1)/g" \ - collocation_masked_combined_kernels.hpp collocation_masked_kernels.hpp collocation_petite_combined_kernels.hpp collocation_petite_kernels.hpp - -sed -i -e "s/register //g" *.hip *.hpp -sed -i -e "s/#define GAUXC_ENABLE_WARP_REDUCTIONS/\/\/#define GAUXC_ENABLE_WARP_REDUCTIONS/g" hip_alg_variant_control.hpp diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/local_work_replicated_incore_exc_vxc.cxx b/third_party/gauxc/attic/src/new_integrator/device/hip/local_work_replicated_incore_exc_vxc.cxx deleted file mode 100644 index 2d941f2..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/local_work_replicated_incore_exc_vxc.cxx +++ /dev/null @@ -1,422 +0,0 @@ -#include -#include -#include - -#include "device/hip/hip_weights.hpp" -#include "device/hip/collocation_device.hpp" -#include "device/hip/hip_pack_density.hpp" -#include "device/hip/hip_inc_potential.hpp" -#include "device/hip/hip_eval_denvars.hpp" -#include "device/hip/hip_zmat.hpp" -#include "common/integrator_common.hpp" - -#include "device/hip/hipblas_extensions.hpp" -#include "device/hip/local_work_replicated_incore_exc_vxc.hpp" - -#include "device/hip/xc_hip_data.hpp" - -namespace GauXC { - -namespace integrator::hip { - -using namespace GauXC::hip::blas; - - -template -using hip_task_iterator = typename std::vector>::iterator; - -template -void local_work_replicated_density_incore_exc_vxc( - XCWeightAlg weight_alg, - const functional_type& func, - XCHipData& hip_data, - hip_task_iterator task_begin, - hip_task_iterator task_end -) { - - const auto ntasks = std::distance( task_begin, task_end ); - const auto nbf = hip_data.nbf; - - // Get batch statistics for batches to process - auto nbe_comparator = - []( const auto& a, const auto& b ){ return a.nbe < b.nbe; }; - auto npts_comparator = - []( const auto& a, const auto& b ){ return a.npts < b.npts; }; - auto nshells_comparator = - []( const auto& a, const auto& b ){ return a.nshells < b.nshells; }; - - auto [min_nbe_it, max_nbe_it] = - std::minmax_element( task_begin, task_end, nbe_comparator ); - auto [min_npts_it, max_npts_it] = - std::minmax_element( task_begin, task_end, npts_comparator ); - auto [min_nshells_it, max_nshells_it] = - std::minmax_element( task_begin, task_end, nshells_comparator ); - - const auto min_nbe = min_nbe_it->nbe; - const auto max_nbe = max_nbe_it->nbe; - const auto min_npts = min_npts_it->npts; - const auto max_npts = max_npts_it->npts; - const auto min_nshells = min_nshells_it->nshells; - const auto max_nshells 
= max_nshells_it->nshells; - - util::unused( min_nbe, min_npts, min_nshells ); - - const size_t total_npts = - std::accumulate( task_begin, task_end, 0ul, - []( const auto& a, const auto& b ) { return a + b.npts; } ); - - - // Aliases - hipStream_t master_stream = *hip_data.master_stream; - hipblasHandle_t master_handle = *hip_data.master_handle; - -#ifdef GAUXC_ENABLE_MAGMA - magma_queue_t master_queue = *hip_data.master_magma_queue; -#endif - - auto* dmat_device = hip_data.dmat_device; - - auto* shells_device = hip_data.shells_device; - auto* tasks_device = hip_data.device_tasks; - auto* dmat_array_device = hip_data.dmat_array_device; - auto* zmat_array_device = hip_data.zmat_array_device; - auto* bf_array_device = hip_data.bf_array_device; - auto* weights_device = hip_data.weights_device_buffer; - auto* dist_scratch_device = hip_data.dist_scratch_device; - - auto* den_eval_device = hip_data.den_eval_device; - auto* dden_x_eval_device = hip_data.den_x_eval_device; - auto* dden_y_eval_device = hip_data.den_y_eval_device; - auto* dden_z_eval_device = hip_data.den_z_eval_device; - - auto* eps_eval_device = hip_data.eps_eval_device; - auto* gamma_eval_device = hip_data.gamma_eval_device; - auto* vrho_eval_device = hip_data.vrho_eval_device; - auto* vgamma_eval_device = hip_data.vgamma_eval_device; - - - auto* exc_device = hip_data.exc_device; - auto* vxc_device = hip_data.vxc_device; - auto* nel_device = hip_data.nel_device; - auto* acc_scr_device = hip_data.acc_scr_device; - - auto* m_array_device = hip_data.m_array_device; - auto* n_array_device = hip_data.n_array_device; - auto* k_array_device = hip_data.k_array_device; - auto* lda_array_device = hip_data.lda_array_device; - auto* ldb_array_device = hip_data.ldb_array_device; - auto* ldc_array_device = hip_data.ldc_array_device; - - - const auto* rab_device = hip_data.rab_device; - const auto* coords_device = hip_data.coords_device; - const auto* points_device = hip_data.points_device_buffer; - const auto* iparent_device = hip_data.iparent_device_buffer; - const auto* dist_nearest_device = hip_data.dist_nearest_buffer; - - - - - // Evaluate Partition Weights - partition_weights_hip_SoA( weight_alg, total_npts, hip_data.LDatoms, hip_data.natoms, - points_device, iparent_device, dist_nearest_device, - rab_device, coords_device, weights_device, - dist_scratch_device, master_stream ); - - - // Evaluate Collocation - if constexpr ( n_deriv == 1 ) - eval_collocation_masked_combined_deriv1( ntasks, max_npts, max_nshells, - shells_device, tasks_device, - master_stream ); - else - eval_collocation_masked_combined( ntasks, max_npts, max_nshells, shells_device, - tasks_device, master_stream ); - - // Pack Density Submatrices - task_pack_density_matrix( ntasks, tasks_device, dmat_device, nbf, master_stream ); - - - // Form Z = P * X - if( hip_data.batch_l3_blas ) { - -#ifdef GAUXC_ENABLE_MAGMA - - magmablas_dgemm_vbatched( MagmaNoTrans, MagmaNoTrans, - m_array_device, n_array_device, k_array_device, - 1., bf_array_device, ldb_array_device, - dmat_array_device, lda_array_device, - 0., zmat_array_device, ldc_array_device, - ntasks, master_queue ); - -#else - - throw std::runtime_error("BATCHED BLAS API NOT SUPPORTED"); - -#endif - - } else { - - int nstream = hip_data.blas_streams.size(); - - // Wait for collocation etc - util::hip_event master_event; - master_event.record( master_stream ); - for( int iS = 0; iS < nstream; ++iS ) - hip_data.blas_streams[iS].wait( master_event ); - - // Do GEMM in round-robin - for( auto iT = 0; iT < ntasks; ++iT ) { - 
auto& task = *(task_begin + iT); - gemm( hip_data.blas_handles[iT % nstream], HIPBLAS_OP_N, HIPBLAS_OP_N, - task.npts, task.nbe, task.nbe, 1., task.bf, task.npts, - task.nbe_scr, task.nbe, 0., task.zmat, task.npts ); - } - - // Record completion of BLAS ops - std::vector< util::hip_event > blas_events( nstream ); - for( int iS = 0; iS < nstream; ++iS ) - blas_events[iS].record( hip_data.blas_streams[iS] ); - - // Wait on master stream for all BLAS ops to complete - for( int iS = 0; iS < nstream; ++iS ) - hip_data.master_stream->wait( blas_events[iS] ); - - } - - - - // Zero UVars - util::hip_set_zero_async( total_npts, den_eval_device, master_stream, "DenZero" ); - if( func.is_gga() ) { - util::hip_set_zero_async( total_npts, dden_x_eval_device, master_stream, - "DenXZero" ); - util::hip_set_zero_async( total_npts, dden_y_eval_device, master_stream, - "DenYZero" ); - util::hip_set_zero_async( total_npts, dden_z_eval_device, master_stream, - "DenZZero" ); - } - - // Evaluate UVars - if( func.is_gga() ) { - eval_uvars_gga_device( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - eval_vvars_gga_device( total_npts, dden_x_eval_device, dden_y_eval_device, - dden_z_eval_device, gamma_eval_device, master_stream ); - } else { - eval_uvars_lda_device( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - } - - // Evaluate XC Functional - if( func.is_gga() ) - func.eval_exc_vxc_device( total_npts, den_eval_device, gamma_eval_device, - eps_eval_device, vrho_eval_device, - vgamma_eval_device, master_stream ); - else - func.eval_exc_vxc_device( total_npts, den_eval_device, eps_eval_device, - vrho_eval_device, master_stream ); - - - // Factor weights into XC output - hadamard_product( master_handle, total_npts, 1, weights_device, 1, - eps_eval_device, 1 ); - hadamard_product( master_handle, total_npts, 1, weights_device, 1, - vrho_eval_device, 1 ); - if( func.is_gga() ) - hadamard_product( master_handle, total_npts, 1, weights_device, 1, - vgamma_eval_device, 1 ); - - // Accumulate EXC / NEL - gdot( master_handle, total_npts, weights_device, 1, - den_eval_device, 1, acc_scr_device, nel_device ); - gdot( master_handle, total_npts, eps_eval_device, 1, - den_eval_device, 1, acc_scr_device, exc_device ); - - // Evaluate Z Matrix - if( func.is_gga() ) - zmat_gga_hip( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - else - zmat_lda_hip( ntasks, max_nbe, max_npts, tasks_device, master_stream ); - - - - // Accumulate packed VXC = X * Z**T + Z * X**T - - - if( hip_data.batch_l3_blas ) { - -#ifdef GAUXC_ENABLE_MAGMA - - // XXX: Only updates LT - magmablas_dsyr2k_vbatched( MagmaLower, MagmaTrans, - n_array_device, m_array_device, - 1., bf_array_device, ldb_array_device, - zmat_array_device, ldc_array_device, - 0., dmat_array_device, lda_array_device, - ntasks, master_queue ); - -#else - - throw std::runtime_error("BATCHED BLAS API NOT SUPPORTED"); - -#endif - } else { - - int nstream = hip_data.blas_streams.size(); - - // Wait for zmat, etc - util::hip_event master_event; - master_event.record( master_stream ); - for( int iS = 0; iS < nstream; ++iS ) - hip_data.blas_streams[iS].wait( master_event ); - - // Do SYR2K in round-robin - for( auto iT = 0; iT < ntasks; ++iT ) { - auto& task = *(task_begin + iT); - syr2k( hip_data.blas_handles[iT % nstream], HIPBLAS_FILL_MODE_LOWER, - HIPBLAS_OP_T, task.nbe, task.npts, 1., task.bf, task.npts, - task.zmat, task.npts, 0., task.nbe_scr, task.nbe ); - } - - // Record completion of BLAS ops - std::vector< util::hip_event > blas_events( nstream ); - 
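Both round-robin BLAS sections in this driver (the per-task GEMMs above and the SYR2K accumulation here) rely on the same fork/join choreography between the master stream and the pool of BLAS streams. A minimal sketch with the raw HIP runtime API, rather than the util::hip_event / util::hip_stream wrappers used in the deleted code, would look roughly like:

```cpp
#include <hip/hip_runtime.h>
#include <vector>

// Fork/join between a master stream and a pool of worker streams: workers
// first wait for everything queued on the master, then the master waits for
// all workers before continuing.
void fork_join(hipStream_t master, const std::vector<hipStream_t>& workers) {
  hipEvent_t fork;
  hipEventCreate(&fork);
  hipEventRecord(fork, master);           // marks work queued so far on master
  for (auto s : workers)
    hipStreamWaitEvent(s, fork, 0);       // each worker waits on the master

  // ... enqueue one BLAS call per task here, cycling through `workers` ...

  for (auto s : workers) {
    hipEvent_t done;
    hipEventCreate(&done);
    hipEventRecord(done, s);              // marks this worker's tail
    hipStreamWaitEvent(master, done, 0);  // master waits for it
    hipEventDestroy(done);                // destruction is deferred until the
                                          // recorded event completes
  }
  hipEventDestroy(fork);
}
```

The per-task BLAS call itself is elided; the point is only the event ordering that lets the master stream resume safely after all workers finish.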
for( int iS = 0; iS < nstream; ++iS ) - blas_events[iS].record( hip_data.blas_streams[iS] ); - - // Wait on master stream for all BLAS ops to complete - for( int iS = 0; iS < nstream; ++iS ) - hip_data.master_stream->wait( blas_events[iS] ); - } - - // Increment global VXC - task_inc_potential( ntasks, tasks_device, vxc_device, nbf, master_stream ); - - - // Synchronize on master stream - // XXX: There's no lifetime issues in this driver, should look into - // avoid this sync to allow for overlap with the host packing - hipStreamSynchronize( master_stream ); - -} - - -template -void local_work_replicated_incore_exc_vxc_impl( - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCDeviceData & device_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* EXC, - F* NEL -) { - - auto& hip_data = dynamic_cast< XCHipData& >( device_data ); - - auto task_comparator = []( const XCTask& a, const XCTask& b ) { - return (a.points.size() * a.nbe) > (b.points.size() * b.nbe); - }; - std::sort( local_work_begin, local_work_end, task_comparator ); - - - const auto nbf = basis.nbf(); - const auto natoms = meta.natoms(); - const auto LDatoms = hip_data.LDatoms; - - // Send static data to the device - - // Density - util::hip_copy( nbf * nbf, hip_data.dmat_device, P, "P H2D" ); - - // Shells: TODO avoid host copy? - std::vector> shells( basis ); - util::hip_copy( shells.size(), hip_data.shells_device, shells.data(), - "Shells H2D" ); - - // RAB - util::hip_copy_2d( hip_data.rab_device, LDatoms * sizeof(F), - meta.rab().data(), natoms * sizeof(F), - natoms * sizeof(F), natoms, "RAB H2D"); - // This could probably happen on the host - hip_reciprocal(natoms * LDatoms, hip_data.rab_device, 0); - - // Atomic coordinates - std::vector coords( 3*natoms ); - for( auto i = 0ul; i < natoms; ++i ) { - coords[ 3*i + 0 ] = mol[i].x; - coords[ 3*i + 1 ] = mol[i].y; - coords[ 3*i + 2 ] = mol[i].z; - } - util::hip_copy( 3 * natoms, hip_data.coords_device, coords.data(), - "Coords H2D" ); - - - // Zero out XC quantities - util::hip_set_zero( nbf * nbf, hip_data.vxc_device, "VXC Zero" ); - util::hip_set_zero( 1 , hip_data.exc_device, "EXC Zero" ); - util::hip_set_zero( 1 , hip_data.nel_device, "NEL Zero" ); - - - - // Processes batches in groups that saturadate available device memory - auto task_it = local_work_begin; - while( task_it != local_work_end ) { - - // Determine next task batch, send relevant data to device - auto [it, tasks_device] = - hip_data.generate_buffers( basis, task_it, local_work_end ); - - - // Process the batches - local_work_replicated_density_incore_exc_vxc( - weight_alg, func, hip_data, tasks_device.begin(), tasks_device.end() - ); - - task_it = it; - - } - - // Receive XC terms from host - util::hip_copy( nbf * nbf, VXC, hip_data.vxc_device, "VXC D2H" ); - - util::hip_copy( 1, EXC, hip_data.exc_device, "EXC D2H" ); - util::hip_copy( 1, NEL, hip_data.nel_device, "NEL D2H" ); - - // Symmetrize VXC - for( int32_t j = 0; j < nbf; ++j ) - for( int32_t i = j+1; i < nbf; ++i ) - VXC[ j + i*nbf ] = VXC[ i + j*nbf ]; - -} - - -#define HIP_IMPL( F, ND ) \ -template \ -void local_work_replicated_incore_exc_vxc_impl(\ - XCWeightAlg weight_alg,\ - XCIntegratorState state,\ - const functional_type& func,\ - const BasisSet& basis,\ - const Molecule & mol,\ - const MolMeta & meta,\ - XCDeviceData & device_data,\ - host_task_iterator local_work_begin,\ - 
host_task_iterator local_work_end,\ - const F* P,\ - F* VXC,\ - F* exc,\ - F* n_el\ -) - -HIP_IMPL( double, 0 ); -HIP_IMPL( double, 1 ); - -} -} - - diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/local_work_replicated_incore_exc_vxc.hpp b/third_party/gauxc/attic/src/new_integrator/device/hip/local_work_replicated_incore_exc_vxc.hpp deleted file mode 100644 index 5636d34..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/local_work_replicated_incore_exc_vxc.hpp +++ /dev/null @@ -1,51 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include - -#include - -#include "device/xc_device_data.hpp" - -namespace GauXC { - -namespace integrator::hip { - -using host_task_iterator = std::vector::iterator; - -template -void local_work_replicated_incore_exc_vxc_impl( - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCDeviceData & device_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* exc, - F* n_el -); - - -template -inline void local_work_replicated_incore_exc_vxc( size_t n_deriv, Args&&... args ) { - if( n_deriv == 0 ) - local_work_replicated_incore_exc_vxc_impl( std::forward(args)... ); - else if( n_deriv == 1 ) - local_work_replicated_incore_exc_vxc_impl( std::forward(args)... ); - else - throw std::runtime_error("MGGA NYI"); -} - - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/hip/xc_hip_data.cxx b/third_party/gauxc/attic/src/new_integrator/device/hip/xc_hip_data.cxx deleted file mode 100644 index d5d1839..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/hip/xc_hip_data.cxx +++ /dev/null @@ -1,552 +0,0 @@ -#include "device/hip/xc_hip_data.hpp" -#include - -#include "device/buffer_adaptor.hpp" -#include "common/integrator_common.hpp" -#include "device/hip/hip_device_properties.hpp" - -namespace GauXC { - - -namespace integrator::device { - -template -std::shared_ptr< XCDeviceData > make_device_data() { - return std::make_shared< XCHipData >(); -} - -template std::shared_ptr> make_device_data(); - -} - - - - - - - - -template -XCHipData::XCHipData( bool _batch_l3_blas ): -#ifdef GAUXC_ENABLE_MAGMA - batch_l3_blas(_batch_l3_blas) -#else - batch_l3_blas(false) -#endif -{ - - // TODO: Expose this - double fill_fraction = 0.9; - - hipError_t stat; - - // Get Total Available Memory - size_t hip_avail, hip_total; - stat = hipMemGetInfo( &hip_avail, &hip_total ); - GAUXC_HIP_ERROR( "MemInfo Failed", stat ); - - // Allocate up to fill_fraction - devmem_sz = fill_fraction * hip_avail; - stat = hipMalloc( &device_ptr, devmem_sz ); - GAUXC_HIP_ERROR( "HIP Malloc Failed", stat ); - - // Create HIP Stream and HIPBLAS Handles and make them talk to eachother - master_stream = std::make_unique< util::hip_stream >(); - master_handle = std::make_unique< util::hipblas_handle >(); - - hipblasSetStream( *master_handle, *master_stream ); - -#ifdef GAUXC_ENABLE_MAGMA - // Create MAGMA Queue from HIP Stream and HIPBLAS Handle - master_magma_queue = - std::make_unique< util::magma_queue >( 0, *master_stream, *master_handle ); -#endif - - if( not batch_l3_blas ) { - - // Create BLAS streams - blas_streams.resize(4); - blas_handles.resize(4); - for( auto i = 0; i < 4; ++i ) - hipblasSetStream( blas_handles[i], blas_streams[i] ); - - } - -} - - - -template -XCHipData::~XCHipData() noexcept { - if( device_ptr ) util::hip_free( device_ptr ); -} - - - - - - - 
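allocate_static_data below, and generate_buffers after it, carve every array out of the single hipMalloc'd region through buffer_adaptor. As a reading aid, here is a minimal bump-allocator sketch of that idea; the class name and details are invented for this note and do not reproduce the actual buffer_adaptor implementation:

```cpp
#include <cstddef>
#include <cstdint>
#include <new>

// Minimal bump allocator over one preallocated device buffer: sub-allocations
// are aligned, never freed individually, and the whole region is released
// with a single hipFree at teardown.
class BumpAllocator {
  std::uintptr_t cur_, end_;
public:
  BumpAllocator(void* base, std::size_t bytes)
    : cur_(reinterpret_cast<std::uintptr_t>(base)), end_(cur_ + bytes) {}

  template <typename T>
  T* aligned_alloc(std::size_t count, std::size_t align = alignof(T)) {
    std::uintptr_t p = (cur_ + align - 1) / align * align;  // round up
    if (p + count * sizeof(T) > end_) throw std::bad_alloc();
    cur_ = p + count * sizeof(T);
    return reinterpret_cast<T*>(p);
  }

  std::size_t bytes_left() const { return end_ - cur_; }  // cf. mem.nleft()
  void*       stack()      const { return reinterpret_cast<void*>(cur_); }
};
```

This is why generate_buffers stops admitting tasks once mem_req_batch exceeds memleft: nothing is freed mid-batch, so the only control is how much is stacked per batch.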
-template -void XCHipData::allocate_static_data( size_t _natoms, - size_t _n_deriv, - size_t _nbf, - size_t _nshells ) { - - - // Save state - nshells = _nshells; - nbf = _nbf; - n_deriv = _n_deriv; - natoms = _natoms; - - LDatoms = util::div_ceil( natoms, hip::weight_unroll ) * hip::weight_unroll; - - // Allocate static memory with proper alignment - buffer_adaptor mem( device_ptr, devmem_sz ); - - shells_device = mem.aligned_alloc>( nshells ); - exc_device = mem.aligned_alloc( 1 ); - nel_device = mem.aligned_alloc( 1 ); - acc_scr_device = mem.aligned_alloc( 1 ); - rab_device = mem.aligned_alloc( LDatoms * natoms, sizeof(double2)); - coords_device = mem.aligned_alloc( 3 * natoms ); - - vxc_device = mem.aligned_alloc( nbf * nbf ); - dmat_device = mem.aligned_alloc( nbf * nbf ); - - // Get current stack location - dynmem_ptr = mem.stack(); - dynmem_sz = mem.nleft(); - -} - - - - -using task_iterator = std::vector< XCTask >::iterator; -template -using device_task_container = std::vector< hip::XCTaskDevice >; - -template -std::tuple< task_iterator, device_task_container > - XCHipData::generate_buffers( const BasisSet& basis, - task_iterator task_begin, - task_iterator task_end ) { - - // Host data packing arrays - std::vector< std::array > points_pack; - std::vector< double > weights_pack; - std::vector< size_t > shell_list_pack; - std::vector< size_t > shell_offs_pack; - std::vector< std::array > submat_cut_pack; - std::vector< int32_t > submat_block_pack; - std::vector< int32_t > iparent_pack; - std::vector< double > dist_nearest_pack; - - // Host copies for batched GEMM/SYRK arrays - std::vector< double* > dmat_array, bf_array, zmat_array; - std::vector< int > m_array, n_array, k_array, lda_array, ldb_array, ldc_array; - - device_task_container tasks_device; - - - auto concat_iterable = []( auto& a, const auto& b ) { - a.insert( a.end(), b.begin(), b.end() ); - }; - - - size_t ntask = 0; - size_t total_npts = 0; - size_t total_nbe_nbe = 0; - size_t total_nbe_npts = 0; - size_t total_nshells = 0; - size_t total_ncut = 0; - size_t total_nblock = 0; - size_t memleft = dynmem_sz; - - uint32_t submat_chunk_size = hip::get_submat_cut_block(nbf, 0); - - // Offset memory by the static requirement of an extra pointer element - // for each of the size batch arrays in MAGMA - memleft -= 6 * sizeof(int); //M,N,K,LDA,LDB,LDC - - auto task_it = task_begin; - while( task_it != task_end ) { - - auto iAtom = task_it->iParent; - auto points = task_it->points ; - auto weights = task_it->weights ; - auto shell_list = task_it->shell_list; - auto nbe = task_it->nbe; - auto dist_nearest = task_it->dist_nearest; - - // Generate map from compressed to non-compressed matrices - auto [submat_cut, submat_block] = integrator::gen_compressed_submat_map( basis, shell_list, nbf, submat_chunk_size ); - size_t ncut = submat_cut.size(); - size_t nblock = submat_block.size(); - size_t nshells = shell_list.size(); - size_t npts = points.size(); - - - size_t mem_points = 3 * npts; - size_t mem_weights = npts; - - size_t mem_shells = nshells; - size_t mem_shell_list = nshells; - size_t mem_shell_offs = nshells; - size_t mem_submat_cut = 3 * ncut; - size_t mem_submat_block = nblock; - - size_t mem_nbe_scr = nbe * nbe; - size_t mem_zmat = nbe * npts; - - size_t mem_bf = nbe * npts; - size_t mem_dbfx = mem_bf; - size_t mem_dbfy = mem_bf; - size_t mem_dbfz = mem_bf; - - size_t mem_den = npts; - size_t mem_denx = npts; - size_t mem_deny = npts; - size_t mem_denz = npts; - - size_t mem_eps = npts; - size_t mem_gamma = npts; - size_t 
mem_vrho = npts; - size_t mem_vgamma = npts; - - //size_t mem_partition_scr = natoms * npts; - size_t mem_dist_scr = LDatoms * npts; - size_t mem_iparent = npts; - size_t mem_dist_nearest = npts; - - size_t mem_batch_mat_arr = 3; // dmat/zmat/bf - size_t mem_batch_sz_arr = 6; // M/N/K/LDA/LDB/LDC - size_t mem_task = 1; - - - size_t mem_req_batch = - mem_points * sizeof(double) + - mem_weights * sizeof(double) + - mem_shells * sizeof(Shell) + - mem_shell_list * sizeof(size_t) + - mem_shell_offs * sizeof(size_t) + - mem_submat_cut * sizeof(int32_t) + - mem_submat_block * sizeof(int32_t) + - mem_nbe_scr * sizeof(double) + - mem_zmat * sizeof(double) + - mem_bf * sizeof(double) + - mem_dbfx * sizeof(double) + - mem_dbfy * sizeof(double) + - mem_dbfz * sizeof(double) + - mem_den * sizeof(double) + - mem_denx * sizeof(double) + - mem_deny * sizeof(double) + - mem_denz * sizeof(double) + - mem_eps * sizeof(double) + - mem_gamma * sizeof(double) + - mem_vrho * sizeof(double) + - mem_vgamma * sizeof(double) + - //mem_partition_scr * sizeof(double) + - mem_dist_scr * sizeof(double) + - mem_iparent * sizeof(int32_t) + - mem_dist_nearest * sizeof(double) + - mem_batch_mat_arr * sizeof(double*) + - mem_batch_sz_arr * sizeof(int32_t) + - mem_task * sizeof(hip::XCTaskDevice); - - //std::cout << "Memory requirement for task " << ntask+1 << " " << mem_req_batch << " memleft " << memleft << std::endl; - - if( mem_req_batch > memleft ) break; - - // Update memory and increment task iterator - memleft -= mem_req_batch; - ntask++; - task_it++; - - // Update counters - total_npts += npts; - total_nbe_nbe += nbe*nbe; - total_nbe_npts += nbe*npts; - total_nshells += nshells; - total_ncut += ncut; - total_nblock += nblock; - - // Compute offsets - std::vector< size_t > shell_offs( nshells ); - shell_offs.at(0) = 0; - for( auto i = 1ul; i < nshells; ++i ) - shell_offs.at(i) = shell_offs.at(i-1) + - basis.at( shell_list.at(i-1) ).size(); - - - // Pack the data on host - concat_iterable( points_pack, points ); - concat_iterable( weights_pack, weights ); - concat_iterable( shell_list_pack, shell_list ); - concat_iterable( shell_offs_pack, shell_offs ); - concat_iterable( submat_cut_pack, submat_cut ); - concat_iterable( submat_block_pack, submat_block ); - - m_array.emplace_back( npts ); - n_array.emplace_back( nbe ); - k_array.emplace_back( nbe ); - - lda_array.emplace_back( nbe ); - ldb_array.emplace_back( npts ); - ldc_array.emplace_back( npts ); - - iparent_pack.insert( iparent_pack.end(), npts, iAtom ); - dist_nearest_pack.insert( dist_nearest_pack.end(), npts, dist_nearest ); - - // Add task - tasks_device.emplace_back(); - - tasks_device.back().nbe = nbe; - tasks_device.back().npts = npts; - tasks_device.back().ncut = ncut; - tasks_device.back().nblock = nblock; - tasks_device.back().nshells = nshells; - tasks_device.back().iParent = iAtom; - tasks_device.back().dist_nearest = dist_nearest; - } - - - std::cout << "XCDeviceData will stack allocate for " << tasks_device.size() << " tasks"; - std::cout << " Using chunk size of " << submat_chunk_size << std::endl; - - // Allocate out of dynamic memory - buffer_adaptor mem( dynmem_ptr, dynmem_sz ); - - // (possibly) Large types - important_shells_device = mem.aligned_alloc>( total_nshells ); - device_tasks = mem.aligned_alloc>( ntask ); - - // 64-bit types - nbe_scr_device = mem.aligned_alloc( total_nbe_nbe ); - zmat_device = mem.aligned_alloc( total_nbe_npts ); - bf_eval_device = mem.aligned_alloc( total_nbe_npts ); - dbf_x_eval_device = mem.aligned_alloc( 
total_nbe_npts ); - dbf_y_eval_device = mem.aligned_alloc( total_nbe_npts ); - dbf_z_eval_device = mem.aligned_alloc( total_nbe_npts ); - - den_eval_device = mem.aligned_alloc( total_npts ); - eps_eval_device = mem.aligned_alloc( total_npts ); - vrho_eval_device = mem.aligned_alloc( total_npts ); - - den_x_eval_device = mem.aligned_alloc( total_npts ); - den_y_eval_device = mem.aligned_alloc( total_npts ); - den_z_eval_device = mem.aligned_alloc( total_npts ); - gamma_eval_device = mem.aligned_alloc( total_npts ); - vgamma_eval_device = mem.aligned_alloc( total_npts ); - - points_device_buffer = mem.aligned_alloc( 3 * total_npts ); - weights_device_buffer = mem.aligned_alloc( total_npts ); - shell_list_device_buffer = mem.aligned_alloc( total_nshells ); - shell_offs_device_buffer = mem.aligned_alloc( total_nshells ); - submat_cut_device_buffer = mem.aligned_alloc( 3 * total_ncut ); - submat_block_device_buffer = mem.aligned_alloc( total_nblock ); - - dist_scratch_device = mem.aligned_alloc( LDatoms * total_npts, 2 * sizeof(double) ); - dist_nearest_buffer = mem.aligned_alloc( total_npts ); - - dmat_array_device = mem.aligned_alloc( ntask ); - zmat_array_device = mem.aligned_alloc( ntask ); - bf_array_device = mem.aligned_alloc( ntask ); - - // 32-bit types - m_array_device = mem.aligned_alloc( ntask + 1 ); - n_array_device = mem.aligned_alloc( ntask + 1 ); - k_array_device = mem.aligned_alloc( ntask + 1 ); - lda_array_device = mem.aligned_alloc( ntask + 1 ); - ldb_array_device = mem.aligned_alloc( ntask + 1 ); - ldc_array_device = mem.aligned_alloc( ntask + 1 ); - - iparent_device_buffer = mem.aligned_alloc( total_npts ); - - - // Update tasks with allocated pointers - { - double* points_ptr = points_device_buffer; - double* weights_ptr = weights_device_buffer; - - size_t* shell_list_ptr = shell_list_device_buffer; - size_t* shell_offs_ptr = shell_offs_device_buffer; - int32_t* submat_cut_ptr = submat_cut_device_buffer; - int32_t* submat_block_ptr = submat_block_device_buffer; - Shell * shells_ptr = important_shells_device; - double* nbe_ptr = nbe_scr_device; - double* zmat_ptr = zmat_device; - - double* bf_ptr = bf_eval_device; - double* dbfx_ptr = dbf_x_eval_device; - double* dbfy_ptr = dbf_y_eval_device; - double* dbfz_ptr = dbf_z_eval_device; - - double* den_ptr = den_eval_device; - double* ddenx_ptr = den_x_eval_device; - double* ddeny_ptr = den_y_eval_device; - double* ddenz_ptr = den_z_eval_device; - - double* eps_ptr = eps_eval_device; - double* gamma_ptr = gamma_eval_device; - double* vrho_ptr = vrho_eval_device; - double* vgamma_ptr = vgamma_eval_device; - - - double* dist_scratch_ptr = dist_scratch_device; - - for( auto& task : tasks_device ) { - - task.points = points_ptr; - task.weights = weights_ptr; - task.shell_list = shell_list_ptr; - task.shell_offs = shell_offs_ptr; - task.submat_cut = submat_cut_ptr; - task.submat_block = submat_block_ptr; - - task.shells = shells_ptr; - task.nbe_scr = nbe_ptr; - task.zmat = zmat_ptr; - task.bf = bf_ptr; - task.dbfx = dbfx_ptr; - task.dbfy = dbfy_ptr; - task.dbfz = dbfz_ptr; - task.den = den_ptr; - task.ddenx = ddenx_ptr; - task.ddeny = ddeny_ptr; - task.ddenz = ddenz_ptr; - - task.eps = eps_ptr; - task.gamma = gamma_ptr; - task.vrho = vrho_ptr; - task.vgamma = vgamma_ptr; - - task.dist_scratch = dist_scratch_ptr; - - auto npts = task.npts; - auto nbe = task.nbe; - auto nshells = task.nshells; - auto ncut = task.ncut; - auto nblock = task.nblock; - - points_ptr += 3 * npts; - weights_ptr += npts; - shell_list_ptr += nshells; - 
shell_offs_ptr += nshells; - submat_cut_ptr += 3 * ncut; - submat_block_ptr += nblock; - - shells_ptr += nshells; - nbe_ptr += nbe * nbe; - zmat_ptr += nbe * npts; - - bf_ptr += nbe * npts; - dbfx_ptr += nbe * npts; - dbfy_ptr += nbe * npts; - dbfz_ptr += nbe * npts; - - den_ptr += npts; - ddenx_ptr += npts; - ddeny_ptr += npts; - ddenz_ptr += npts; - - eps_ptr += npts; - gamma_ptr += npts; - vrho_ptr += npts; - vgamma_ptr += npts; - - dist_scratch_ptr += LDatoms * npts; - - - - // Batched LA - dmat_array.emplace_back( task.nbe_scr ); - bf_array.emplace_back( task.bf ); - zmat_array.emplace_back( task.zmat ); - } - - } // End task setup - - - - - auto copy_rev = [&]( size_t n, const auto* src, auto* dest, hipStream_t stream, - std::string m ) { - util::hip_copy_async( n, dest, src, stream, m ); - }; - - - - try { - - // Send the data to the device - copy_rev( 3*points_pack.size(), points_pack.data()->data(), - points_device_buffer, *master_stream, - "send points buffer" ); - copy_rev( weights_pack.size(), weights_pack.data(), - weights_device_buffer, *master_stream, - "send weights buffer" ); - - copy_rev( shell_list_pack.size(), shell_list_pack.data(), - shell_list_device_buffer, *master_stream, - "send_shell_list_buffer" ); - copy_rev( shell_offs_pack.size(), shell_offs_pack.data(), - shell_offs_device_buffer, *master_stream, - "send_shell_offs_buffer" ); -// std::cout << "Element size " << sizeof(std::get<0>(submat_cut_pack[0]) << std::endl; - copy_rev( 3 * submat_cut_pack.size(), submat_cut_pack.data()->data(), - submat_cut_device_buffer, *master_stream, - "send_submat_cut_buffer" ); - copy_rev( submat_block_pack.size(), submat_block_pack.data(), - submat_block_device_buffer, *master_stream, - "send_submat_block_buffer" ); - - copy_rev( tasks_device.size(), tasks_device.data(), device_tasks, - *master_stream, "send_tasks_device" ); - - - copy_rev( dmat_array.size(), dmat_array.data(), dmat_array_device, - *master_stream, "send dmat_array" ); - copy_rev( zmat_array.size(), zmat_array.data(), zmat_array_device, - *master_stream, "send zmat_array" ); - copy_rev( bf_array.size(), bf_array.data(), bf_array_device, - *master_stream, "send bf_array" ); - - copy_rev( m_array.size(), m_array.data(), m_array_device, - *master_stream, "send m_array" ); - copy_rev( n_array.size(), n_array.data(), n_array_device, - *master_stream, "send n_array" ); - copy_rev( k_array.size(), k_array.data(), k_array_device, - *master_stream, "send k_array" ); - - copy_rev( lda_array.size(), lda_array.data(), lda_array_device, - *master_stream, "send lda_array" ); - copy_rev( ldb_array.size(), ldb_array.data(), ldb_array_device, - *master_stream, "send ldb_array" ); - copy_rev( ldc_array.size(), ldc_array.data(), ldc_array_device, - *master_stream, "send ldc_array" ); - - copy_rev( iparent_pack.size(), iparent_pack.data(), - iparent_device_buffer, *master_stream, "send iparent" ); - copy_rev( dist_nearest_pack.size(), dist_nearest_pack.data(), - dist_nearest_buffer, *master_stream, "send dist_nearest" ); - - } catch(...) 
{ - //teardown_(); throw; - throw; - } - - - // To avoid packed vectors going out of scope - hipStreamSynchronize( *master_stream ); - - return std::make_tuple(task_it, tasks_device); -} - - -// Explicit Instantiations -template class XCHipData; - -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/incore_xc_device_exc_vxc.hpp b/third_party/gauxc/attic/src/new_integrator/device/incore_xc_device_exc_vxc.hpp deleted file mode 100644 index 9558ad2..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/incore_xc_device_exc_vxc.hpp +++ /dev/null @@ -1,116 +0,0 @@ -#include -#include - -#include "device/local_work_replicated_incore_exc_vxc.hpp" -#include -#include -#include "exceptions/magma_exception.hpp" - -namespace GauXC { -namespace detail { - -template -void IncoreXCDeviceIntegrator:: - eval_exc_vxc_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* VXC, int64_t ldvxc, - value_type* EXC ) { - -#ifdef GAUXC_ENABLE_MAGMA - // Initialize MAGMA - { - auto ierr = magma_init(); - GAUXC_MAGMA_ERROR( "MAGMA Init Failed", ierr ); - } -#endif - - util::unused(m,n,ldp,ldvxc); - - size_t nbf = this->basis_->nbf(); - - //// TODO: Check that P is sane - - - // Generate Tasks - auto& tasks = this->load_balancer_->get_tasks(); - size_t n_deriv = this->func_->is_gga() ? 1 : 0; - - // Allocate Memory - auto device_data = this->timer_.time_op("XCIntegrator.DeviceAlloc",[&]() { - auto ptr = GauXC::integrator::device::make_device_data(); - ptr->allocate_static_data( this->load_balancer_->molecule().natoms(), - n_deriv, this->basis_->nbf(), - this->basis_->size() ); - return ptr; - }); - - value_type N_EL; - - // Compute Local contributions to EXC / VXC - this->timer_.time_op("XCIntegrator.LocalWork", [&](){ - GauXC::integrator::device::local_work_replicated_incore_exc_vxc< value_type >( - n_deriv, XCWeightAlg::SSF, state_, *this->func_, - *this->basis_, this->load_balancer_->molecule(), - this->load_balancer_->molmeta(), *device_data, tasks, P, - VXC, EXC, &N_EL - ); - }); - - // Update State of Integrator - state_.load_balancer_populated = true; - //state_.modified_weights_are_stored = true; - - -#ifdef GAUXC_ENABLE_MPI - - int world_size; - MPI_Comm_size( this->comm_, &world_size ); - - if( world_size > 1 ) { - - this->timer_.time_op("XCIntegrator.Allreduce", [&](){ - // Test of communicator is an inter-communicator - // XXX: Can't think of a case when this would be true, but who knows... 
- int inter_flag; - MPI_Comm_test_inter( this->comm_, &inter_flag ); - - // Is Intra-communicator, Allreduce can be done inplace - if( not inter_flag ) { - - MPI_Allreduce( MPI_IN_PLACE, VXC, nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - // Isn't Intra-communicator (weird), Allreduce can't be done inplace - } else { - - std::allocator alloc; - auto VXC_cpy = alloc.allocate( nbf*nbf ); - value_type EXC_cpy = *EXC, N_EL_cpy = N_EL; - - MPI_Allreduce( VXC_cpy, VXC, nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( &EXC_cpy, EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( &N_EL_cpy, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - - } - }); - - } - -#endif - -#ifdef GAUXC_ENABLE_MAGMA - // Finalize MAGMA - { - auto ierr = magma_finalize(); - GAUXC_MAGMA_ERROR( "MAGMA Finalize Failed", ierr ); - } -#endif - -} - -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/incore_xc_device_integrator.cxx b/third_party/gauxc/attic/src/new_integrator/device/incore_xc_device_integrator.cxx deleted file mode 100644 index 88c7ffd..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/incore_xc_device_integrator.cxx +++ /dev/null @@ -1,27 +0,0 @@ -#include - -#include "device/incore_xc_device_exc_vxc.hpp" - -namespace GauXC { -namespace detail { - -template -IncoreXCDeviceIntegrator:: - IncoreXCDeviceIntegrator( const IncoreXCDeviceIntegrator& ) = default; - -template -IncoreXCDeviceIntegrator:: - IncoreXCDeviceIntegrator( IncoreXCDeviceIntegrator&& ) noexcept = default; - -template -IncoreXCDeviceIntegrator:: - ~IncoreXCDeviceIntegrator() noexcept = default; - - - - - -template class IncoreXCDeviceIntegrator; - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/local_work_replicated_incore_exc_vxc.hpp b/third_party/gauxc/attic/src/new_integrator/device/local_work_replicated_incore_exc_vxc.hpp deleted file mode 100644 index 314e567..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/local_work_replicated_incore_exc_vxc.hpp +++ /dev/null @@ -1,93 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include - -#include - -#ifdef GAUXC_ENABLE_CUDA -#include "device/cuda/local_work_replicated_incore_exc_vxc.hpp" -#endif - -#ifdef GAUXC_ENABLE_HIP -#include "device/hip/local_work_replicated_incore_exc_vxc.hpp" -#endif - -namespace GauXC::integrator::device { - -using host_task_iterator = std::vector::iterator; - -template -void local_work_replicated_incore_exc_vxc_impl( - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCDeviceData & device_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* exc, - F* n_el -) { - - -#ifdef GAUXC_ENABLE_CUDA - GauXC::integrator::cuda::local_work_replicated_incore_exc_vxc_impl( - weight_alg, state, func, basis, mol, meta, device_data, local_work_begin, - local_work_end, P, VXC, exc, n_el - ); -#endif - -#ifdef GAUXC_ENABLE_HIP - GauXC::integrator::hip::local_work_replicated_incore_exc_vxc_impl( - weight_alg, state, func, basis, mol, meta, device_data, local_work_begin, - local_work_end, P, VXC, exc, n_el - ); -#endif - -} - -template -void local_work_replicated_incore_exc_vxc_impl( - XCWeightAlg weight_alg, - XCIntegratorState state, - const 
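// The reduction just above is the standard pattern for combining per-rank
// results: in-place MPI_Allreduce on an intra-communicator, and a temporary copy
// when MPI_IN_PLACE is not allowed. A self-contained sketch of that step;
// reduce_results is an illustrative name, not GauXC API.
#include <mpi.h>
#include <vector>

void reduce_results(MPI_Comm comm, std::vector<double>& VXC,
                    double& EXC, double& N_EL) {
  int inter_flag = 0;
  MPI_Comm_test_inter(comm, &inter_flag);
  if (!inter_flag) {
    // Intra-communicator: every rank contributes and receives the sum in place.
    MPI_Allreduce(MPI_IN_PLACE, VXC.data(), static_cast<int>(VXC.size()),
                  MPI_DOUBLE, MPI_SUM, comm);
    MPI_Allreduce(MPI_IN_PLACE, &EXC,  1, MPI_DOUBLE, MPI_SUM, comm);
    MPI_Allreduce(MPI_IN_PLACE, &N_EL, 1, MPI_DOUBLE, MPI_SUM, comm);
  } else {
    // Inter-communicator: reduce out of a copy instead.
    std::vector<double> VXC_cpy(VXC);
    double EXC_cpy = EXC, N_EL_cpy = N_EL;
    MPI_Allreduce(VXC_cpy.data(), VXC.data(), static_cast<int>(VXC.size()),
                  MPI_DOUBLE, MPI_SUM, comm);
    MPI_Allreduce(&EXC_cpy,  &EXC,  1, MPI_DOUBLE, MPI_SUM, comm);
    MPI_Allreduce(&N_EL_cpy, &N_EL, 1, MPI_DOUBLE, MPI_SUM, comm);
  }
}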
functional_type& func,
- const BasisSet& basis,
- const Molecule & mol,
- const MolMeta & meta,
- XCDeviceData & device_data,
- std::vector< XCTask >& tasks,
- const F* P,
- F* VXC,
- F* exc,
- F* n_el
-) {
-
- local_work_replicated_incore_exc_vxc_impl( weight_alg, state, func,
- basis, mol, meta, device_data, tasks.begin(), tasks.end(), P, VXC, exc, n_el );
-
-
-}
-
-template
-inline void local_work_replicated_incore_exc_vxc( size_t n_deriv, Args&&... args ) {
- if( n_deriv == 0 )
- local_work_replicated_incore_exc_vxc_impl( std::forward(args)... );
- else if( n_deriv == 1 )
- local_work_replicated_incore_exc_vxc_impl( std::forward(args)... );
- else
- throw std::runtime_error("MGGA NYI");
-}
-
-
-}
-
diff --git a/third_party/gauxc/attic/src/new_integrator/device/local_work_replicated_shellbatched_exc_vxc.cxx b/third_party/gauxc/attic/src/new_integrator/device/local_work_replicated_shellbatched_exc_vxc.cxx
deleted file mode 100644
index 56a99d7..0000000
--- a/third_party/gauxc/attic/src/new_integrator/device/local_work_replicated_shellbatched_exc_vxc.cxx
+++ /dev/null
@@ -1,431 +0,0 @@
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-#include "device/local_work_replicated_incore_exc_vxc.hpp"
-#include "device/local_work_replicated_shellbatched_exc_vxc.hpp"
-#include "host/util.hpp"
-#include "common/integrator_common.hpp"
-
-namespace GauXC {
-namespace integrator::device {
-
-struct dev_ex_task {
- host_task_iterator task_begin;
- host_task_iterator task_end;
- std::vector shell_list;
-};
-
-
-
-
-dev_ex_task generate_dev_batch( const uint32_t nbf_threshold,
- host_task_iterator task_begin,
- host_task_iterator local_work_end,
- const BasisSet& basis,
- util::Timer& timer ) {
-
-
- auto nbe_comparator = []( const auto& task_a, const auto& task_b ) {
- return task_a.nbe < task_b.nbe;
- };
-
- // Find task with largest NBE
- auto max_task = timer.time_op_accumulate("XCIntegrator.MaxTask", [&]() {
- return std::max_element( task_begin, local_work_end, nbe_comparator );
- } );
-
- const auto max_shell_list = max_task->shell_list; // copy for reset
-
- // Init uniion shell list to max shell list outside of loop
- std::set union_shell_set(max_shell_list.begin(),
- max_shell_list.end());
-
-
-
- size_t n_overlap_pthresh = 20;
- double overlap_pthresh_delta = 1.
/ n_overlap_pthresh; - std::vector overlap_pthresh; - for( int i = 1; i < n_overlap_pthresh; ++i ) - overlap_pthresh.emplace_back( i*overlap_pthresh_delta ); - - std::vector overlap_pthresh_idx( overlap_pthresh.size() ); - std::iota( overlap_pthresh_idx.begin(), overlap_pthresh_idx.end(), 0 ); - - std::map> - cached_task_ends; - - int cur_partition_pthresh_idx = -1; - - auto _it = std::partition_point( overlap_pthresh_idx.rbegin(), - overlap_pthresh_idx.rend(), - [&](int idx) { - - uint32_t overlap_threshold = - std::max(1., max_shell_list.size() * overlap_pthresh[idx] ); - - - host_task_iterator search_st = task_begin; - host_task_iterator search_en = local_work_end; - - // Make a local copy of union list - std::set local_union_shell_set; - - // Attempt to limit task search based on current partition - if( cur_partition_pthresh_idx >= 0 ) { - - const auto& last_pthresh = - cached_task_ends.at(cur_partition_pthresh_idx); - - if( cur_partition_pthresh_idx > idx ) { - search_st = last_pthresh.first; - local_union_shell_set = last_pthresh.second; - } else { - search_en = last_pthresh.first; - local_union_shell_set = union_shell_set; - } - - } else { - local_union_shell_set = union_shell_set; - } - - - // Partition tasks into those which overlap max_task up to - // specified threshold - auto task_end = - timer.time_op_accumulate("XCIntegrator.TaskIntersection", [&]() { - return std::partition( search_st, search_en, [&](const auto& t) { - return util::integral_list_intersect( max_shell_list, t.shell_list, - overlap_threshold ); - } ); - } ); - - - - // Take union of shell list for all overlapping tasks - timer.time_op_accumulate("XCIntegrator.ShellListUnion",[&]() { - for( auto task_it = search_st; task_it != task_end; ++task_it ) { - local_union_shell_set.insert( task_it->shell_list.begin(), - task_it->shell_list.end() ); - } - } ); - - auto cur_nbe = basis.nbf_subset( local_union_shell_set.begin(), - local_union_shell_set.end() ); - - //std::cout << " Threshold % = " << std::setw(5) << overlap_pthresh[idx] << ", "; - //std::cout << " Overlap Threshold = " << std::setw(8) << overlap_threshold << ", "; - //std::cout << " Current NBE = " << std::setw(8) << cur_nbe << std::endl; - - // Cache the data - cached_task_ends[idx] = std::make_pair( task_end, local_union_shell_set ); - - // Update partitioned threshold - cur_partition_pthresh_idx = idx; - - return cur_nbe < nbf_threshold; - - } ); - - host_task_iterator task_end; - auto _idx_partition = (_it == overlap_pthresh_idx.rend()) ? 
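// generate_dev_batch above grows a batch around the task with the largest nbe
// by moving every task whose shell list overlaps it strongly enough to the
// front of the range with std::partition. The selection step in isolation,
// assuming sorted shell lists; Task, overlap and select_overlapping are
// illustrative names, not GauXC API.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <vector>

struct Task { std::vector<int32_t> shell_list; };

// Number of shells two sorted shell lists have in common.
std::size_t overlap(const std::vector<int32_t>& a, const std::vector<int32_t>& b) {
  std::vector<int32_t> common;
  std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                        std::back_inserter(common));
  return common.size();
}

// Moves tasks sharing at least `threshold` shells with `ref` to the front and
// returns the batch boundary (one past the selected group).
std::vector<Task>::iterator
select_overlapping(std::vector<Task>& tasks, const std::vector<int32_t>& ref,
                   std::size_t threshold) {
  return std::partition(tasks.begin(), tasks.end(), [&](const Task& t) {
    return overlap(ref, t.shell_list) >= threshold;
  });
}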
0 : *_it; - std::tie( task_end, union_shell_set ) = cached_task_ends.at(_idx_partition); - - - - - - //std::cout << "FOUND " << std::distance( task_begin, task_end ) - // << " OVERLAPPING TASKS" << std::endl; - - - std::vector union_shell_list( union_shell_set.begin(), - union_shell_set.end() ); - - // Try to add additional tasks given current union list - task_end = timer.time_op_accumulate("XCIntegrator.SubtaskGeneration", [&]() { - return std::partition( task_end, local_work_end, [&]( const auto& t ) { - return util::list_subset( union_shell_list, t.shell_list ); - } ); - } ); - - //std::cout << "FOUND " << std::distance( task_begin, task_end ) - // << " SUBTASKS" << std::endl; - - - dev_ex_task ex_task; - ex_task.task_begin = task_begin; - ex_task.task_end = task_end; - ex_task.shell_list = std::move( union_shell_list ); - - return ex_task; - -} - -template -void device_execute_shellbatched( - util::Timer& timer, - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCDeviceData & device_data, - const F* P, - F* VXC, - F* EXC, - F* NEL, - const dev_ex_task& ex_task_obj -) { - - // Alias information - auto task_begin = ex_task_obj.task_begin; - auto task_end = ex_task_obj.task_end; - auto& union_shell_list = ex_task_obj.shell_list; - - const auto natoms = mol.natoms(); - - // Extract subbasis - BasisSet basis_subset; basis_subset.reserve(union_shell_list.size()); - timer.time_op_accumulate("XCIntegrator.CopySubBasis",[&]() { - for( auto i : union_shell_list ) { - basis_subset.emplace_back( basis.at(i) ); - } - //basis_subset.generate_shell_to_ao(); - }); - - // Setup basis maps - BasisSetMap basis_map( basis ); - - const size_t nshells = basis_subset.size(); - const size_t nbe = basis_subset.nbf(); - std::cout << "TASK_UNION HAS:" << std::endl - << " NSHELLS = " << nshells << std::endl - << " NBE = " << nbe << std::endl; - - // Recalculate shell_list based on subbasis - timer.time_op_accumulate("XCIntegrator.RecalcShellList",[&]() { - for( auto _it = task_begin; _it != task_end; ++_it ) { - auto union_list_idx = 0; - auto& cur_shell_list = _it->shell_list; - for( auto j = 0; j < cur_shell_list.size(); ++j ) { - while( union_shell_list[union_list_idx] != cur_shell_list[j] ) - union_list_idx++; - cur_shell_list[j] = union_list_idx; - } - } - } ); - - - - // Allocate host temporaries - std::vector P_submat_host(nbe*nbe), VXC_submat_host(nbe*nbe); - F EXC_tmp, NEL_tmp; - F* P_submat = P_submat_host.data(); - F* VXC_submat = VXC_submat_host.data(); - - // Extract subdensity - std::vector> union_submat_cut; - std::vector foo; - //auto [union_submat_cut, foo] = - std::tie(union_submat_cut,foo) = - integrator::gen_compressed_submat_map( basis_map, union_shell_list, - basis.nbf(), basis.nbf() ); - - timer.time_op_accumulate("XCIntegrator.ExtractSubDensity",[&]() { - detail::submat_set( basis.nbf(), basis.nbf(), nbe, nbe, P, basis.nbf(), - P_submat, nbe, union_submat_cut ); - } ); - - - // Allocate static quantities on device stack - device_data.allocate_static_data( natoms, n_deriv, nbe, nshells ); - - // Process batches on device with subobjects - local_work_replicated_incore_exc_vxc_impl( - weight_alg, state, func, basis_subset, mol, meta, device_data, - task_begin, task_end, P_submat, VXC_submat, &EXC_tmp, &NEL_tmp - ); - - // Update full quantities - *EXC += EXC_tmp; - *NEL += NEL_tmp; - timer.time_op_accumulate("XCIntegrator.IncrementSubPotential",[&]() { - detail::inc_by_submat( 
basis.nbf(), basis.nbf(), nbe, nbe, VXC, basis.nbf(), - VXC_submat, nbe, union_submat_cut ); - }); - - - // Reset shell_list to be wrt full basis - timer.time_op_accumulate("XCIntegrator.ResetShellList",[&]() { - for( auto _it = task_begin; _it != task_end; ++_it ) - for( auto j = 0; j < _it->shell_list.size(); ++j ) { - _it->shell_list[j] = union_shell_list[_it->shell_list[j]]; - } - }); - -} - - - - - -template -void local_work_replicated_shellbatched_exc_vxc_impl( - util::Timer& timer, - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCDeviceData & device_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* EXC, - F* NEL -) { - - const uint32_t nbf_threshold = 8000; - std::cout << "IN SHELL BATCHED\n" << std::flush; - std::cout << "TOTAL NTASKS = " << std::distance( local_work_begin, local_work_end ) << std:: endl; - std::cout << "TOTAL NBF = " << basis.nbf() << std::endl; - std::cout << "NBF THRESH = " << nbf_threshold << std::endl; - - - // Zero out final results - timer.time_op( "XCIntegrator.ZeroHost", [&]() { - *EXC = 0.; - *NEL = 0.; - std::memset( VXC, 0, basis.nbf()*basis.nbf()*sizeof(F) ); - }); - -#if 0 - size_t nbf = basis.nbf(); - size_t nshells = basis.nshells(); - size_t natoms = mol.size(); - - // Allocate static quantities on device stack - device_data.allocate_static_data( natoms, n_deriv, nbf, nshells ); - - process_batches_cuda_replicated_density_incore_p( - weight_alg, func, basis, mol, meta, device_data, - local_work_begin, local_work_end, P, VXC, EXC, NEL - ); -#else - - auto nbe_comparator = []( const auto& task_a, const auto& task_b ) { - return task_a.nbe < task_b.nbe; - }; - - - size_t batch_iter = 0; - auto task_begin = local_work_begin; - - const size_t natoms = mol.size(); - - //std::future device_ex; - - std::cout << "MASTER THREAD ID = " << std::this_thread::get_id() << std::endl; - std::queue< dev_ex_task > dev_tasks; - - auto execute_device_task = [&] () { - - if( dev_tasks.empty() ) return; - - std::cout << "Executing device tasks on thread " << std::this_thread::get_id() << std::endl; - - dev_ex_task batch_task = std::move( dev_tasks.front() ); // Move task to local scope - dev_tasks.pop(); // Remove from queue - - // Execute task - timer.time_op_accumulate( "XCIntegrator.DeviceWork", [&]() { - device_execute_shellbatched( timer, weight_alg, state, func, basis, mol, - meta, device_data, P, VXC, EXC, NEL, - batch_task ); - }); - - - }; - - std::future dev_future; - while( task_begin != local_work_end ) { - - // Generate task - dev_tasks.emplace( generate_dev_batch( nbf_threshold, task_begin, - local_work_end, basis, timer ) ); - - if( not dev_future.valid() ) { - dev_future = std::async( std::launch::async, execute_device_task ); - } else { - auto status = dev_future.wait_for( std::chrono::milliseconds(5) ); - if( status == std::future_status::ready ) { - dev_future.get(); - dev_future = std::async( std::launch::async, execute_device_task ); - } - } - - // Update task iterator for next set of batches - task_begin = dev_tasks.back().task_end; - - } - - - if( dev_future.valid() ) dev_future.wait(); - - // TODO: Try to merge tasks if possible - //for( auto _task_it = dev_tasks.begin(); _task_it != dev_tasks.end()-1; ++_task_it ) { - // const auto& shell_list = _task_it->union_shell_list; - // auto task_nbe = basis.nbf_subset( shell_list.begin(), shell_list.end() ); - // auto _merge_it = 
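// The driver above overlaps batch generation on the host with execution of
// already-generated batches by polling a std::future and re-launching
// std::async whenever the previous execution has finished. A skeleton of that
// loop; the int batches and the printed "work" are placeholders, and a mutex
// is added around the shared queue for safety in this simplified form.
#include <chrono>
#include <future>
#include <iostream>
#include <mutex>
#include <queue>

std::queue<int> batches;       // stand-in for the dev_ex_task queue
std::mutex      batches_mtx;

void execute_one() {
  int b;
  {
    std::lock_guard<std::mutex> lk(batches_mtx);
    if (batches.empty()) return;
    b = batches.front();
    batches.pop();
  }
  std::cout << "executing batch " << b << "\n";   // device work would go here
}

int main() {
  std::future<void> fut;
  for (int next = 0; next < 8; ++next) {          // "generate" eight batches
    { std::lock_guard<std::mutex> lk(batches_mtx); batches.push(next); }
    if (!fut.valid()) {
      fut = std::async(std::launch::async, execute_one);
    } else if (fut.wait_for(std::chrono::milliseconds(5)) ==
               std::future_status::ready) {
      fut.get();                                  // retire the finished run
      fut = std::async(std::launch::async, execute_one);
    }
  }
  if (fut.valid()) fut.get();                     // wait for the last async run
  while (!batches.empty()) execute_one();         // drain the rest serially
}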
_task_it + 1; - // while( task_nbe <= nbf_threshold and _merge_it != dev_tasks.end() ) { - // _merge_it = std::find_if( _merge_it, dev_tasks.end(), [&]( const auto& t ) { - // const auto& local_shell_list - // } ); - // } - //} - - while( not dev_tasks.empty() ) { - // Execute remaining tasks - execute_device_task(); - } - - - -#endif - -} - - -#define CUDA_IMPL( F, ND ) \ -template \ -void local_work_replicated_shellbatched_exc_vxc_impl(\ - util::Timer& timer,\ - XCWeightAlg weight_alg,\ - XCIntegratorState state,\ - const functional_type& func,\ - const BasisSet& basis,\ - const Molecule & mol,\ - const MolMeta & meta,\ - XCDeviceData & device_data,\ - host_task_iterator local_work_begin,\ - host_task_iterator local_work_end,\ - const F* P,\ - F* VXC,\ - F* exc,\ - F* n_el\ -) - -CUDA_IMPL( double, 0 ); -CUDA_IMPL( double, 1 ); - -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/local_work_replicated_shellbatched_exc_vxc.hpp b/third_party/gauxc/attic/src/new_integrator/device/local_work_replicated_shellbatched_exc_vxc.hpp deleted file mode 100644 index e3e24c7..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/local_work_replicated_shellbatched_exc_vxc.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include - -#include - -#include - -#include "device/xc_device_data.hpp" - -namespace GauXC::integrator::device { - -using host_task_iterator = std::vector::iterator; - -template -void local_work_replicated_shellbatched_exc_vxc_impl( - util::Timer& timer, - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCDeviceData & device_data, - host_task_iterator local_work_begin, - host_task_iterator local_work_end, - const F* P, - F* VXC, - F* exc, - F* n_el -); - -template -void local_work_replicated_shellbatched_exc_vxc_impl( - util::Timer& timer, - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCDeviceData & device_data, - std::vector< XCTask >& tasks, - const F* P, - F* VXC, - F* exc, - F* n_el -) { - - local_work_replicated_shellbatched_exc_vxc_impl( timer, weight_alg, - state, func, basis, mol, meta, device_data, tasks.begin(), tasks.end(), - P, VXC, exc, n_el ); - -} - - - -template -inline void local_work_replicated_shellbatched_exc_vxc( size_t n_deriv, Args&&... args ) { - if( n_deriv == 0 ) - local_work_replicated_shellbatched_exc_vxc_impl( std::forward(args)... ); - else if( n_deriv == 1 ) - local_work_replicated_shellbatched_exc_vxc_impl( std::forward(args)... 
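// The *_impl headers above all funnel the runtime derivative order n_deriv
// (0 = LDA, 1 = GGA) into a compile-time template parameter with a small
// if/else ladder. The pattern on its own, with a placeholder kernel_impl
// standing in for the real work function:
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <utility>

template <typename F, std::size_t NDeriv, typename... Args>
void kernel_impl(Args&&... /*args*/) {
  std::cout << "instantiated with n_deriv = " << NDeriv << "\n";
}

template <typename F, typename... Args>
void kernel(std::size_t n_deriv, Args&&... args) {
  if (n_deriv == 0)
    kernel_impl<F, 0>(std::forward<Args>(args)...);
  else if (n_deriv == 1)
    kernel_impl<F, 1>(std::forward<Args>(args)...);
  else
    throw std::runtime_error("MGGA NYI");   // mirrors the removed code's behaviour
}

int main() { kernel<double>(1); }           // runs the NDeriv == 1 instantiation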
); - else - throw std::runtime_error("MGGA NYI"); -} - - -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/shellbatched_xc_device_exc_vxc.hpp b/third_party/gauxc/attic/src/new_integrator/device/shellbatched_xc_device_exc_vxc.hpp deleted file mode 100644 index 6b9efe6..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/shellbatched_xc_device_exc_vxc.hpp +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include - -#include "device/local_work_replicated_shellbatched_exc_vxc.hpp" -#include -#include -#include "exceptions/magma_exception.hpp" - -namespace GauXC { -namespace detail { - -template -void ShellBatchedXCDeviceIntegrator:: - eval_exc_vxc_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* VXC, int64_t ldvxc, - value_type* EXC ) { - -#ifdef GAUXC_ENABLE_MAGMA - // Initialize MAGMA - { - auto ierr = magma_init(); - GAUXC_MAGMA_ERROR( "MAGMA Init Failed", ierr ); - } -#endif - - util::unused(m,n,ldp,ldvxc); - - size_t nbf = this->basis_->nbf(); - - //// TODO: Check that P is sane - - - // Generate Tasks - auto& tasks = this->load_balancer_->get_tasks(); - size_t n_deriv = this->func_->is_gga() ? 1 : 0; - - // Allocate Memory - auto device_data = this->timer_.time_op("XCIntegrator.DeviceAlloc",[&]() { - return GauXC::integrator::device::make_device_data(); - }); - - value_type N_EL; - - // Compute Local contributions to EXC / VXC - this->timer_.time_op("XCIntegrator.LocalWork", [&](){ - GauXC::integrator::device::local_work_replicated_shellbatched_exc_vxc< value_type >( - n_deriv, this->timer_, XCWeightAlg::SSF, state_, *this->func_, - *this->basis_, this->load_balancer_->molecule(), - this->load_balancer_->molmeta(), *device_data, tasks, P, - VXC, EXC, &N_EL - ); - }); - - // Update State of Integrator - state_.load_balancer_populated = true; - //state_.modified_weights_are_stored = true; - - -#ifdef GAUXC_ENABLE_MPI - - int world_size; - MPI_Comm_size( this->comm_, &world_size ); - - if( world_size > 1 ) { - - this->timer_.time_op("XCIntegrator.Allreduce", [&](){ - // Test of communicator is an inter-communicator - // XXX: Can't think of a case when this would be true, but who knows... 
- int inter_flag; - MPI_Comm_test_inter( this->comm_, &inter_flag ); - - // Is Intra-communicator, Allreduce can be done inplace - if( not inter_flag ) { - - MPI_Allreduce( MPI_IN_PLACE, VXC, nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - // Isn't Intra-communicator (weird), Allreduce can't be done inplace - } else { - - std::allocator alloc; - auto VXC_cpy = alloc.allocate( nbf*nbf ); - value_type EXC_cpy = *EXC, N_EL_cpy = N_EL; - - MPI_Allreduce( VXC_cpy, VXC, nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( &EXC_cpy, EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( &N_EL_cpy, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - - } - }); - - } - -#endif - -#ifdef GAUXC_ENABLE_MAGMA - // Finalize MAGMA - { - auto ierr = magma_finalize(); - GAUXC_MAGMA_ERROR( "MAGMA Finalize Failed", ierr ); - } -#endif - -} - -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/device/shellbatched_xc_device_integrator.cxx b/third_party/gauxc/attic/src/new_integrator/device/shellbatched_xc_device_integrator.cxx deleted file mode 100644 index 782a387..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/shellbatched_xc_device_integrator.cxx +++ /dev/null @@ -1,27 +0,0 @@ -#include - -#include "device/shellbatched_xc_device_exc_vxc.hpp" - -namespace GauXC { -namespace detail { - -template -ShellBatchedXCDeviceIntegrator:: - ShellBatchedXCDeviceIntegrator( const ShellBatchedXCDeviceIntegrator& ) = default; - -template -ShellBatchedXCDeviceIntegrator:: - ShellBatchedXCDeviceIntegrator( ShellBatchedXCDeviceIntegrator&& ) noexcept = default; - -template -ShellBatchedXCDeviceIntegrator:: - ~ShellBatchedXCDeviceIntegrator() noexcept = default; - - - - - -template class ShellBatchedXCDeviceIntegrator; - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/device/xc_device_data.hpp b/third_party/gauxc/attic/src/new_integrator/device/xc_device_data.hpp deleted file mode 100644 index d299d63..0000000 --- a/third_party/gauxc/attic/src/new_integrator/device/xc_device_data.hpp +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include -#include - -namespace GauXC { - -template -class XCDeviceData { - -public: - - virtual void allocate_static_data( size_t _natoms, - size_t _n_deriv, - size_t _nbf, - size_t _nshells ) = 0; - - virtual ~XCDeviceData() noexcept = default; - -}; - -namespace integrator::device { - - template - std::shared_ptr> make_device_data(); - - extern template std::shared_ptr> make_device_data(); - -} - - -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/blas.cxx b/third_party/gauxc/attic/src/new_integrator/host/blas.cxx deleted file mode 100644 index b7126e4..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/blas.cxx +++ /dev/null @@ -1,214 +0,0 @@ -#include "host/blas.hpp" -#include -#include - -extern "C" { - -//void dlacpy_( const char* UPLO, const int* M, const int* N, const double* A, -// const int* LDA, double* B, const int* LDB ); -//void slacpy_( const char* UPLO, const int* M, const int* N, const float* A, -// const int* LDA, float* B, const int* LDB ); - -void dgemm_( const char* TA, const char* TB, const int* M, const int* N, - const int* K, const double* ALPHA, const double* A, - const int* LDA, const double* B, const int* LDB, - const double* BETA, double* C, const int* LDC ); -void sgemm_( const char* TA, const char* TB, const int* M, const int* N, - const 
int* K, const float* ALPHA, const float* A, - const int* LDA, const float* B, const int* LDB, - const float* BETA, float* C, const int* LDC ); - -void dsyr2k_( const char* UPLO, const char* TRANS, const int* N, const int* K, - const double* ALPHA, const double* A, const int* LDA, const double* B, - const int* LDB, const double* BETA, double* C, const int* LDC ); -void ssyr2k_( const char* UPLO, const char* TRANS, const int* N, const int* K, - const float* ALPHA, const float* A, const int* LDA, const float* B, - const int* LDB, const float* BETA, float* C, const int* LDC ); - -double ddot_( const int* N, const double* X, const int* INCX, const double* Y, - const int* INCY ); -float sdot_( const int* N, const float* X, const int* INCX, const float* Y, - const int* INCY ); - - -void daxpy_( const int* N, const double* ALPHA, const double* A, const int* INCX, - double* Y, const int* INCY ); -void saxpy_( const int* N, const float* ALPHA, const float* A, const int* INCX, - float* Y, const int* INCY ); - -void dscal_( const int* N, const double* ALPHA, const double* X, const int* INCX ); -void sscal_( const int* N, const float* ALPHA, const float* X, const int* INCX ); -} - -namespace GauXC::blas { - -template -void lacpy( char UPLO, int M, int N, const T* A, int LDA, T* B, - int LDB ) { - -/* - if constexpr ( std::is_same_v ) - slacpy_( &UPLO, &M, &N, A, &LDA, B, &LDB ); - else if constexpr ( std::is_same_v ) - dlacpy_( &UPLO, &M, &N, A, &LDA, B, &LDB ); - else throw std::runtime_error("LACPY NYI"); -*/ - - if( UPLO == 'L' ) { - - for( int j = 0; j < N; ++j ) - for( int i = j; i < M; ++i ) - B[i + j*LDB] = A[i + j*LDA]; - - } else if( UPLO == 'U' ) { - - for( int j = 0; j < N; ++j ) - for( int i = 0; i <= j; ++i ) - B[i + j*LDB] = A[i + j*LDA]; - - } else { - - for( int j = 0; j < N; ++j ) - for( int i = 0; i < M; ++i ) - B[i + j*LDB] = A[i + j*LDA]; - - } - -} - -template void lacpy( char UPLO, int M, int N, const float* A, int LDA, - float* B, int LDB ); -template void lacpy( char UPLO, int M, int N, const double* A, int LDA, - double* B, int LDB ); - - - - - - - - - -template -void gemm( char TA, char TB, int M, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, T BETA, - T* C, int LDC ) { - - - if constexpr ( std::is_same_v ) - sgemm_( &TA, &TB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC ); - else if constexpr ( std::is_same_v ) - dgemm_( &TA, &TB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC ); - else throw std::runtime_error("GEMM NYI"); - - -} -template -void gemm( char floatA, char floatB, int M, int N, int K, float ALPHA, - const float* A, int LDA, const float* B, int LDB, float BETA, - float* C, int LDC ); -template -void gemm( char doubleA, char doubleB, int M, int N, int K, double ALPHA, - const double* A, int LDA, const double* B, int LDB, double BETA, - double* C, int LDC ); - - - - - - - -template -void syr2k( char UPLO, char TRANS, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, T BETA, - T* C, int LDC ) { - - - if constexpr ( std::is_same_v ) - ssyr2k_( &UPLO, &TRANS, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC ); - else if constexpr ( std::is_same_v ) - dsyr2k_( &UPLO, &TRANS, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC ); - else throw std::runtime_error("SYR2K NYI"); - - -} - -template -void syr2k( char UPLO, char floatRANS, int N, int K, float ALPHA, - const float* A, int LDA, const float* B, int LDB, float BETA, - float* C, int LDC ); -template -void syr2k( char UPLO, char doubleRANS, int N, int K, double ALPHA, 
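// blas.cxx above wraps the Fortran BLAS entry points once and picks the
// single- or double-precision symbol at compile time with `if constexpr`.
// The same pattern for dot, assuming a Fortran BLAS is linked (e.g. -lblas);
// the free function dot here is a sketch, not the GauXC::blas signature.
#include <type_traits>

extern "C" {
double ddot_(const int* N, const double* X, const int* INCX,
             const double* Y, const int* INCY);
float  sdot_(const int* N, const float* X, const int* INCX,
             const float* Y, const int* INCY);
}

template <typename T>
T dot(int N, const T* X, int INCX, const T* Y, int INCY) {
  static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                "dot only wraps the float/double kernels");
  if constexpr (std::is_same_v<T, float>)
    return sdot_(&N, X, &INCX, Y, &INCY);
  else
    return ddot_(&N, X, &INCX, Y, &INCY);
}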
- const double* A, int LDA, const double* B, int LDB, double BETA, - double* C, int LDC ); - - - - - - - -template -T dot( int N, const T* X, int INCX, const T* Y, int INCY ) { - - if constexpr ( std::is_same_v ) - return sdot_(&N, X, &INCX, Y, &INCY); - else if constexpr ( std::is_same_v ) - return ddot_(&N, X, &INCX, Y, &INCY); - else throw std::runtime_error("DOT NYI"); - - return 0.; -} - -template -float dot( int N, const float* X, int INCX, const float* Y, int INCY ); -template -double dot( int N, const double* X, int INCX, const double* Y, int INCY ); - - - - - - -template -void axpy( int N, T ALPHA, const T* X, int INCX, T* Y, int INCY ) { - - if constexpr ( std::is_same_v ) - saxpy_(&N, &ALPHA, X, &INCX, Y, &INCY ); - else if constexpr ( std::is_same_v ) - daxpy_(&N, &ALPHA, X, &INCX, Y, &INCY ); - else throw std::runtime_error("AXPY NYI"); - -} - -template -void axpy( int N, float ALPHA, const float* A, int INCX, float* Y, - int INCY ); -template -void axpy( int N, double ALPHA, const double* A, int INCX, double* Y, - int INCY ); - - - - - - -template -void scal( int N, T ALPHA, T* X, int INCX ) { - - if constexpr ( std::is_same_v ) - sscal_(&N, &ALPHA, X, &INCX ); - else if constexpr ( std::is_same_v ) - dscal_(&N, &ALPHA, X, &INCX ); - else throw std::runtime_error("SCAL NYI"); - -} - -template -void scal( int N, float ALPHA, float* X, int INCX ); -template -void scal( int N, double ALPHA, double* X, int INCX ); - -} - - diff --git a/third_party/gauxc/attic/src/new_integrator/host/blas.hpp b/third_party/gauxc/attic/src/new_integrator/host/blas.hpp deleted file mode 100644 index add036a..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/blas.hpp +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once -#include - -namespace GauXC::blas { - -template -void lacpy( char UPLO, int M, int N, const T* A, int LDA, T* B, - int LDB ); - -template -void gemm( char TA, char TB, int M, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, T BETA, - T* C, int LDC ); - -template -void syr2k( char UPLO, char TRANS, int N, int K, T ALPHA, - const T* A, int LDA, const T* B, int LDB, T BETA, - T* C, int LDC ); - - -template -T dot( int N, const T* X, int INCX, const T* Y, int INCY ); - -template -void axpy( int N, T ALPHA, const T* X, int INCX, T* Y, int INCY ); - -template -void scal( int N, T ALPHA, T* X, int INCX ); - -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/gauxc-host.cmake b/third_party/gauxc/attic/src/new_integrator/host/gauxc-host.cmake deleted file mode 100644 index 84cb2ab..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/gauxc-host.cmake +++ /dev/null @@ -1,23 +0,0 @@ -find_package( LAPACK REQUIRED ) -include( gauxc-gau2grid ) -target_sources( gauxc PRIVATE - # Common Host Utilities - host/host_weights.cxx - host/host_collocation.cxx - host/blas.cxx - - # XC Specific - host/host_exc_vxc_zmat.cxx - host/local_work_replicated_exc_vxc.cxx - - # Interfaces - host/reference_xc_host_integrator.cxx -) - -target_link_libraries( gauxc PUBLIC LAPACK::LAPACK ) - -if( GAUXC_ENABLE_GAU2GRID ) - target_link_libraries( gauxc PUBLIC gau2grid::gg ) -endif() - - diff --git a/third_party/gauxc/attic/src/new_integrator/host/host_collocation.cxx b/third_party/gauxc/attic/src/new_integrator/host/host_collocation.cxx deleted file mode 100644 index 8edf654..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/host_collocation.cxx +++ /dev/null @@ -1,137 +0,0 @@ -#include "host/host_collocation.hpp" - - -#ifdef GAUXC_ENABLE_GAU2GRID - #include 
"gau2grid/gau2grid.h" -#else - #include "collocation/collocation_angular_cartesian.hpp" - #include "collocation/collocation_angular_spherical_unnorm.hpp" - #include "collocation/collocation_radial.hpp" -#endif - -namespace GauXC::integrator::host { - -void eval_collocation( size_t npts, - size_t nshells, - size_t nbe, - const double* points, - const BasisSet& basis, - const int32_t* shell_mask, - double* basis_eval ) { - -#ifdef GAUXC_ENABLE_GAU2GRID - - std::allocator a; - auto* rv = a.allocate( npts * nbe ); - - size_t ncomp = 0; - for( size_t i = 0; i < nshells; ++i ) { - - const auto& sh = basis.at(shell_mask[i]); - int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA; - gg_collocation( sh.l(), npts, points, 3, sh.nprim(), sh.coeff_data(), - sh.alpha_data(), sh.O_data(), order, rv + ncomp*npts ); - - ncomp += sh.size(); - - } - - gg_fast_transpose( ncomp, npts, rv, basis_eval ); - a.deallocate( rv, npts*nbe ); - -#else - - for( size_t ipt = 0; ipt < npts; ++ipt ) - for( size_t i = 0; i < nshells; ++i ) { - - const auto ish = shell_mask[i]; - const auto& sh = basis.at(ish); - auto* eval = basis_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - - double x,y,z, bf; - integrator::cuda::collocation_device_radial_eval( sh, points + 3*ipt, - &x, &y, &z, &bf ); - - if( sh.pure() ) - integrator::cuda::collocation_spherical_unnorm_angular( sh.l(), bf, x, y, z, - eval ); - else - integrator::cuda::collocation_cartesian_angular( sh.l(), bf, x, y, z, eval ); - - - } - -#endif - -} - -void eval_collocation_deriv1( size_t npts, - size_t nshells, - size_t nbe, - const double* points, - const BasisSet& basis, - const int32_t* shell_mask, - double* basis_eval, - double* dbasis_x_eval, - double* dbasis_y_eval, - double* dbasis_z_eval ) { - -#ifdef GAUXC_ENABLE_GAU2GRID - - std::allocator a; - auto* rv = a.allocate( 4 * npts * nbe ); - auto* rv_x = rv + npts * nbe; - auto* rv_y = rv_x + npts * nbe; - auto* rv_z = rv_y + npts * nbe; - - size_t ncomp = 0; - for( size_t i = 0; i < nshells; ++i ) { - - const auto& sh = basis.at(shell_mask[i]); - int order = sh.pure() ? 
GG_SPHERICAL_CCA : GG_CARTESIAN_CCA; - gg_collocation_deriv1( sh.l(), npts, points, 3, sh.nprim(), sh.coeff_data(), - sh.alpha_data(), sh.O_data(), order, rv + ncomp*npts, - rv_x + ncomp*npts, rv_y + ncomp*npts, rv_z + ncomp*npts ); - - ncomp += sh.size(); - - } - - gg_fast_transpose( ncomp, npts, rv, basis_eval ); - gg_fast_transpose( ncomp, npts, rv_x, dbasis_x_eval ); - gg_fast_transpose( ncomp, npts, rv_y, dbasis_y_eval ); - gg_fast_transpose( ncomp, npts, rv_z, dbasis_z_eval ); - - a.deallocate( rv, 4*npts*nbe ); - -#else - - for( size_t ipt = 0; ipt < npts; ++ipt ) - for( size_t i = 0; i < nshells; ++i ) { - - const auto ish = shell_mask[i]; - const auto& sh = basis.at(ish); - auto* eval = basis_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - auto* deval_x = dbasis_x_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - auto* deval_y = dbasis_y_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - auto* deval_z = dbasis_z_eval + ipt*nbe + basis.shell_to_first_ao( ish ); - - double x,y,z, bf, dbf_x, dbf_y, dbf_z; - integrator::cuda::collocation_device_radial_eval_deriv1( sh, points + 3*ipt, - &x, &y, &z, &bf, &dbf_x, - &dbf_y, &dbf_z); - - if( sh.pure() ) - integrator::cuda::collocation_spherical_unnorm_angular_deriv1( - sh.l(), bf, dbf_x, dbf_y, dbf_z, x, y, z, eval, deval_x, deval_y, deval_z ); - else - integrator::cuda::collocation_cartesian_angular_deriv1( - sh.l(), bf, dbf_x, dbf_y, dbf_z, x, y, z, eval, deval_x, deval_y, deval_z ); - - } - -#endif -} - - -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/host_collocation.hpp b/third_party/gauxc/attic/src/new_integrator/host/host_collocation.hpp deleted file mode 100644 index 536ba26..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/host_collocation.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include - -namespace GauXC::integrator::host { - -void eval_collocation( size_t npts, - size_t nshells, - size_t nbe, - const double* points, - const BasisSet& basis, - const int32_t* shell_mask, - double* basis_eval ); - -void eval_collocation_deriv1( size_t npts, - size_t nshells, - size_t nbe, - const double* points, - const BasisSet& basis, - const int32_t* shell_mask, - double* basis_eval, - double* dbasis_x_eval, - double* dbasis_y_eval, - double* dbasis_z_eval ); - -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/host_exc_vxc_zmat.cxx b/third_party/gauxc/attic/src/new_integrator/host/host_exc_vxc_zmat.cxx deleted file mode 100644 index 3e5d582..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/host_exc_vxc_zmat.cxx +++ /dev/null @@ -1,115 +0,0 @@ -#include "host/host_exc_vxc_zmat.hpp" -#include "host/blas.hpp" - -namespace GauXC { -namespace integrator::host { - -template -void zmat_lda_host( int32_t npts, - int32_t nbf, - const F* vrho, - const F* basis, - F* z_matrix ) { - - GauXC::blas::lacpy( 'A', nbf, npts, basis, nbf, - z_matrix, nbf ); - - for( int32_t i = 0; i < npts; ++i ) { - - auto* z_col = z_matrix + i*nbf; - - const F fact = 0.5 * vrho[i]; - GauXC::blas::scal( nbf, fact, z_col, 1 ); - - } - -} - -template -void zmat_lda_host( int32_t npts, - int32_t nbf, - const float* vrho, - const float* basis, - float* z_matrix ); -template -void zmat_lda_host( int32_t npts, - int32_t nbf, - const double* vrho, - const double* basis, - double* z_matrix ); - - - -template -void zmat_gga_host( int32_t npts, - int32_t nbf, - const F* vrho, - const F* vgamma, - const F* basis, - const F* dbasis_x, - const F* dbasis_y, - const F* dbasis_z, - const F* dden_x, - const F* dden_y, - 
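// host_collocation.cxx above tabulates basis functions (and optionally their
// Cartesian gradients) on the grid, either through gau2grid or a fallback
// radial * angular evaluation. For orientation, the contracted radial part of
// an s shell over a batch of points looks like this; ShellS and eval_s_shell
// are illustrative, not the GauXC Shell type.
#include <cmath>
#include <cstddef>
#include <vector>

struct ShellS {
  std::vector<double> alpha, coeff;  // primitive exponents / contraction coeffs
  double O[3];                       // shell center
};

// basis_eval[ipt] = sum_i coeff[i] * exp(-alpha[i] * |r_ipt - O|^2)
void eval_s_shell(const ShellS& sh, std::size_t npts, const double* points,
                  double* basis_eval) {
  for (std::size_t ipt = 0; ipt < npts; ++ipt) {
    const double dx = points[3 * ipt + 0] - sh.O[0];
    const double dy = points[3 * ipt + 1] - sh.O[1];
    const double dz = points[3 * ipt + 2] - sh.O[2];
    const double r2 = dx * dx + dy * dy + dz * dz;
    double val = 0.0;
    for (std::size_t i = 0; i < sh.alpha.size(); ++i)
      val += sh.coeff[i] * std::exp(-sh.alpha[i] * r2);
    basis_eval[ipt] = val;
  }
}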
const F* dden_z, - F* z_matrix ) { - - GauXC::blas::lacpy( 'A', nbf, npts, basis, nbf, - z_matrix, nbf ); - - for( int32_t i = 0; i < npts; ++i ) { - - const int32_t ioff = i * nbf; - - auto* z_col = z_matrix + ioff; - auto* bf_x_col = dbasis_x + ioff; - auto* bf_y_col = dbasis_y + ioff; - auto* bf_z_col = dbasis_z + ioff; - - const F lda_fact = 0.5 * vrho[i]; - GauXC::blas::scal( nbf, lda_fact, z_col, 1 ); - - const F gga_fact = 2. * vgamma[i]; - const auto x_fact = gga_fact * dden_x[i]; - const auto y_fact = gga_fact * dden_y[i]; - const auto z_fact = gga_fact * dden_z[i]; - - GauXC::blas::axpy( nbf, x_fact, bf_x_col, 1, z_col, 1 ); - GauXC::blas::axpy( nbf, y_fact, bf_y_col, 1, z_col, 1 ); - GauXC::blas::axpy( nbf, z_fact, bf_z_col, 1, z_col, 1 ); - - } - -} - -template -void zmat_gga_host( int32_t npts, - int32_t nbf, - const float* vrho, - const float* vgamma, - const float* basis, - const float* dbasis_x, - const float* dbasis_y, - const float* dbasis_z, - const float* dden_x, - const float* dden_y, - const float* dden_z, - float* z_matrix ); - -template -void zmat_gga_host( int32_t npts, - int32_t nbf, - const double* vrho, - const double* vgamma, - const double* basis, - const double* dbasis_x, - const double* dbasis_y, - const double* dbasis_z, - const double* dden_x, - const double* dden_y, - const double* dden_z, - double* z_matrix ); - -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/host/host_exc_vxc_zmat.hpp b/third_party/gauxc/attic/src/new_integrator/host/host_exc_vxc_zmat.hpp deleted file mode 100644 index ba33541..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/host_exc_vxc_zmat.hpp +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once -#include - -namespace GauXC { -namespace integrator::host { - -template -void zmat_lda_host( int32_t npts, - int32_t nbf, - const F* vrho, - const F* basis, - F* z_matrix ); - -template -void zmat_gga_host( int32_t npts, - int32_t nbf, - const F* vrho, - const F* vgamma, - const F* basis, - const F* dbasis_x, - const F* dbasis_y, - const F* dbasis_z, - const F* dden_x, - const F* dden_y, - const F* dden_z, - F* z_matrix ); - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/host_weights.cxx b/third_party/gauxc/attic/src/new_integrator/host/host_weights.cxx deleted file mode 100644 index d8d4785..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/host_weights.cxx +++ /dev/null @@ -1,205 +0,0 @@ -#include "host/host_weights.hpp" -#include "common/integrator_constants.hpp" - -namespace GauXC::integrator::host { - -void ssf_weights_host( - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -); - -void becke_weights_host( - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -); - -void partition_weights_host( - XCWeightAlg weight_alg, - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -) { - - switch( weight_alg ) { - case XCWeightAlg::Becke: - becke_weights_host( mol, meta, tasks ); - break; - case XCWeightAlg::SSF: - ssf_weights_host( mol, meta, tasks ); - break; - default: - throw std::runtime_error("Weight Alg Not Supported"); - } - -} - -void becke_weights_host( - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -) { - - // Becke partition functions - auto hBecke = [](double x) {return 1.5 * x - 0.5 * x * x * x;}; // Eq. 19 - auto gBecke = [&](double x) {return hBecke(hBecke(hBecke(x)));}; // Eq. 
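// zmat_lda_host / zmat_gga_host above assemble the intermediate Z that later
// enters VXC through a SYR2K. Per grid point i the LDA column is
// 0.5*vrho[i]*phi_i, and the GGA term adds 2*vgamma[i]*(grad rho . grad phi).
// The same assembly written as plain loops (column-major, nbf x npts);
// zmat_gga is an illustrative free function, not the GauXC signature.
#include <cstdint>

void zmat_gga(int32_t npts, int32_t nbf,
              const double* vrho, const double* vgamma,
              const double* basis,                          // nbf x npts
              const double* dbx, const double* dby, const double* dbz,
              const double* ddenx, const double* ddeny, const double* ddenz,
              double* z) {                                  // nbf x npts (out)
  for (int32_t i = 0; i < npts; ++i) {
    const int32_t off      = i * nbf;
    const double  lda_fact = 0.5 * vrho[i];
    const double  gga_fact = 2.0 * vgamma[i];
    for (int32_t mu = 0; mu < nbf; ++mu)
      z[off + mu] = lda_fact * basis[off + mu]
                  + gga_fact * (ddenx[i] * dbx[off + mu] +
                                ddeny[i] * dby[off + mu] +
                                ddenz[i] * dbz[off + mu]);
  }
}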
20 f_3 - - const size_t ntasks = tasks.size(); - const size_t natoms = mol.natoms(); - - const auto& RAB = meta.rab(); - - #pragma omp parallel - { - - std::vector partitionScratch( natoms ); - std::vector atomDist( natoms ); - - #pragma omp for - for( size_t iT = 0; iT < ntasks; ++iT ) - for( size_t i = 0; i < tasks[iT].points.size(); ++i ) { - - auto& task = tasks[iT]; - auto& weight = task.weights[i]; - const auto& point = task.points[i]; - - // Compute distances of each center to point - for(size_t iA = 0; iA < natoms; iA++) { - - const double da_x = point[0] - mol[iA].x; - const double da_y = point[1] - mol[iA].y; - const double da_z = point[2] - mol[iA].z; - - atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); - - } - - // Evaluate unnormalized partition functions - std::fill(partitionScratch.begin(),partitionScratch.end(),1.); - for( size_t iA = 0; iA < natoms; iA++ ) - for( size_t jA = 0; jA < iA; jA++ ){ - const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; - const double g = gBecke(mu); - - partitionScratch[iA] *= 0.5 * (1. - g); - partitionScratch[jA] *= 0.5 * (1. + g); - } - - // Normalization - double sum = 0.; - for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; - - // Update Weights - weight *= partitionScratch[task.iParent] / sum; - - } // Collapsed loop over tasks and points - - } // OMP context - - -} - -void ssf_weights_host( - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -) { - - auto gFrisch = [&](double x) { - const double s_x = x / magic_ssf_factor<>; - const double s_x2 = s_x * s_x; - const double s_x3 = s_x * s_x2; - const double s_x5 = s_x3 * s_x2; - const double s_x7 = s_x5 * s_x2; - - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; - }; - - const size_t ntasks = tasks.size(); - const size_t natoms = mol.natoms(); - - const auto& RAB = meta.rab(); - - #pragma omp parallel - { - - std::vector partitionScratch( natoms ); - std::vector atomDist( natoms ); - - #pragma omp for - for( size_t iT = 0; iT < ntasks; ++iT ) - for( size_t i = 0; i < tasks[iT].points.size(); ++i ) { - - auto& task = tasks[iT]; - auto& weight = task.weights[i]; - const auto& point = task.points[i]; - - const auto dist_cutoff = 0.5 * (1-magic_ssf_factor<>) * task.dist_nearest; - - // Compute dist to parent atom - { - const double da_x = point[0] - mol[task.iParent].x; - const double da_y = point[1] - mol[task.iParent].y; - const double da_z = point[2] - mol[task.iParent].z; - - atomDist[task.iParent] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); - } - - if( atomDist[task.iParent] < dist_cutoff ) continue; // Partition weight = 1 - - // Compute distances of each center to point - for(size_t iA = 0; iA < natoms; iA++) { - - if( iA == (size_t)task.iParent ) continue; - - const double da_x = point[0] - mol[iA].x; - const double da_y = point[1] - mol[iA].y; - const double da_z = point[2] - mol[iA].z; - - atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); - - } - - // Evaluate unnormalized partition functions - std::fill(partitionScratch.begin(),partitionScratch.end(),1.); - for( size_t iA = 0; iA < natoms; iA++ ) - for( size_t jA = 0; jA < iA; jA++ ) - if( partitionScratch[iA] > ssf_weight_tol or - partitionScratch[jA] > ssf_weight_tol ) { - - const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; - - if( mu <= -magic_ssf_factor<> ) { - - partitionScratch[jA] = 0.; - - } else if (mu >= magic_ssf_factor<>) { - - partitionScratch[iA] = 0.; - - } else { - - double g = 0.5 * ( 1. 
- gFrisch(mu) ); - partitionScratch[iA] *= g; - partitionScratch[jA] *= 1. - g; - - } - - } - - // Normalization - double sum = 0.; - for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; - - // Update Weights - weight *= partitionScratch[task.iParent] / sum; - - } // Collapsed loop over tasks and points - - } // OMP context - - -} - -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/host_weights.hpp b/third_party/gauxc/attic/src/new_integrator/host/host_weights.hpp deleted file mode 100644 index 11736de..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/host_weights.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include - -namespace GauXC::integrator::host { - -void partition_weights_host( - XCWeightAlg weight_alg, - const Molecule& mol, - const MolMeta& meta, - std::vector< XCTask >& tasks -); - - -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/local_work_replicated_exc_vxc.cxx b/third_party/gauxc/attic/src/new_integrator/host/local_work_replicated_exc_vxc.cxx deleted file mode 100644 index 84f7fe7..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/local_work_replicated_exc_vxc.cxx +++ /dev/null @@ -1,207 +0,0 @@ -#include "local_work_replicated_exc_vxc.hpp" - -#include "host/host_weights.hpp" -#include "host/host_collocation.hpp" -#include "host/host_exc_vxc_zmat.hpp" -#include "common/integrator_common.hpp" -#include "host/blas.hpp" -#include "host/util.hpp" - -namespace GauXC::integrator::host { - -template -void local_work_replicated_exc_vxc_impl( - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCHostData & host_data, - std::vector< XCTask >& tasks, - const F* P, - F* VXC, - F* exc, - F* n_el -) { - - const int32_t nbf = basis.nbf(); - - auto task_comparator = []( const XCTask& a, const XCTask& b ) { - return (a.points.size() * a.nbe) > (b.points.size() * b.nbe); - }; - std::sort( tasks.begin(), tasks.end(), task_comparator ); - - - if( not state.modified_weights_are_stored ) - partition_weights_host( weight_alg, mol, meta, tasks ); - - - std::fill( VXC, VXC + size_t(nbf)*nbf, F(0.) 
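// becke_weights_host above implements Becke's fuzzy-cell partitioning: for each
// atom pair the switching function g(mu), i.e. h(x) = 1.5x - 0.5x^3 iterated
// three times, splits the point between the two cells, and the parent atom's
// cell value is normalised over all atoms. One point, no OpenMP; Atom and
// becke_weight are illustrative names.
#include <cmath>
#include <cstddef>
#include <vector>

struct Atom { double x, y, z; };

double becke_weight(const double pt[3], std::size_t iParent,
                    const std::vector<Atom>& mol,
                    const std::vector<double>& RAB) {  // natoms x natoms distances
  const std::size_t natoms = mol.size();
  auto h = [](double x) { return 1.5 * x - 0.5 * x * x * x; };
  auto g = [&](double x) { return h(h(h(x))); };

  std::vector<double> dist(natoms), cell(natoms, 1.0);
  for (std::size_t A = 0; A < natoms; ++A)
    dist[A] = std::hypot(pt[0] - mol[A].x, pt[1] - mol[A].y, pt[2] - mol[A].z);

  for (std::size_t A = 0; A < natoms; ++A)
    for (std::size_t B = 0; B < A; ++B) {
      const double mu = (dist[A] - dist[B]) / RAB[B + A * natoms];
      const double s  = g(mu);
      cell[A] *= 0.5 * (1.0 - s);
      cell[B] *= 0.5 * (1.0 + s);
    }

  double sum = 0.0;
  for (double c : cell) sum += c;
  return cell[iParent] / sum;   // factor applied to the quadrature weight
}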
); - *exc = 0.; - - size_t ntasks = tasks.size(); - for( size_t iT = 0; iT < ntasks; ++iT ) { - - auto& task = tasks[iT]; - - const int32_t npts = task.points.size(); - const int32_t nbe = task.nbe; - const int32_t nshells = task.shell_list.size(); - - const F* points = task.points.data()->data(); - const F* weights = task.weights.data(); - const int32_t* shell_list = task.shell_list.data(); - - F* basis_eval = host_data.basis_eval.data(); - F* den_eval = host_data.den_scr.data(); - F* nbe_scr = host_data.nbe_scr.data(); - F* zmat = host_data.zmat.data(); - - F* eps = host_data.eps.data(); - F* gamma = host_data.gamma.data(); - F* vrho = host_data.vrho.data(); - F* vgamma = host_data.vgamma.data(); - - F* dbasis_x_eval = nullptr; - F* dbasis_y_eval = nullptr; - F* dbasis_z_eval = nullptr; - F* dden_x_eval = nullptr; - F* dden_y_eval = nullptr; - F* dden_z_eval = nullptr; - - if( n_deriv > 0 ) { - dbasis_x_eval = basis_eval + npts * nbe; - dbasis_y_eval = dbasis_x_eval + npts * nbe; - dbasis_z_eval = dbasis_y_eval + npts * nbe; - dden_x_eval = den_eval + npts; - dden_y_eval = dden_x_eval + npts; - dden_z_eval = dden_y_eval + npts; - } - - - // Get the submatrix map for batch - auto [submat_map, foo] = gen_compressed_submat_map( basis, task.shell_list, nbf, nbf); - - - // Evaluate Collocation Matrix - if( n_deriv == 1 ) - eval_collocation_deriv1( npts, nshells, nbe, points, basis, shell_list, - basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval ); - else - eval_collocation( npts, nshells, nbe, points, basis, shell_list, basis_eval ); - - - // Extrat Submatrix - const F* den_ptr_use = P; - if( nbe != nbf ) { - detail::submat_set( nbf, nbf, nbe, nbe, P, nbf, nbe_scr, nbe, submat_map ); - den_ptr_use = nbe_scr; - } - - // Z = P * BF - GauXC::blas::gemm( 'N', 'N', nbe, npts, nbe, 1., den_ptr_use, nbe, - basis_eval, nbe, 0., zmat, nbe ); - - - // Evaluate the density - for( int32_t i = 0; i < npts; ++i ) { - - const size_t ioff = size_t(i) * nbe; - const F* zmat_i = zmat + ioff; - - den_eval[i] = - 2. * GauXC::blas::dot( nbe, basis_eval + ioff, 1, zmat_i, 1 ); - - if( n_deriv > 0 ) { - const F dx = - 4. * GauXC::blas::dot( nbe, dbasis_x_eval + ioff, 1, zmat_i, 1 ); - const F dy = - 4. * GauXC::blas::dot( nbe, dbasis_y_eval + ioff, 1, zmat_i, 1 ); - const F dz = - 4. 
* GauXC::blas::dot( nbe, dbasis_z_eval + ioff, 1, zmat_i, 1 ); - - dden_x_eval[i] = dx; - dden_y_eval[i] = dy; - dden_z_eval[i] = dz; - - gamma[i] = dx*dx + dy*dy + dz*dz; - } - - } - - - // Evaluate XC functional - if( func.is_gga() ) - func.eval_exc_vxc( npts, den_eval, gamma, eps, vrho, vgamma ); - else - func.eval_exc_vxc( npts, den_eval, eps, vrho ); - - - // Factor weights into XC results - for( int32_t i = 0; i < npts; ++i ) { - eps[i] *= weights[i]; - vrho[i] *= weights[i]; - } - - if( func.is_gga() ) - for( int32_t i = 0; i < npts; ++i ) vgamma[i] *= weights[i]; - - - - // Scalar integrations - if( n_el ) - for( int32_t i = 0; i < npts; ++i ) *n_el += weights[i] * den_eval[i]; - - for( int32_t i = 0; i < npts; ++i ) *exc += eps[i] * den_eval[i]; - - - // Assemble Z - if( func.is_gga() ) - zmat_gga_host( npts, nbe, vrho, vgamma, basis_eval, dbasis_x_eval, - dbasis_y_eval, dbasis_z_eval, dden_x_eval, dden_y_eval, - dden_z_eval, zmat ); - else - zmat_lda_host( npts, nbe, vrho, basis_eval, zmat ); - - - - // Update VXC XXX: Only LT - GauXC::blas::syr2k( 'L', 'N', nbe, npts, F(1.), basis_eval, - nbe, zmat, nbe, F(0.), nbe_scr, nbe ); - - - detail::inc_by_submat( nbf, nbf, nbe, nbe, VXC, nbf, nbe_scr, nbe, - submat_map ); - } - - // Symmetrize VXC - for( int32_t j = 0; j < nbf; ++j ) - for( int32_t i = j+1; i < nbf; ++i ) - VXC[ j + i*nbf ] = VXC[ i + j*nbf ]; - - -} - -#define HOST_IMPL( F, ND ) \ -template \ -void local_work_replicated_exc_vxc_impl(\ - XCWeightAlg weight_alg,\ - XCIntegratorState state,\ - const functional_type& func,\ - const BasisSet& basis,\ - const Molecule & mol,\ - const MolMeta & meta,\ - XCHostData & host_data,\ - std::vector< XCTask >& local_work,\ - const F* P,\ - F* VXC,\ - F* exc,\ - F* n_el\ -) - -HOST_IMPL( double, 0 ); -HOST_IMPL( double, 1 ); - -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/local_work_replicated_exc_vxc.hpp b/third_party/gauxc/attic/src/new_integrator/host/local_work_replicated_exc_vxc.hpp deleted file mode 100644 index f5a7265..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/local_work_replicated_exc_vxc.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#pragma once - - -#include -#include -#include -#include -#include - -#include - -#include "host/xc_host_data.hpp" - -namespace GauXC::integrator::host { - -template -void local_work_replicated_exc_vxc_impl( - XCWeightAlg weight_alg, - XCIntegratorState state, - const functional_type& func, - const BasisSet& basis, - const Molecule & mol, - const MolMeta & meta, - XCHostData & host_data, - std::vector< XCTask >& tasks, - const F* P, - F* VXC, - F* exc, - F* n_el -); - -template -inline void local_work_replicated_exc_vxc( size_t n_deriv, Args&&... args ) { - if( n_deriv == 0 ) - local_work_replicated_exc_vxc_impl( std::forward(args)... ); - else if( n_deriv == 1 ) - local_work_replicated_exc_vxc_impl( std::forward(args)... 
); - else - throw std::runtime_error("MGGA NYI"); -} - - -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/reference_xc_host_exc_vxc.hpp b/third_party/gauxc/attic/src/new_integrator/host/reference_xc_host_exc_vxc.hpp deleted file mode 100644 index b21abaa..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/reference_xc_host_exc_vxc.hpp +++ /dev/null @@ -1,102 +0,0 @@ -#include - -#include "host/xc_host_data.hpp" -#include "host/local_work_replicated_exc_vxc.hpp" - -namespace GauXC { -namespace detail { - -template -void ReferenceXCHostIntegrator:: - eval_exc_vxc_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* VXC, int64_t ldvxc, - value_type* EXC ) { - - size_t nbf = this->basis_->nbf(); - - //// TODO: Check that P is sane - - - auto& tasks = this->load_balancer_->get_tasks(); - - size_t max_npts = this->load_balancer_->max_npts(); - size_t max_nbe = this->load_balancer_->max_nbe(); - size_t max_npts_x_nbe = this->load_balancer_->max_npts_x_nbe(); - - size_t n_deriv = this->func_->is_gga() ? 1 : 0; - - // Allocate Memory - auto host_data = this->timer_.time_op("XCIntegrator.HostAlloc", - [&](){ - return std::make_shared>( - n_deriv, nbf, max_npts, max_npts_x_nbe - ); - }); - - - value_type N_EL; - - // Compute Local contributions to EXC / VXC - this->timer_.time_op("XCIntegrator.LocalWork", [&](){ - GauXC::integrator::host::local_work_replicated_exc_vxc< value_type >( - n_deriv, XCWeightAlg::SSF, state_, *this->func_, - *this->basis_, this->load_balancer_->molecule(), - this->load_balancer_->molmeta(), *host_data, tasks, P, - VXC, EXC, &N_EL - ); - }); - - // Update State of Integrator - state_.load_balancer_populated = true; - state_.modified_weights_are_stored = true; - - -#ifdef GAUXC_ENABLE_MPI - - int world_size; - MPI_Comm_size( this->comm_, &world_size ); - - if( world_size > 1 ) { - - this->timer_.time_op("XCIntegrator.Allreduce", [&](){ - // Test of communicator is an inter-communicator - // XXX: Can't think of a case when this would be true, but who knows... 
- int inter_flag; - MPI_Comm_test_inter( this->comm_, &inter_flag ); - - // Is Intra-communicator, Allreduce can be done inplace - if( not inter_flag ) { - - MPI_Allreduce( MPI_IN_PLACE, VXC, nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( MPI_IN_PLACE, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - // Isn't Intra-communicator (weird), Allreduce can't be done inplace - } else { - - std::allocator alloc; - auto VXC_cpy = alloc.allocate( nbf*nbf ); - value_type EXC_cpy = *EXC, N_EL_cpy = N_EL; - - MPI_Allreduce( VXC_cpy, VXC, nbf*nbf, MPI_DOUBLE, - MPI_SUM, this->comm_ ); - MPI_Allreduce( &EXC_cpy, EXC, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - MPI_Allreduce( &N_EL_cpy, &N_EL, 1, MPI_DOUBLE, MPI_SUM, this->comm_ ); - - - } - }); - - } - -#endif - - - - -} - -} -} - diff --git a/third_party/gauxc/attic/src/new_integrator/host/reference_xc_host_integrator.cxx b/third_party/gauxc/attic/src/new_integrator/host/reference_xc_host_integrator.cxx deleted file mode 100644 index bf005b0..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/reference_xc_host_integrator.cxx +++ /dev/null @@ -1,27 +0,0 @@ -#include - -#include "host/reference_xc_host_exc_vxc.hpp" - -namespace GauXC { -namespace detail { - -template -ReferenceXCHostIntegrator:: - ReferenceXCHostIntegrator( const ReferenceXCHostIntegrator& ) = default; - -template -ReferenceXCHostIntegrator:: - ReferenceXCHostIntegrator( ReferenceXCHostIntegrator&& ) noexcept = default; - -template -ReferenceXCHostIntegrator:: - ~ReferenceXCHostIntegrator() noexcept = default; - - - - - -template class ReferenceXCHostIntegrator; - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/util.hpp b/third_party/gauxc/attic/src/new_integrator/host/util.hpp deleted file mode 100644 index b23f66f..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/util.hpp +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once -#include "host/blas.hpp" -#include -#include -#include - -namespace GauXC { -namespace detail { - -template -void submat_set(int32_t M, int32_t N, int32_t MSub, - int32_t NSub, _F1 *ABig, int32_t LDAB, _F2 *ASmall, - int32_t LDAS, - std::vector> &submat_map) { - - (void)(M); - (void)(N); - (void)(MSub); - (void)(NSub); - - int32_t i(0); - for( auto& iCut : submat_map ) { - int32_t deltaI = iCut[1]; - int32_t j(0); - for( auto& jCut : submat_map ) { - int32_t deltaJ = jCut[1]; - - auto* ABig_use = ABig + iCut[0] + jCut[0] * LDAB; - auto* ASmall_use = ASmall + i + j * LDAS; - - - GauXC::blas::lacpy( 'A', deltaI, deltaJ, ABig_use, LDAB, - ASmall_use, LDAS ); - - - j += deltaJ; - } - i += deltaI; - } - - -} - -template -void inc_by_submat(int32_t M, int32_t N, int32_t MSub, - int32_t NSub, _F1 *ABig, int32_t LDAB, _F2 *ASmall, - int32_t LDAS, - std::vector> &submat_map) { - - (void)(M); - (void)(N); - (void)(MSub); - (void)(NSub); - - int32_t i(0); - for( auto& iCut : submat_map ) { - int32_t deltaI = iCut[1]; - int32_t j(0); - for( auto& jCut : submat_map ) { - int32_t deltaJ = jCut[1]; - - auto* ABig_use = ABig + iCut[0] + jCut[0] * LDAB; - auto* ASmall_use = ASmall + i + j * LDAS; - - - for( int32_t jj = 0; jj < deltaJ; ++jj ) - for( int32_t ii = 0; ii < deltaI; ++ii ) - ABig_use[ ii + jj * LDAB ] += ASmall_use[ ii + jj * LDAS ]; - - - j += deltaJ; - } - i += deltaI; - } - - -} - -} -} diff --git a/third_party/gauxc/attic/src/new_integrator/host/xc_host_data.hpp b/third_party/gauxc/attic/src/new_integrator/host/xc_host_data.hpp deleted file 
mode 100644 index f2a51c8..0000000 --- a/third_party/gauxc/attic/src/new_integrator/host/xc_host_data.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once -#include -#include - -#include - -namespace GauXC { - -template -struct XCHostData { - - std::vector eps; - std::vector gamma; - std::vector vrho; - std::vector vgamma; - - std::vector zmat; - std::vector nbe_scr; - std::vector den_scr; - std::vector basis_eval; - - - XCHostData( size_t n_deriv, - size_t nbf, - size_t max_npts, - size_t max_npts_x_nbe ) : - eps( max_npts ), - gamma( (n_deriv > 0) * max_npts ), - vrho( max_npts ), - vgamma( (n_deriv > 0) * max_npts ), - zmat( max_npts_x_nbe ), - nbe_scr( nbf * nbf ), - den_scr( (3*n_deriv + 1) * max_npts ), - basis_eval( (3*n_deriv + 1) * max_npts_x_nbe ) { } - - -}; - -} diff --git a/third_party/gauxc/attic/src/new_integrator/replicated/gauxc-replicated.cmake b/third_party/gauxc/attic/src/new_integrator/replicated/gauxc-replicated.cmake deleted file mode 100644 index 1ec274f..0000000 --- a/third_party/gauxc/attic/src/new_integrator/replicated/gauxc-replicated.cmake +++ /dev/null @@ -1,7 +0,0 @@ -# Implementations of generic interfaces -target_sources( gauxc PRIVATE replicated/replicated_xc_integrator_impl.cxx ) - -if( GAUXC_ENABLE_HOST ) - target_sources( gauxc PRIVATE replicated/reference_xc_host_integrator.cxx ) -endif() - diff --git a/third_party/gauxc/attic/src/new_integrator/replicated_xc_integrator_impl.cxx b/third_party/gauxc/attic/src/new_integrator/replicated_xc_integrator_impl.cxx deleted file mode 100644 index aacbed3..0000000 --- a/third_party/gauxc/attic/src/new_integrator/replicated_xc_integrator_impl.cxx +++ /dev/null @@ -1,45 +0,0 @@ -#include - -namespace GauXC { -namespace detail { - -#ifdef GAUXC_ENABLE_MPI - -template -ReplicatedXCIntegratorImpl:: - ReplicatedXCIntegratorImpl( MPI_Comm comm, - std::shared_ptr< functional_type > func, - std::shared_ptr< basis_type > basis, - std::shared_ptr< LoadBalancer > lb ) : - comm_(comm), func_(func), basis_(basis), load_balancer_(lb) { } - -#else - -template -ReplicatedXCIntegratorImpl:: - ReplicatedXCIntegratorImpl( std::shared_ptr< functional_type > func, - std::shared_ptr< basis_type > basis, - std::shared_ptr< LoadBalancer > lb ) : - func_(func), basis_(basis), load_balancer_(lb) { } - -#endif - -template -ReplicatedXCIntegratorImpl:: - ~ReplicatedXCIntegratorImpl() noexcept = default; - - -template -void ReplicatedXCIntegratorImpl:: - eval_exc_vxc( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* VXC, int64_t ldvxc, - value_type* EXC ) { - - eval_exc_vxc_(m,n,P,ldp,VXC,ldvxc,EXC); - -} - -template class ReplicatedXCIntegratorImpl; - -} -} diff --git a/third_party/gauxc/attic/tests/collocation_cuda.hpp b/third_party/gauxc/attic/tests/collocation_cuda.hpp deleted file mode 100644 index 9c654e2..0000000 --- a/third_party/gauxc/attic/tests/collocation_cuda.hpp +++ /dev/null @@ -1,725 +0,0 @@ -#ifdef GAUXC_ENABLE_CUDA -#include "collocation_common.hpp" -#include "exceptions/cuda_exception.hpp" -#include -#include "device/cuda/collocation_device.hpp" - - - - -void test_cuda_collocation_petite( const BasisSet& basis, std::ifstream& in_file) { - - - std::vector ref_data; - - { - cereal::BinaryInputArchive ar( in_file ); - ar( ref_data ); - } - - auto shells_device = util::cuda_malloc>( basis.size() ); - auto offs_device = util::cuda_malloc( basis.size() ); - auto pts_device = util::cuda_malloc( 3 * MAX_NPTS_CHECK ); - auto eval_device = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - - - - cudaStream_t stream 
= 0; - for( auto& d : ref_data ) { - const auto npts = d.pts.size(); - const auto nbf = d.eval.size() / npts; - - const auto& mask = d.mask; - const auto& pts = d.pts; - - util::cuda_copy( 3*npts, pts_device, pts.data()->data() ); - - std::vector offs( mask.size() ); - offs[0] = 0; - for( int i = 1; i < mask.size(); ++i ) - offs[i] = offs[i-1] + basis[mask[i-1]].size(); - util::cuda_copy( offs.size(), offs_device, offs.data() ); - - std::vector> shells; - for( auto idx : mask ) shells.emplace_back(basis[idx]); - util::cuda_copy( shells.size(), shells_device, shells.data() ); - - integrator::cuda::eval_collocation_petite( shells.size(), nbf, npts, - shells_device, offs_device, - pts_device, - eval_device, stream ); - - std::vector eval( nbf * npts ); - - util::cuda_copy( nbf * npts, eval.data(), eval_device ); - - check_collocation_transpose( npts, nbf, d.eval.data(), eval.data() ); - - } - util::cuda_device_sync(); - util::cuda_free(shells_device, offs_device, pts_device, eval_device ); - -} - - - - -void test_cuda_collocation_masked( const BasisSet& basis, std::ifstream& in_file) { - - - - std::vector ref_data; - - { - cereal::BinaryInputArchive ar( in_file ); - ar( ref_data ); - } - - auto shells_device = util::cuda_malloc>( basis.size() ); - auto offs_device = util::cuda_malloc( basis.size() ); - auto mask_device = util::cuda_malloc( basis.size() ); - auto pts_device = util::cuda_malloc( 3 * MAX_NPTS_CHECK ); - auto eval_device = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - - - std::vector> shells( basis ); - util::cuda_copy( basis.size(), shells_device, shells.data() ); - - - cudaStream_t stream = 0; - for( auto& d : ref_data ) { - const auto npts = d.pts.size(); - const auto nbf = d.eval.size() / npts; - - const auto& mask = d.mask; - const auto& pts = d.pts; - - util::cuda_copy( 3*npts, pts_device, pts.data()->data() ); - - std::vector mask_ul( mask.size() ); - std::copy( mask.begin(), mask.end(), mask_ul.begin() ); - util::cuda_copy( mask.size(), mask_device, mask_ul.data() ); - - std::vector offs( mask.size() ); - offs[0] = 0; - for( int i = 1; i < mask.size(); ++i ) - offs[i] = offs[i-1] + basis[mask[i-1]].size(); - util::cuda_copy( offs.size(), offs_device, offs.data() ); - - - integrator::cuda::eval_collocation_masked( mask.size(), nbf, npts, - shells_device, mask_device, - offs_device, pts_device, - eval_device, stream ); - - std::vector eval( nbf * npts ); - - util::cuda_copy( nbf * npts, eval.data(), eval_device ); - - check_collocation_transpose( npts, nbf, d.eval.data(), eval.data() ); - - } - util::cuda_device_sync(); - util::cuda_free(shells_device, offs_device, pts_device, eval_device ); - -} - - - - - - - - - - - - - - - -void test_cuda_collocation_petite_combined( const BasisSet& basis, std::ifstream& in_file) { - - - - std::vector ref_data; - - { - cereal::BinaryInputArchive ar( in_file ); - ar( ref_data ); - } - - - std::vector< cuda::XCTaskDevice > tasks; - - - cudaStream_t stream = 0; - for( auto& d : ref_data ) { - const auto npts = d.pts.size(); - const auto nbf = d.eval.size() / npts; - - const auto& mask = d.mask; - const auto& pts = d.pts; - - /// XXX: THIS DOES NOT POPULATE A VALID TASK, ONLY WHAT's REQUIRED FOR THIS - // TEST - auto& task = tasks.emplace_back(); - task.nbe = nbf; - task.npts = npts; - task.nshells = mask.size(); - - task.points = util::cuda_malloc( 3 * npts ); - task.shell_offs = util::cuda_malloc( mask.size() ); - task.shells = util::cuda_malloc>(mask.size()); - task.bf = util::cuda_malloc( nbf * npts ); - - auto* pts_device = 
task.points; - auto* offs_device = task.shell_offs; - auto* shells_device = task.shells; - - - util::cuda_copy( 3*npts, pts_device, pts.data()->data() ); - - std::vector offs( mask.size() ); - offs[0] = 0; - for( int i = 1; i < mask.size(); ++i ) - offs[i] = offs[i-1] + basis[mask[i-1]].size(); - util::cuda_copy( offs.size(), offs_device, offs.data() ); - - std::vector> shells; - for( auto idx : mask ) shells.emplace_back(basis[idx]); - util::cuda_copy( shells.size(), shells_device, shells.data() ); - - - } - - - const auto nshells_max = std::max_element( tasks.begin(), tasks.end(), - []( const auto& a, const auto& b ) { - return a.nshells < b.nshells; - })->nshells; - - const auto npts_max = std::max_element( tasks.begin(), tasks.end(), - []( const auto& a, const auto& b ) { - return a.npts < b.npts; - })->npts; - - auto* tasks_device = util::cuda_malloc>( tasks.size() ); - util::cuda_copy( tasks.size(), tasks_device, tasks.data() ); - - integrator::cuda::eval_collocation_petite_combined( tasks.size(), npts_max, - nshells_max, tasks_device, stream ); - - util::cuda_device_sync(); - - - for( int i = 0; i < tasks.size(); i++ ) { - - auto* ref_eval = ref_data[i].eval.data(); - std::vector eval (tasks[i].nbe * tasks[i].npts); - util::cuda_copy( eval.size(), eval.data(), tasks[i].bf ); - - check_collocation_transpose( tasks[i].npts, tasks[i].nbe, ref_eval, eval.data() ); - } - - - for( auto& t : tasks ) { - util::cuda_free( t.points, t.shell_offs, t.shells, t.bf ); - } - util::cuda_free( tasks_device ); -} - - -void test_cuda_collocation_masked_combined( const BasisSet& basis, std::ifstream& in_file) { - - - - std::vector ref_data; - - { - cereal::BinaryInputArchive ar( in_file ); - ar( ref_data ); - } - - - std::vector< cuda::XCTaskDevice > tasks; - - auto shells_device = util::cuda_malloc>( basis.size() ); - std::vector> shells( basis ); - util::cuda_copy( basis.size(), shells_device, shells.data() ); - - cudaStream_t stream = 0; - for( auto& d : ref_data ) { - const auto npts = d.pts.size(); - const auto nbf = d.eval.size() / npts; - - const auto& mask = d.mask; - const auto& pts = d.pts; - - /// XXX: THIS DOES NOT POPULATE A VALID TASK, ONLY WHAT's REQUIRED FOR THIS - // TEST - auto& task = tasks.emplace_back(); - task.nbe = nbf; - task.npts = npts; - task.nshells = mask.size(); - - task.points = util::cuda_malloc( 3 * npts ); - task.shell_offs = util::cuda_malloc( mask.size() ); - task.shell_list = util::cuda_malloc( mask.size() ); - task.bf = util::cuda_malloc( nbf * npts ); - - auto* pts_device = task.points; - auto* offs_device = task.shell_offs; - auto* mask_device = task.shell_list; - - - util::cuda_copy( 3*npts, pts_device, pts.data()->data() ); - - std::vector mask_ul( mask.size() ); - std::copy( mask.begin(), mask.end(), mask_ul.begin() ); - util::cuda_copy( mask.size(), mask_device, mask_ul.data() ); - - std::vector offs( mask.size() ); - offs[0] = 0; - for( int i = 1; i < mask.size(); ++i ) - offs[i] = offs[i-1] + basis[mask[i-1]].size(); - util::cuda_copy( offs.size(), offs_device, offs.data() ); - - - } - - - const auto nshells_max = std::max_element( tasks.begin(), tasks.end(), - []( const auto& a, const auto& b ) { - return a.nshells < b.nshells; - })->nshells; - - const auto npts_max = std::max_element( tasks.begin(), tasks.end(), - []( const auto& a, const auto& b ) { - return a.npts < b.npts; - })->npts; - - auto* tasks_device = util::cuda_malloc>( tasks.size() ); - util::cuda_copy( tasks.size(), tasks_device, tasks.data() ); - - 
integrator::cuda::eval_collocation_masked_combined( tasks.size(), npts_max, - nshells_max, shells_device, tasks_device, stream ); - - util::cuda_device_sync(); - - - for( int i = 0; i < tasks.size(); i++ ) { - - auto* ref_eval = ref_data[i].eval.data(); - std::vector eval (tasks[i].nbe * tasks[i].npts); - util::cuda_copy( eval.size(), eval.data(), tasks[i].bf ); - - check_collocation_transpose( tasks[i].npts, tasks[i].nbe, ref_eval, eval.data() ); - } - - - for( auto& t : tasks ) { - util::cuda_free( t.points, t.shell_offs, t.shell_list, t.bf ); - } - util::cuda_free( tasks_device, shells_device ); -} - - - - - - - - - - - - - - - - - - - -void test_cuda_collocation_deriv1_petite( const BasisSet& basis, std::ifstream& in_file) { - - - - std::vector ref_data; - - { - cereal::BinaryInputArchive ar( in_file ); - ar( ref_data ); - } - - auto shells_device = util::cuda_malloc>( basis.size() ); - auto offs_device = util::cuda_malloc( basis.size() ); - auto pts_device = util::cuda_malloc( 3 * MAX_NPTS_CHECK ); - auto eval_device = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - auto deval_device_x = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - auto deval_device_y = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - auto deval_device_z = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - - - - cudaStream_t stream = 0; - for( auto& d : ref_data ) { - const auto npts = d.pts.size(); - const auto nbf = d.eval.size() / npts; - - const auto& mask = d.mask; - const auto& pts = d.pts; - - util::cuda_copy( 3*npts, pts_device, pts.data()->data() ); - - std::vector offs( mask.size() ); - offs[0] = 0; - for( int i = 1; i < mask.size(); ++i ) - offs[i] = offs[i-1] + basis[mask[i-1]].size(); - util::cuda_copy( offs.size(), offs_device, offs.data() ); - - std::vector> shells; - for( auto idx : mask ) shells.emplace_back(basis[idx]); - util::cuda_copy( shells.size(), shells_device, shells.data() ); - - integrator::cuda::eval_collocation_petite_deriv1( shells.size(), nbf, npts, - shells_device, offs_device, - pts_device, - eval_device, deval_device_x, - deval_device_y, deval_device_z, - stream ); - - std::vector eval ( nbf * npts ), - deval_x( nbf * npts ), - deval_y( nbf * npts ), - deval_z( nbf * npts ); - - util::cuda_copy( nbf * npts, eval.data(), eval_device ); - util::cuda_copy( nbf * npts, deval_x.data(), deval_device_x ); - util::cuda_copy( nbf * npts, deval_y.data(), deval_device_y ); - util::cuda_copy( nbf * npts, deval_z.data(), deval_device_z ); - - check_collocation_transpose( npts, nbf, d.eval.data(), eval.data() ); - check_collocation_transpose( npts, nbf, d.deval_x.data(), deval_x.data() ); - check_collocation_transpose( npts, nbf, d.deval_y.data(), deval_y.data() ); - check_collocation_transpose( npts, nbf, d.deval_z.data(), deval_z.data() ); - - } - util::cuda_device_sync(); - util::cuda_free(shells_device, offs_device, pts_device, eval_device, - deval_device_x, deval_device_y, deval_device_z); -} - - - - -void test_cuda_collocation_deriv1_masked( const BasisSet& basis, std::ifstream& in_file) { - - - - std::vector ref_data; - - { - cereal::BinaryInputArchive ar( in_file ); - ar( ref_data ); - } - - auto shells_device = util::cuda_malloc>( basis.size() ); - auto offs_device = util::cuda_malloc( basis.size() ); - auto mask_device = util::cuda_malloc( basis.size() ); - auto pts_device = util::cuda_malloc( 3 * MAX_NPTS_CHECK ); - auto eval_device = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - auto deval_device_x = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - auto 
deval_device_y = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - auto deval_device_z = util::cuda_malloc( basis.nbf() * MAX_NPTS_CHECK ); - - - std::vector> shells( basis ); - util::cuda_copy( basis.size(), shells_device, shells.data() ); - - cudaStream_t stream = 0; - for( auto& d : ref_data ) { - const auto npts = d.pts.size(); - const auto nbf = d.eval.size() / npts; - - const auto& mask = d.mask; - const auto& pts = d.pts; - - util::cuda_copy( 3*npts, pts_device, pts.data()->data() ); - - std::vector mask_ul( mask.size() ); - std::copy( mask.begin(), mask.end(), mask_ul.begin() ); - util::cuda_copy( mask.size(), mask_device, mask_ul.data() ); - - std::vector offs( mask.size() ); - offs[0] = 0; - for( int i = 1; i < mask.size(); ++i ) - offs[i] = offs[i-1] + basis[mask[i-1]].size(); - util::cuda_copy( offs.size(), offs_device, offs.data() ); - - - integrator::cuda::eval_collocation_masked_deriv1( mask.size(), nbf, npts, - shells_device, mask_device, - offs_device, pts_device, - eval_device, deval_device_x, - deval_device_y, deval_device_z, - stream ); - - std::vector eval ( nbf * npts ), - deval_x( nbf * npts ), - deval_y( nbf * npts ), - deval_z( nbf * npts ); - - util::cuda_copy( nbf * npts, eval.data(), eval_device ); - util::cuda_copy( nbf * npts, deval_x.data(), deval_device_x ); - util::cuda_copy( nbf * npts, deval_y.data(), deval_device_y ); - util::cuda_copy( nbf * npts, deval_z.data(), deval_device_z ); - - check_collocation_transpose( npts, nbf, d.eval.data(), eval.data() ); - check_collocation_transpose( npts, nbf, d.deval_x.data(), deval_x.data() ); - check_collocation_transpose( npts, nbf, d.deval_y.data(), deval_y.data() ); - check_collocation_transpose( npts, nbf, d.deval_z.data(), deval_z.data() ); - - } - util::cuda_device_sync(); - util::cuda_free(shells_device, offs_device, pts_device, eval_device, - deval_device_x, deval_device_y, deval_device_z); -} - - - - - - - -void test_cuda_collocation_petite_combined_deriv1( const BasisSet& basis, std::ifstream& in_file) { - - - - std::vector ref_data; - - { - cereal::BinaryInputArchive ar( in_file ); - ar( ref_data ); - } - - - std::vector< cuda::XCTaskDevice > tasks; - - - cudaStream_t stream = 0; - for( auto& d : ref_data ) { - const auto npts = d.pts.size(); - const auto nbf = d.eval.size() / npts; - - const auto& mask = d.mask; - const auto& pts = d.pts; - - /// XXX: THIS DOES NOT POPULATE A VALID TASK, ONLY WHAT's REQUIRED FOR THIS - // TEST - auto& task = tasks.emplace_back(); - task.nbe = nbf; - task.npts = npts; - task.nshells = mask.size(); - - task.points = util::cuda_malloc( 3 * npts ); - task.shell_offs = util::cuda_malloc( mask.size() ); - task.shells = util::cuda_malloc>(mask.size()); - task.bf = util::cuda_malloc( nbf * npts ); - task.dbfx = util::cuda_malloc( nbf * npts ); - task.dbfy = util::cuda_malloc( nbf * npts ); - task.dbfz = util::cuda_malloc( nbf * npts ); - - auto* pts_device = task.points; - auto* offs_device = task.shell_offs; - auto* shells_device = task.shells; - - - util::cuda_copy( 3*npts, pts_device, pts.data()->data() ); - - std::vector offs( mask.size() ); - offs[0] = 0; - for( int i = 1; i < mask.size(); ++i ) - offs[i] = offs[i-1] + basis[mask[i-1]].size(); - util::cuda_copy( offs.size(), offs_device, offs.data() ); - - std::vector> shells; - for( auto idx : mask ) shells.emplace_back(basis[idx]); - util::cuda_copy( shells.size(), shells_device, shells.data() ); - - - } - - - const auto nshells_max = std::max_element( tasks.begin(), tasks.end(), - []( const auto& a, const auto& b ) { 
- return a.nshells < b.nshells; - })->nshells; - - const auto npts_max = std::max_element( tasks.begin(), tasks.end(), - []( const auto& a, const auto& b ) { - return a.npts < b.npts; - })->npts; - - auto* tasks_device = util::cuda_malloc>( tasks.size() ); - util::cuda_copy( tasks.size(), tasks_device, tasks.data() ); - - integrator::cuda::eval_collocation_petite_combined_deriv1( tasks.size(), npts_max, - nshells_max, tasks_device, stream ); - - util::cuda_device_sync(); - - - for( int i = 0; i < tasks.size(); i++ ) { - - auto* ref_eval = ref_data[i].eval.data(); - auto* ref_deval_x = ref_data[i].deval_x.data(); - auto* ref_deval_y = ref_data[i].deval_y.data(); - auto* ref_deval_z = ref_data[i].deval_z.data(); - - std::vector eval (tasks[i].nbe * tasks[i].npts); - std::vector deval_x (tasks[i].nbe * tasks[i].npts); - std::vector deval_y (tasks[i].nbe * tasks[i].npts); - std::vector deval_z (tasks[i].nbe * tasks[i].npts); - - util::cuda_copy( eval.size(), eval.data(), tasks[i].bf ); - util::cuda_copy( eval.size(), deval_x.data(), tasks[i].dbfx ); - util::cuda_copy( eval.size(), deval_y.data(), tasks[i].dbfy ); - util::cuda_copy( eval.size(), deval_z.data(), tasks[i].dbfz ); - - - auto npts = tasks[i].npts; - auto nbe = tasks[i].nbe; - check_collocation_transpose( npts, nbe, ref_eval, eval.data() ); - check_collocation_transpose( npts, nbe, ref_deval_x, deval_x.data() ); - check_collocation_transpose( npts, nbe, ref_deval_y, deval_y.data() ); - check_collocation_transpose( npts, nbe, ref_deval_z, deval_z.data() ); - } - - - for( auto& t : tasks ) { - util::cuda_free( t.points, t.shell_offs, t.shells, t.bf, t.dbfx, t.dbfy, - t.dbfz ); - } - util::cuda_free( tasks_device ); -} - - -void test_cuda_collocation_masked_combined_deriv1( const BasisSet& basis, std::ifstream& in_file) { - - - - std::vector ref_data; - - { - cereal::BinaryInputArchive ar( in_file ); - ar( ref_data ); - } - - - std::vector< cuda::XCTaskDevice > tasks; - - auto shells_device = util::cuda_malloc>( basis.size() ); - std::vector> shells( basis ); - util::cuda_copy( basis.size(), shells_device, shells.data() ); - - cudaStream_t stream = 0; - for( auto& d : ref_data ) { - const auto npts = d.pts.size(); - const auto nbf = d.eval.size() / npts; - - const auto& mask = d.mask; - const auto& pts = d.pts; - - /// XXX: THIS DOES NOT POPULATE A VALID TASK, ONLY WHAT's REQUIRED FOR THIS - // TEST - auto& task = tasks.emplace_back(); - task.nbe = nbf; - task.npts = npts; - task.nshells = mask.size(); - - task.points = util::cuda_malloc( 3 * npts ); - task.shell_offs = util::cuda_malloc( mask.size() ); - task.shell_list = util::cuda_malloc( mask.size() ); - task.bf = util::cuda_malloc( nbf * npts ); - task.dbfx = util::cuda_malloc( nbf * npts ); - task.dbfy = util::cuda_malloc( nbf * npts ); - task.dbfz = util::cuda_malloc( nbf * npts ); - - - auto* pts_device = task.points; - auto* offs_device = task.shell_offs; - auto* mask_device = task.shell_list; - - - util::cuda_copy( 3*npts, pts_device, pts.data()->data() ); - - std::vector mask_ul( mask.size() ); - std::copy( mask.begin(), mask.end(), mask_ul.begin() ); - util::cuda_copy( mask.size(), mask_device, mask_ul.data() ); - - std::vector offs( mask.size() ); - offs[0] = 0; - for( int i = 1; i < mask.size(); ++i ) - offs[i] = offs[i-1] + basis[mask[i-1]].size(); - util::cuda_copy( offs.size(), offs_device, offs.data() ); - - - } - - - const auto nshells_max = std::max_element( tasks.begin(), tasks.end(), - []( const auto& a, const auto& b ) { - return a.nshells < b.nshells; - 
})->nshells; - - const auto npts_max = std::max_element( tasks.begin(), tasks.end(), - []( const auto& a, const auto& b ) { - return a.npts < b.npts; - })->npts; - - auto* tasks_device = util::cuda_malloc>( tasks.size() ); - util::cuda_copy( tasks.size(), tasks_device, tasks.data() ); - - integrator::cuda::eval_collocation_masked_combined_deriv1( tasks.size(), npts_max, - nshells_max, shells_device, tasks_device, stream ); - - util::cuda_device_sync(); - - - for( int i = 0; i < tasks.size(); i++ ) { - - auto* ref_eval = ref_data[i].eval.data(); - auto* ref_deval_x = ref_data[i].deval_x.data(); - auto* ref_deval_y = ref_data[i].deval_y.data(); - auto* ref_deval_z = ref_data[i].deval_z.data(); - - std::vector eval (tasks[i].nbe * tasks[i].npts); - std::vector deval_x (tasks[i].nbe * tasks[i].npts); - std::vector deval_y (tasks[i].nbe * tasks[i].npts); - std::vector deval_z (tasks[i].nbe * tasks[i].npts); - - util::cuda_copy( eval.size(), eval.data(), tasks[i].bf ); - util::cuda_copy( eval.size(), deval_x.data(), tasks[i].dbfx ); - util::cuda_copy( eval.size(), deval_y.data(), tasks[i].dbfy ); - util::cuda_copy( eval.size(), deval_z.data(), tasks[i].dbfz ); - - - auto npts = tasks[i].npts; - auto nbe = tasks[i].nbe; - check_collocation_transpose( npts, nbe, ref_eval, eval.data() ); - check_collocation_transpose( npts, nbe, ref_deval_x, deval_x.data() ); - check_collocation_transpose( npts, nbe, ref_deval_y, deval_y.data() ); - check_collocation_transpose( npts, nbe, ref_deval_z, deval_z.data() ); - } - - - for( auto& t : tasks ) { - util::cuda_free( t.points, t.shell_offs, t.shell_list, t.bf, t.dbfx, t.dbfy, - t.dbfz ); - } - util::cuda_free( tasks_device, shells_device ); -} -#endif // GAUXC_ENABLE_SYCL - diff --git a/third_party/gauxc/cmake/BuildFindCereal.cmake b/third_party/gauxc/cmake/BuildFindCereal.cmake deleted file mode 100644 index f6787d4..0000000 --- a/third_party/gauxc/cmake/BuildFindCereal.cmake +++ /dev/null @@ -1,32 +0,0 @@ -find_package( cereal QUIET ) -if( NOT cereal_FOUND ) - - include( gauxc-dep-versions ) - - message( STATUS "Could not find Cereal... 
Building" ) - message( STATUS "CEREAL REPO = ${GAUXC_CEREAL_REPOSITORY}" ) - message( STATUS "CEREAL REV = ${GAUXC_CEREAL_REVISION}" ) - - FetchContent_Declare( - cereal - GIT_REPOSITORY ${GAUXC_CEREAL_REPOSITORY} - GIT_TAG ${GAUXC_CEREAL_REVISION} - ) - - FetchContent_GetProperties(cereal) - if(NOT cereal_POPULATED) - FetchContent_Populate( cereal ) - add_library( cereal INTERFACE IMPORTED ) - set_target_properties( cereal PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${cereal_SOURCE_DIR}/include" - INTERFACE_COMPILE_DEFINITIONS "CEREAL_THREAD_SAFE=1;GAUXC_HAS_CEREAL=1" - ) - endif() - -else() - - target_compile_definitions( cereal INTERFACE - "CEREAL_THREAD_SAFE=1;GAUXC_HAS_CEREAL=1" - ) - -endif() diff --git a/third_party/gauxc/cmake/gauxc-cereal.cmake b/third_party/gauxc/cmake/gauxc-cereal.cmake deleted file mode 100644 index 5ddbc3b..0000000 --- a/third_party/gauxc/cmake/gauxc-cereal.cmake +++ /dev/null @@ -1 +0,0 @@ -include( BuildFindCereal ) diff --git a/third_party/gauxc/cmake/gauxc-config.cmake.in b/third_party/gauxc/cmake/gauxc-config.cmake.in deleted file mode 100644 index 0a675b9..0000000 --- a/third_party/gauxc/cmake/gauxc-config.cmake.in +++ /dev/null @@ -1,84 +0,0 @@ -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) # Require CMake 3.18+ - -get_filename_component(GauXC_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) - -list(PREPEND CMAKE_MODULE_PATH ${GauXC_CMAKE_DIR} ) -list(PREPEND CMAKE_MODULE_PATH ${GauXC_CMAKE_DIR}/linalg-cmake-modules ) -include(CMakeFindDependencyMacro) - -# Always Required Dependencies -find_dependency( ExchCXX ) -find_dependency( IntegratorXX ) - -set( GAUXC_HAS_HOST @GAUXC_HAS_HOST@ ) -set( GAUXC_HAS_CUDA @GAUXC_HAS_CUDA@ ) -set( GAUXC_HAS_HIP @GAUXC_HAS_HIP@ ) -set( GAUXC_HAS_MAGMA @GAUXC_HAS_MAGMA@ ) -set( GAUXC_HAS_NCCL @GAUXC_HAS_NCCL@ ) -set( GAUXC_HAS_CUTLASS @GAUXC_HAS_CUTLASS@ ) -set( GAUXC_HAS_MPI @GAUXC_HAS_MPI@ ) -set( GAUXC_HAS_OPENMP @GAUXC_HAS_OPENMP@ ) -set( GAUXC_HAS_GAU2GRID @GAUXC_HAS_GAU2GRID@ ) -set( GAUXC_HAS_HDF5 @GAUXC_HAS_HDF5@ ) -set( GAUXC_BLAS_IS_LP64 @GAUXC_BLAS_IS_LP64@ ) -set( GAUXC_HAS_ONEDFT @GAUXC_HAS_ONEDFT@ ) - -# Make sure C / CXX are enabled (former for BLAS discovery) -enable_language(C) -enable_language(CXX) - -if(GAUXC_HAS_OPENMP) - find_dependency( OpenMP ) -endif() - -if( GAUXC_HAS_HOST ) - if(GAUXC_BLAS_IS_LP64) - set( _blas_components lp64 ) - else() - set( _blas_components ilp64 ) - endif() - find_dependency( BLAS COMPONENTS "${_blas_components}") - unset( _blas_components ) -endif() - -if( GAUXC_HAS_CUDA ) - enable_language( CUDA ) - find_dependency( CUDAToolkit @CUDAToolkit_VERSION@ EXACT ) - if( GAUXC_HAS_MAGMA ) - find_dependency( MAGMA ) - endif() - if( GAUXC_HAS_NCCL ) - find_dependency( NCCL ) - endif() -endif() - -if( GAUXC_HAS_MPI ) - find_dependency( MPI ) -endif() - -if( GAUXC_HAS_OPENMP ) - find_dependency( OpenMP ) -endif() - -if( GAUXC_HAS_HDF5 ) - find_dependency( HighFive ) -endif() - -if ( GAUXC_HAS_ONEDFT ) - set(_PREV_CUDA_ARCHS "${CMAKE_CUDA_ARCHITECTURES}") - find_dependency ( Torch ) - if(CMAKE_CUDA_ARCHITECTURES STREQUAL "OFF") - set(CMAKE_CUDA_ARCHITECTURES "${_PREV_CUDA_ARCHS}" CACHE STRING "Restore CUDA archs after Torch override" FORCE) - message(WARNING "Torch set CMAKE_CUDA_ARCHITECTURES to OFF. 
Restored previous value: ${CMAKE_CUDA_ARCHITECTURES}") - endif() - find_dependency ( nlohmann_json ) -endif() - -list(REMOVE_AT CMAKE_MODULE_PATH 0) -list(REMOVE_AT CMAKE_MODULE_PATH 0) - -if(NOT TARGET gauxc::gauxc) - include("${GauXC_CMAKE_DIR}/gauxc-targets.cmake") -endif() - -set(GauXC_LIBRARIES gauxc::gauxc) diff --git a/third_party/gauxc/cmake/gauxc-cub.cmake b/third_party/gauxc/cmake/gauxc-cub.cmake deleted file mode 100644 index e1f8990..0000000 --- a/third_party/gauxc/cmake/gauxc-cub.cmake +++ /dev/null @@ -1,31 +0,0 @@ -if( GAUXC_HAS_CUDA ) - - find_package( CUDAToolkit REQUIRED ) - if( CUDAToolkit_VERSION VERSION_LESS "11.0.0" ) - include( gauxc-dep-versions ) - - message( STATUS "Building Local CUB Installation" ) - message( STATUS "CUB REPO = ${GAUXC_CUB_REPOSITORY}" ) - message( STATUS "CUB REV = ${GAUXC_CUB_REVISION}" ) - - FetchContent_Declare( - cub - GIT_REPOSITORY ${GAUXC_CUB_REPOSITORY} - GIT_TAG ${GAUXC_CUB_REVISION} - ) - - FetchContent_GetProperties( cub ) - if( NOT cub_POPULATED ) - FetchContent_Populate( cub ) - endif() - - add_library( gauxc_cub INTERFACE IMPORTED ) - set_target_properties( gauxc_cub PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES ${cub_SOURCE_DIR} - ) - else() - message( STATUS "Using CUB from CUDAToolkit" ) - message( STATUS " CUDATOOLKIT VERSION = ${CUDAToolkit_VERSION}" ) - endif() - -endif() diff --git a/third_party/gauxc/cmake/gauxc-cutlass.cmake b/third_party/gauxc/cmake/gauxc-cutlass.cmake deleted file mode 100644 index 7020eb2..0000000 --- a/third_party/gauxc/cmake/gauxc-cutlass.cmake +++ /dev/null @@ -1,33 +0,0 @@ -# Check that only CUDA CC 8.0+ is enabled -foreach( cuda_arch ${CMAKE_CUDA_ARCHITECTURES} ) - if( NOT cuda_arch GREATER_EQUAL 80 ) - message(FATAL_ERROR "GauXC Requires CUDA CC >= 8.0 For CUTLASS") - endif() -endforeach() - -include( gauxc-dep-versions ) - -message( STATUS "Building Local CUTLASS Installation" ) -message( STATUS "CUTLASS REPO = ${GAUXC_CUTLASS_REPOSITORY}" ) -message( STATUS "CUTLASS REV = ${GAUXC_CUTLASS_REVISION}" ) - -FetchContent_Declare( - cutlass - GIT_REPOSITORY ${GAUXC_CUTLASS_REPOSITORY} - GIT_TAG ${GAUXC_CUTLASS_REVISION} -) - -FetchContent_GetProperties( cutlass ) -if( NOT cutlass_POPULATED ) - FetchContent_Populate( cutlass ) -endif() - - - -add_library( gauxc_cutlass INTERFACE IMPORTED ) -set_target_properties( gauxc_cutlass PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES - "${cutlass_SOURCE_DIR}/include;${cutlass_SOURCE_DIR}/tools/util/include" -) - -set(GAUXC_HAS_CUTLASS TRUE CACHE BOOL "GauXC has CUTLASS" FORCE) diff --git a/third_party/gauxc/cmake/gauxc-dep-versions.cmake b/third_party/gauxc/cmake/gauxc-dep-versions.cmake deleted file mode 100644 index f5d8d78..0000000 --- a/third_party/gauxc/cmake/gauxc-dep-versions.cmake +++ /dev/null @@ -1,26 +0,0 @@ -set( GAUXC_LINALG_MODULES_REPOSITORY https://github.com/wavefunction91/linalg-cmake-modules.git ) -set( GAUXC_LINALG_MODULES_REVISION 9d2c273a671d6811e9fd432f6a4fa3d915b144b8 ) - -set( GAUXC_CEREAL_REPOSITORY https://github.com/USCiLab/cereal.git ) -set( GAUXC_CEREAL_REVISION v1.3.0 ) - -set ( GAUXC_NLOHMANN_JSON_REPOSITORY https://github.com/nlohmann/json.git ) -set ( GAUXC_NLOHMANN_JSON_REVISION v3.12.0 ) - -set( GAUXC_CUB_REPOSITORY https://github.com/NVIDIA/cub.git ) -set( GAUXC_CUB_REVISION 1.10.0 ) - -set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git ) -set( GAUXC_CUTLASS_REVISION v2.10.0 ) - -set( GAUXC_EXCHCXX_REPOSITORY https://github.com/wavefunction91/ExchCXX.git ) -set( GAUXC_EXCHCXX_REVISION v1.0.0 ) - -set( 
GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git ) -set( GAUXC_GAU2GRID_REVISION v2.0.6 ) - -set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/wavefunction91/IntegratorXX.git ) -set( GAUXC_INTEGRATORXX_REVISION 1369be58d7a3235dac36d75dd964fef058830622 ) - -set( GAUXC_HIGHFIVE_REPOSITORY https://github.com/BlueBrain/HighFive.git ) -set( GAUXC_HIGHFIVE_REVISION 805f0e13d09b47c4b01d40682621904aa3b31bb8 ) \ No newline at end of file diff --git a/third_party/gauxc/cmake/gauxc-eigen3.cmake b/third_party/gauxc/cmake/gauxc-eigen3.cmake deleted file mode 100644 index c775472..0000000 --- a/third_party/gauxc/cmake/gauxc-eigen3.cmake +++ /dev/null @@ -1,25 +0,0 @@ -find_package( Eigen3 CONFIG HINTS ${EIGEN3_ROOT_DIR} ) -if( NOT Eigen3_FOUND ) - - message( STATUS "Could Not Find Eigen3... Building" ) - message( STATUS "EIGEN3 REPO = https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz" ) - #message( STATUS "EIGEN3 REV = " ) - - FetchContent_Declare( - eigen3 - URL https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz - ) - - FetchContent_GetProperties( eigen3 ) - if( NOT eigen3_POPULATED ) - FetchContent_Populate( eigen3 ) - endif() - - #message( FATAL_ERROR "Eigen3 Pull Not Yet Configured" ) - add_library( Eigen3::Eigen INTERFACE IMPORTED ) - set_target_properties( Eigen3::Eigen PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES ${eigen3_SOURCE_DIR} - ) - -endif() - diff --git a/third_party/gauxc/cmake/gauxc-exchcxx.cmake b/third_party/gauxc/cmake/gauxc-exchcxx.cmake deleted file mode 100644 index 412df9b..0000000 --- a/third_party/gauxc/cmake/gauxc-exchcxx.cmake +++ /dev/null @@ -1,35 +0,0 @@ -find_package( ExchCXX QUIET ) -if( NOT ${ExchCXX_FOUND} ) - - include( gauxc-dep-versions ) - - message( STATUS "Could not find ExchCXX... Building" ) - message( STATUS "EXCHCXX REPO = ${GAUXC_EXCHCXX_REPOSITORY}" ) - message( STATUS "EXCHCXX REV = ${GAUXC_EXCHCXX_REVISION}" ) - - set( EXCHCXX_ENABLE_CUDA ${GAUXC_HAS_CUDA} CACHE BOOL "" ) - set( EXCHCXX_ENABLE_HIP ${GAUXC_HAS_HIP} CACHE BOOL "" ) - set( EXCHCXX_ENABLE_TESTS OFF CACHE BOOL "" ) - - FetchContent_Declare( - exchcxx - GIT_REPOSITORY ${GAUXC_EXCHCXX_REPOSITORY} - GIT_TAG ${GAUXC_EXCHCXX_REVISION} - ) - - FetchContent_MakeAvailable( exchcxx ) - - -else() - - if( ${GAUXC_HAS_CUDA} AND NOT ${EXCHCXX_ENABLE_CUDA} ) - message( FATAL_ERROR "GauXC CUDA BINDINGS REQUIRE ExchCXX CUDA Bindings" ) - endif() - - if( ${GAUXC_HAS_HIP} AND NOT ${EXCHCXX_ENABLE_HIP} ) - message( FATAL_ERROR "GauXC HIP BINDINGS REQUIRE ExchCXX HIP Bindings" ) - endif() - -endif() - - diff --git a/third_party/gauxc/cmake/gauxc-gau2grid.cmake b/third_party/gauxc/cmake/gauxc-gau2grid.cmake deleted file mode 100644 index 51db34d..0000000 --- a/third_party/gauxc/cmake/gauxc-gau2grid.cmake +++ /dev/null @@ -1,43 +0,0 @@ -if( GAUXC_ENABLE_GAU2GRID ) - if( NOT TARGET gau2grid::gg ) - - # First try to find the package if target doesn't exist - find_package( gau2grid CONFIG QUIET ) - - if( NOT gau2grid_FOUND ) - - message( STATUS "Could not find Gau2grid... 
Building" ) - - if( GAUXC_FORCE_EXTERNAL_GAU2GRID ) - - include( gauxc-dep-versions ) - - message( STATUS "GAU2GRID REPO = ${GAUXC_GAU2GRID_REPOSITORY}" ) - message( STATUS "GAU2GRID REV = ${GAUXC_GAU2GRID_REVISION}" ) - - FetchContent_Declare( - gau2grid - GIT_REPOSITORY ${GAUXC_GAU2GRID_REPOSITORY} - GIT_TAG ${GAUXC_GAU2GRID_REVISION} - ) - - set( MAX_AM 6 CACHE STRING "" ) - set( DISABLE_PRAGMA ON CACHE BOOL "" ) - FetchContent_MakeAvailable( gau2grid ) - - if( NOT TARGET gau2grid::gg ) - message( STATUS "Something Went Horribly Wrong With Gau2Grid discovery!" ) - endif() - - else() - - message( STATUS "Building Pregenerated Gau2grid" ) - add_subdirectory( ${PROJECT_SOURCE_DIR}/external/gau2grid ${PROJECT_BINARY_DIR}/external/gau2grid ) - - endif() - - endif() # If not discoverable - endif() # If target not present - - set(GAUXC_HAS_GAU2GRID TRUE CACHE BOOL "GauXC has Gau2Grid and will build bindings" FORCE) -endif() # If enabled diff --git a/third_party/gauxc/cmake/gauxc-integratorxx.cmake b/third_party/gauxc/cmake/gauxc-integratorxx.cmake deleted file mode 100644 index b6bbbf0..0000000 --- a/third_party/gauxc/cmake/gauxc-integratorxx.cmake +++ /dev/null @@ -1,21 +0,0 @@ -find_package( IntegratorXX QUIET ) -if( NOT ${IntegratorXX_FOUND} ) - - include( gauxc-dep-versions ) - - message( STATUS "Could not find IntegratorXX... Building" ) - message( STATUS "INTEGRATORXX REPO = ${GAUXC_INTEGRATORXX_REPOSITORY}" ) - message( STATUS "INTEGRATORXX REV = ${GAUXC_INTEGRATORXX_REVISION}" ) - - set( INTEGRATORXX_ENABLE_TESTS OFF CACHE BOOL "" ) - FetchContent_Declare( - integratorxx - GIT_REPOSITORY ${GAUXC_INTEGRATORXX_REPOSITORY} - GIT_TAG ${GAUXC_INTEGRATORXX_REVISION} - ) - - FetchContent_MakeAvailable( integratorxx ) - -endif() - - diff --git a/third_party/gauxc/cmake/gauxc-linalg-modules.cmake b/third_party/gauxc/cmake/gauxc-linalg-modules.cmake deleted file mode 100644 index 69a69a7..0000000 --- a/third_party/gauxc/cmake/gauxc-linalg-modules.cmake +++ /dev/null @@ -1,11 +0,0 @@ -include( FetchContent ) -include( gauxc-dep-versions ) -FetchContent_Declare( linalg-cmake-modules - GIT_REPOSITORY ${GAUXC_LINALG_MODULES_REPOSITORY} - GIT_TAG ${GAUXC_LINALG_MODULES_REVISION} -) -FetchContent_GetProperties( linalg-cmake-modules ) -if( NOT linalg-cmake-modules_POPULATED ) - FetchContent_Populate( linalg-cmake-modules ) - list( PREPEND CMAKE_MODULE_PATH ${linalg-cmake-modules_SOURCE_DIR} ) -endif() diff --git a/third_party/gauxc/cmake/gauxc-onedft.cmake b/third_party/gauxc/cmake/gauxc-onedft.cmake deleted file mode 100644 index 7003d54..0000000 --- a/third_party/gauxc/cmake/gauxc-onedft.cmake +++ /dev/null @@ -1,32 +0,0 @@ -find_package(nlohmann_json) -if( NOT nlohmann_json_FOUND ) - - message( STATUS "Could Not Find nlohmann_json... 
Building" ) - message( STATUS "NLOHMANN_JSON REPO = ${GAUXC_NLOHMANN_JSON_REPOSITORY}" ) - - FetchContent_Declare( - nlohmann_json - GIT_REPOSITORY ${GAUXC_NLOHMANN_JSON_REPOSITORY} - GIT_TAG ${GAUXC_NLOHMANN_JSON_REVISION} - ) - - FetchContent_GetProperties( nlohmann_json ) - if( NOT nlohmann_json_POPULATED ) - FetchContent_Populate( nlohmann_json ) - endif() - - add_library( nlohmann_json::nlohmann_json INTERFACE IMPORTED ) - set_target_properties( nlohmann_json::nlohmann_json PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES ${nlohmann_json_SOURCE_DIR}/include - ) -endif() - -# store and restore CMAKE_CUDA_ARCHITECTURES if Torch clobbers it -set(_PREV_CUDA_ARCHS "${CMAKE_CUDA_ARCHITECTURES}") -find_package(Torch REQUIRED) -if(CMAKE_CUDA_ARCHITECTURES STREQUAL "OFF") - set(CMAKE_CUDA_ARCHITECTURES "${_PREV_CUDA_ARCHS}" CACHE STRING "Restore CUDA archs after Torch override" FORCE) - message(WARNING "Torch set CMAKE_CUDA_ARCHITECTURES to OFF. Restored previous value: ${CMAKE_CUDA_ARCHITECTURES}") -endif() -list(REMOVE_ITEM TORCH_LIBRARIES torch::nvtoolsext) -message(STATUS "Torch libraries without nvtoolsext: ${TORCH_LIBRARIES}") diff --git a/third_party/gauxc/cmake/modules/FindMAGMA.cmake b/third_party/gauxc/cmake/modules/FindMAGMA.cmake deleted file mode 100644 index 8c24d7b..0000000 --- a/third_party/gauxc/cmake/modules/FindMAGMA.cmake +++ /dev/null @@ -1,43 +0,0 @@ -if( NOT DEFINED MAGMA_ROOT_DIR ) - find_package(PkgConfig) - pkg_check_modules( PC_MAGMA magma ) -endif() - -if( NOT MAGMA_INCLUDE_DIR ) -find_path( MAGMA_INCLUDE_DIR magma.h - HINTS ${PC_MAGMA_INCLUDEDIR} - ${PC_MAGMA_INCLUDE_DIRS} - PATHS ${MAGMA_ROOT_DIR} - PATH_SUFFIXES include -) -endif() - -if(NOT MAGMA_LIBRARY) -find_library( MAGMA_LIBRARY NAMES magma - HINTS ${PC_MAGMA_LIBDIR} - ${PC_MAGMA_LIBRARY_DIRS} - PATHS ${MAGMA_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib32 -) -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - MAGMA DEFAULT_MSG - MAGMA_LIBRARY - MAGMA_INCLUDE_DIR -) - -if( MAGMA_FOUND AND NOT TARGET MAGMA::magma ) - - set( MAGMA_INCLUDE_DIRS ${MAGMA_INCLUDE_DIR} ) - set( MAGMA_LIBRARIES ${MAGMA_LIBRARY} ) - - add_library( MAGMA::magma INTERFACE IMPORTED ) - set_target_properties( MAGMA::magma PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${MAGMA_INCLUDE_DIRS}" - INTERFACE_LINK_LIBRARIES "${MAGMA_LIBRARIES}" - ) - -endif() - diff --git a/third_party/gauxc/cmake/modules/FindNCCL.cmake b/third_party/gauxc/cmake/modules/FindNCCL.cmake deleted file mode 100644 index 9474ebb..0000000 --- a/third_party/gauxc/cmake/modules/FindNCCL.cmake +++ /dev/null @@ -1,39 +0,0 @@ -if( NOT DEFINED NCCL_ROOT_DIR ) - find_package(PkgConfig) - pkg_check_modules( PC_NCCL QUIET nccl ) -endif() - -find_path( NCCL_INCLUDE_DIR nccl.h - HINTS ${PC_NCCL_INCLUDEDIR} - ${PC_NCCL_INCLUDE_DIRS} - PATHS ${NCCL_ROOT_DIR} - PATH_SUFFIXES include -) - -find_library( NCCL_LIBRARY NAMES nccl - HINTS ${PC_NCCL_LIBDIR} - ${PC_NCCL_LIBRARY_DIRS} - PATHS ${NCCL_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib32 -) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - NCCL DEFAULT_MSG - NCCL_LIBRARY - NCCL_INCLUDE_DIR -) - -if( NCCL_FOUND AND NOT TARGET NCCL::nccl ) - - set( NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR} ) - set( NCCL_LIBRARIES ${NCCL_LIBRARY} ) - - add_library( NCCL::nccl INTERFACE IMPORTED ) - set_target_properties( NCCL::nccl PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIRS}" - INTERFACE_LINK_LIBRARIES "${NCCL_LIBRARIES}" - ) - -endif() - diff --git 
a/third_party/gauxc/data/onedft_models/lda.fun b/third_party/gauxc/data/onedft_models/lda.fun deleted file mode 100644 index 2b17684..0000000 Binary files a/third_party/gauxc/data/onedft_models/lda.fun and /dev/null differ diff --git a/third_party/gauxc/data/onedft_models/pbe.fun b/third_party/gauxc/data/onedft_models/pbe.fun deleted file mode 100644 index b2228f8..0000000 Binary files a/third_party/gauxc/data/onedft_models/pbe.fun and /dev/null differ diff --git a/third_party/gauxc/data/onedft_models/tpss.fun b/third_party/gauxc/data/onedft_models/tpss.fun deleted file mode 100644 index f2001c8..0000000 Binary files a/third_party/gauxc/data/onedft_models/tpss.fun and /dev/null differ diff --git a/third_party/gauxc/external/gau2grid/CMakeLists.txt b/third_party/gauxc/external/gau2grid/CMakeLists.txt deleted file mode 100644 index ca0638e..0000000 --- a/third_party/gauxc/external/gau2grid/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -# This CMake harness is meant for use with the GauXC library -# and is released under the terms of the 3-clause BSD license - -target_sources( gauxc PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/generated_source/gau2grid_phi.c - ${CMAKE_CURRENT_SOURCE_DIR}/generated_source/gau2grid_orbital.c - ${CMAKE_CURRENT_SOURCE_DIR}/generated_source/gau2grid_deriv1.c - ${CMAKE_CURRENT_SOURCE_DIR}/generated_source/gau2grid_deriv2.c - ${CMAKE_CURRENT_SOURCE_DIR}/generated_source/gau2grid_deriv3.c - ${CMAKE_CURRENT_SOURCE_DIR}/generated_source/gau2grid_transform.c - ${CMAKE_CURRENT_SOURCE_DIR}/generated_source/gau2grid_helper.c ) - - -target_compile_definitions( gauxc PRIVATE $ ) -target_include_directories( gauxc - PRIVATE - $ -) diff --git a/third_party/gauxc/external/gau2grid/LICENSE b/third_party/gauxc/external/gau2grid/LICENSE deleted file mode 100644 index 3eba99f..0000000 --- a/third_party/gauxc/external/gau2grid/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -BSD 3-Clause License - -Copyright (c) 2017, Daniel Smith -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/third_party/gauxc/external/gau2grid/README.txt b/third_party/gauxc/external/gau2grid/README.txt deleted file mode 100644 index d2d5eae..0000000 --- a/third_party/gauxc/external/gau2grid/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -This folder contains pregenerated source for the gau2grid library for gaussian -collocation evaluation. See LICENSE for library specific terms. diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid/gau2grid.h b/third_party/gauxc/external/gau2grid/generated_source/gau2grid/gau2grid.h deleted file mode 100644 index 29f8888..0000000 --- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid/gau2grid.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * BSD 3-Clause License - * - * Copyright (c) 2017, Daniel Smith - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * * Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * This is a Gau2Grid automatically generated C file. 
- * - * More details can found at the following repo: - * https://github.com/dgasmith/gau2grid - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef GAU2GRID_GUARD_H -#define GAU2GRID_GUARD_H - -#include "gau2grid/gau2grid_pragma.h" - -// Order definitions -#define GG_SPHERICAL_CCA 300 -#define GG_SPHERICAL_GAUSSIAN 301 -#define GG_CARTESIAN_CCA 400 -#define GG_CARTESIAN_MOLDEN 401 -// Information helpers -int gg_max_L(); - -int gg_ncomponents(const int L, const int spherical); - -// Fast transposers -void gg_naive_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, double* PRAGMA_RESTRICT output); -void gg_fast_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, double* PRAGMA_RESTRICT output); - -// Fast segment copiers -void block_copy(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, unsigned long is, double* PRAGMA_RESTRICT output, unsigned long os, const int trans); - - -// Orbitals on a grid -void gg_orbitals(int L, const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out); - -// Collocation matrix functions -void gg_collocation(int L, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out); - -void gg_collocation_deriv1(int L, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out); - -void gg_collocation_deriv2(int L, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out); - -void gg_collocation_deriv3(int L, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT 
phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out); - -#ifdef __cplusplus -} -#endif -#endif /* GAU2GRID_GUARD_H */ \ No newline at end of file diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h b/third_party/gauxc/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h deleted file mode 100644 index f603388..0000000 --- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * This is a Gau2Grid automatically generated C file. - * - * More details can found at the following repo: - * https://github.com/dgasmith/gau2grid - */ - - - -// ISOC11 does not seem to be well implemented across platforms and compilers -// This is a collection of macros to change pragmas and function calls as needed for compat. - -#pragma once - - -#if defined(__GG_NO_PRAGMA) - // Turn everything off if there are issues - - #define ALIGNED_MALLOC(alignment, size) malloc(size) - #define ALIGNED_FREE(ptr) free(ptr) - #define ASSUME_ALIGNED(ptr, width) - - #define PRAGMA_VECTORIZE - #define PRAGMA_RESTRICT - -#elif defined(__ICC) || defined(__INTEL_COMPILER) - // pragmas for Intel - - #define ALIGNED_MALLOC(alignment, size) _mm_malloc(size, alignment) - #define ALIGNED_FREE(ptr) _mm_free(ptr) - #define ASSUME_ALIGNED(ptr, width) __assume_aligned(ptr, width) - - #define PRAGMA_VECTORIZE _Pragma("vector") - #define PRAGMA_RESTRICT __restrict__ - -#elif defined(__clang__) && defined(_MSC_VER) - // pragmas for MSVC - - #define ALIGNED_MALLOC(alignment, size) _aligned_malloc(size, alignment) - #define ALIGNED_FREE(ptr) _aligned_free(ptr) - #define ASSUME_ALIGNED(ptr, width) - - #define PRAGMA_VECTORIZE __pragma(loop(ivdep)) - #define PRAGMA_RESTRICT __restrict - -#elif defined(__clang__) - // pragmas for Clang. 
- // Do this before GCC because clang also defines __GNUC__ - - #define ALIGNED_MALLOC(alignment, size) _mm_malloc(size, alignment) - #define ALIGNED_FREE(ptr) _mm_free(ptr) - #define ASSUME_ALIGNED(ptr, width) - - #define PRAGMA_VECTORIZE _Pragma("clang loop vectorize(enable)") - #define PRAGMA_RESTRICT __restrict__ - -#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__APPLE__) - // pragmas for GCC on Darwin (weird aligned alloc not found on Darwin) - - #define ALIGNED_MALLOC(alignment, size) malloc(size) - #define ALIGNED_FREE(ptr) free(ptr) - #define ASSUME_ALIGNED(ptr, width) - - #define PRAGMA_VECTORIZE _Pragma("GCC ivdep") - #define PRAGMA_RESTRICT __restrict__ - -#elif defined(__GNUC__) || defined(__GNUG__) - // pragmas for GCC - - #define ALIGNED_MALLOC(alignment, size) aligned_alloc(alignment, size) - #define ALIGNED_FREE(ptr) free(ptr) - #define ASSUME_ALIGNED(ptr, width) - - #define PRAGMA_VECTORIZE _Pragma("GCC ivdep") - #define PRAGMA_RESTRICT __restrict__ - -#elif defined(_MSC_VER) - // pragmas for MSVC - - #define ALIGNED_MALLOC(alignment, size) _aligned_malloc(size, alignment) - #define ALIGNED_FREE(ptr) _aligned_free(ptr) - #define ASSUME_ALIGNED(ptr, width) - - #define PRAGMA_VECTORIZE __pragma(loop(ivdep)) - #define PRAGMA_RESTRICT __restrict - - -#elif defined(__PGI) - // pragmas for PGI - - #define ALIGNED_MALLOC(alignment, size) aligned_alloc(alignment, size) - #define ALIGNED_FREE(ptr) free(ptr) - #define ASSUME_ALIGNED(ptr, width) - - #define PRAGMA_VECTORIZE _Pragma("ivdep") - #define PRAGMA_RESTRICT __restrict__ - - -#endif \ No newline at end of file diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid/gau2grid_utility.h b/third_party/gauxc/external/gau2grid/generated_source/gau2grid/gau2grid_utility.h deleted file mode 100644 index 3039bb9..0000000 --- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid/gau2grid_utility.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * This is a Gau2Grid automatically generated C file. 
- * - * More details can found at the following repo: - * https://github.com/dgasmith/gau2grid - */ - -// Spherical transformers -void gg_cca_cart_to_spherical_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_sum_L0(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_L1(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_sum_L1(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_L2(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_sum_L2(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_L3(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_sum_L3(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_L4(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_sum_L4(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_L5(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_sum_L5(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_L6(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_cca_cart_to_spherical_sum_L6(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_sum_L0(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_L1(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* 
PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_sum_L1(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_L2(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_sum_L2(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_L3(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_sum_L3(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_L4(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_sum_L4(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_L5(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_sum_L5(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_L6(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical); - -void gg_gaussian_cart_to_spherical_sum_L6(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical); - -void gg_cca_cart_copy_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_sum_L0(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_copy_L1(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_sum_L1(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_copy_L2(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_sum_L2(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT 
cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_copy_L3(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_sum_L3(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_copy_L4(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_sum_L4(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_copy_L5(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_sum_L5(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_copy_L6(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_cca_cart_sum_L6(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_copy_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_sum_L0(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_copy_L1(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_sum_L1(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_copy_L2(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_sum_L2(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_copy_L3(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_sum_L3(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - 
-void gg_molden_cart_copy_L4(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_sum_L4(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_copy_L5(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_sum_L5(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_copy_L6(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - -void gg_molden_cart_sum_L6(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out); - - -// Fast matrix vector block sum -void block_matrix_vector(unsigned long n, unsigned long m, const double* vector, const double* PRAGMA_RESTRICT input, unsigned long is, double* PRAGMA_RESTRICT output); -// Orbital computers -void gg_orbitals_L0(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out); - -void gg_orbitals_L1(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out); - -void gg_orbitals_L2(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out); - -void gg_orbitals_L3(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out); - -void gg_orbitals_L4(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out); - -void gg_orbitals_L5(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long 
xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out); - -void gg_orbitals_L6(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out); - -// Phi computers -void gg_collocation_L0(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out); - -void gg_collocation_L1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out); - -void gg_collocation_L2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out); - -void gg_collocation_L3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out); - -void gg_collocation_L4(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out); - -void gg_collocation_L5(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out); - -void gg_collocation_L6(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out); - -// Phi grad computers -void gg_collocation_L0_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out); - -void gg_collocation_L1_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT 
phi_y_out, double* PRAGMA_RESTRICT phi_z_out); - -void gg_collocation_L2_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out); - -void gg_collocation_L3_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out); - -void gg_collocation_L4_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out); - -void gg_collocation_L5_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out); - -void gg_collocation_L6_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out); - -// Phi Hess computers -void gg_collocation_L0_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out); - -void gg_collocation_L1_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out); - -void gg_collocation_L2_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long 
xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out); - -void gg_collocation_L3_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out); - -void gg_collocation_L4_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out); - -void gg_collocation_L5_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out); - -void gg_collocation_L6_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out); - -// Phi Der3 computers -void gg_collocation_L0_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* 
PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out); - -void gg_collocation_L1_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out); - -void gg_collocation_L2_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out); - -void gg_collocation_L3_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT 
phi_zzz_out); - -void gg_collocation_L4_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out); - -void gg_collocation_L5_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out); - -void gg_collocation_L6_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out); diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_deriv1.c b/third_party/gauxc/external/gau2grid/generated_source/gau2grid_deriv1.c deleted file mode 100644 index 503c165..0000000 --- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_deriv1.c +++ /dev/null @@ -1,2382 +0,0 @@ -/* - * This is a Gau2Grid automatically generated C file. 
- * - * More details can found at the following repo: - * https://github.com/dgasmith/gau2grid - */ - -#include <math.h> -#if defined(__clang__) && defined(_MSC_VER) -#include <malloc.h> -#elif defined __clang__ -#include <mm_malloc.h> -#elif defined _MSC_VER -#include <malloc.h> -#else -#include <stdlib.h> -#endif - -#include "gau2grid/gau2grid.h" -#include "gau2grid/gau2grid_utility.h" -#include "gau2grid/gau2grid_pragma.h" - - - -void gg_collocation_L0_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 1; - const unsigned long nspherical = 1; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 224 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ?
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - phi_out[start + i] = S0[i]; - - // Gradient AM=0 Component=0 - phi_x_out[start + i] = SX; - phi_y_out[start + i] = SY; - phi_z_out[start + i] = SZ; - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - -} - -void gg_collocation_L1_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 3; - const unsigned long nspherical = 3; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 224 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Density AM=1 Component=X - phi_tmp[i] = S0[i] * xc[i]; - - // Gradient AM=1 Component=X - phi_x_tmp[i] = SX * xc[i]; - phi_y_tmp[i] = SY * xc[i]; - phi_z_tmp[i] = SZ * xc[i]; - phi_x_tmp[i] += S0[i]; - - // Density AM=1 Component=Y - phi_tmp[32 + i] = S0[i] * yc[i]; - - // Gradient AM=1 Component=Y - phi_x_tmp[32 + i] = SX * yc[i]; - phi_y_tmp[32 + i] = SY * yc[i]; - phi_z_tmp[32 + i] = SZ * yc[i]; - phi_y_tmp[32 + i] += S0[i]; - - // Density AM=1 Component=Z - phi_tmp[64 + i] = S0[i] * zc[i]; - - // Gradient AM=1 Component=Z - phi_x_tmp[64 + i] = SX * zc[i]; - phi_y_tmp[64 + i] = SY * zc[i]; - phi_z_tmp[64 + i] = SZ * zc[i]; - phi_z_tmp[64 + i] += S0[i]; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_to_spherical_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_gaussian_cart_to_spherical_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_copy_L1(remain, phi_x_tmp, 32, 
(phi_x_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_molden_cart_copy_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - -} - -void gg_collocation_L2_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 6; - const unsigned long nspherical = 5; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 224 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > 
npoints) ? (npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - - // Density AM=2 Component=XX - phi_tmp[i] = S0[i] * xc_pow2; - - // Gradient AM=2 Component=XX - phi_x_tmp[i] = SX * xc_pow2; - phi_y_tmp[i] = SY * xc_pow2; - phi_z_tmp[i] = SZ * xc_pow2; - AX = 2.0 * xc[i]; - phi_x_tmp[i] += S0[i] * AX; - - // Density AM=2 Component=XY - A = xc[i] * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Gradient AM=2 Component=XY - phi_x_tmp[32 + i] = SX * A; - phi_y_tmp[32 + i] = SY * A; - phi_z_tmp[32 + i] = SZ * A; - phi_x_tmp[32 + i] += S0[i] * yc[i]; - phi_y_tmp[32 + i] += S0[i] * xc[i]; - - // Density AM=2 Component=XZ - A = xc[i] * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Gradient AM=2 Component=XZ - phi_x_tmp[64 + i] = SX * A; - phi_y_tmp[64 + i] = SY * A; - phi_z_tmp[64 + i] = SZ * A; - phi_x_tmp[64 + i] += S0[i] * zc[i]; - phi_z_tmp[64 + i] += S0[i] * xc[i]; - - // Density AM=2 Component=YY - phi_tmp[96 + i] = S0[i] * yc_pow2; - - // Gradient AM=2 Component=YY - phi_x_tmp[96 + i] = SX * yc_pow2; - phi_y_tmp[96 + i] = SY * yc_pow2; - phi_z_tmp[96 + i] = SZ * yc_pow2; - AY = 2.0 * yc[i]; - phi_y_tmp[96 + i] += S0[i] * AY; - - // Density AM=2 Component=YZ - A = yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Gradient AM=2 Component=YZ - phi_x_tmp[128 + i] = SX * A; - phi_y_tmp[128 + i] = SY * A; - phi_z_tmp[128 + i] = SZ * A; - phi_y_tmp[128 + i] += S0[i] * zc[i]; - phi_z_tmp[128 + i] += S0[i] * yc[i]; - - // Density AM=2 Component=ZZ - phi_tmp[160 + i] = S0[i] * zc_pow2; - - // Gradient AM=2 Component=ZZ - phi_x_tmp[160 + i] = SX * zc_pow2; - phi_y_tmp[160 + i] = SY * zc_pow2; - phi_z_tmp[160 + i] = SZ * zc_pow2; - AZ = 2.0 * zc[i]; - phi_z_tmp[160 + i] += S0[i] * AZ; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to 
outer temps - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_to_spherical_L2(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_to_spherical_L2(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_to_spherical_L2(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_gaussian_cart_to_spherical_L2(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_gaussian_cart_to_spherical_L2(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_gaussian_cart_to_spherical_L2(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_copy_L2(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_copy_L2(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_copy_L2(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_molden_cart_copy_L2(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_molden_cart_copy_L2(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_molden_cart_copy_L2(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - -} - -void gg_collocation_L3_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 10; - const unsigned long nspherical = 7; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 224 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 320 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 320 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 320 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 320 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - - // Density AM=3 Component=XXX - phi_tmp[i] = S0[i] * xc_pow3; - - // Gradient AM=3 Component=XXX - phi_x_tmp[i] = SX * xc_pow3; - phi_y_tmp[i] = SY * xc_pow3; - phi_z_tmp[i] = SZ * xc_pow3; - AX = 3.0 * xc_pow2; - phi_x_tmp[i] += S0[i] * AX; - - // Density AM=3 Component=XXY - A = xc_pow2 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Gradient AM=3 Component=XXY - phi_x_tmp[32 + i] = SX * A; - phi_y_tmp[32 + i] = SY * A; - phi_z_tmp[32 + i] = SZ * A; - AX = 2.0 * xc[i] * yc[i]; - phi_x_tmp[32 + i] += S0[i] * AX; - phi_y_tmp[32 + i] += S0[i] * xc_pow2; - - // Density AM=3 Component=XXZ - A = xc_pow2 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Gradient AM=3 Component=XXZ - phi_x_tmp[64 + i] = SX * A; - phi_y_tmp[64 + i] = SY * A; - phi_z_tmp[64 + i] = SZ * A; - AX = 2.0 * xc[i] * zc[i]; - phi_x_tmp[64 + i] += S0[i] * AX; - phi_z_tmp[64 + i] += S0[i] * xc_pow2; - - // Density AM=3 Component=XYY - A = xc[i] * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Gradient AM=3 Component=XYY - phi_x_tmp[96 + i] = SX * A; - phi_y_tmp[96 + i] = SY * A; - phi_z_tmp[96 + i] = SZ * A; - phi_x_tmp[96 + i] += S0[i] * yc_pow2; - AY = 2.0 * xc[i] * yc[i]; - phi_y_tmp[96 + i] += S0[i] * AY; - - // Density AM=3 Component=XYZ - A = xc[i] * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Gradient AM=3 Component=XYZ - phi_x_tmp[128 + i] = SX * A; - phi_y_tmp[128 + i] = SY * A; - phi_z_tmp[128 + i] = SZ * A; - AX = yc[i] * zc[i]; - phi_x_tmp[128 + i] += S0[i] * AX; - AY = xc[i] * zc[i]; - phi_y_tmp[128 + i] += S0[i] * AY; - AZ = xc[i] * yc[i]; - phi_z_tmp[128 + i] += S0[i] * AZ; - - // Density AM=3 Component=XZZ - A = 
xc[i] * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Gradient AM=3 Component=XZZ - phi_x_tmp[160 + i] = SX * A; - phi_y_tmp[160 + i] = SY * A; - phi_z_tmp[160 + i] = SZ * A; - phi_x_tmp[160 + i] += S0[i] * zc_pow2; - AZ = 2.0 * xc[i] * zc[i]; - phi_z_tmp[160 + i] += S0[i] * AZ; - - // Density AM=3 Component=YYY - phi_tmp[192 + i] = S0[i] * yc_pow3; - - // Gradient AM=3 Component=YYY - phi_x_tmp[192 + i] = SX * yc_pow3; - phi_y_tmp[192 + i] = SY * yc_pow3; - phi_z_tmp[192 + i] = SZ * yc_pow3; - AY = 3.0 * yc_pow2; - phi_y_tmp[192 + i] += S0[i] * AY; - - // Density AM=3 Component=YYZ - A = yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Gradient AM=3 Component=YYZ - phi_x_tmp[224 + i] = SX * A; - phi_y_tmp[224 + i] = SY * A; - phi_z_tmp[224 + i] = SZ * A; - AY = 2.0 * yc[i] * zc[i]; - phi_y_tmp[224 + i] += S0[i] * AY; - phi_z_tmp[224 + i] += S0[i] * yc_pow2; - - // Density AM=3 Component=YZZ - A = yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Gradient AM=3 Component=YZZ - phi_x_tmp[256 + i] = SX * A; - phi_y_tmp[256 + i] = SY * A; - phi_z_tmp[256 + i] = SZ * A; - phi_y_tmp[256 + i] += S0[i] * zc_pow2; - AZ = 2.0 * yc[i] * zc[i]; - phi_z_tmp[256 + i] += S0[i] * AZ; - - // Density AM=3 Component=ZZZ - phi_tmp[288 + i] = S0[i] * zc_pow3; - - // Gradient AM=3 Component=ZZZ - phi_x_tmp[288 + i] = SX * zc_pow3; - phi_y_tmp[288 + i] = SY * zc_pow3; - phi_z_tmp[288 + i] = SZ * zc_pow3; - AZ = 3.0 * zc_pow2; - phi_z_tmp[288 + i] += S0[i] * AZ; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_to_spherical_L3(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_to_spherical_L3(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_to_spherical_L3(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_gaussian_cart_to_spherical_L3(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_gaussian_cart_to_spherical_L3(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_gaussian_cart_to_spherical_L3(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_copy_L3(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_copy_L3(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_copy_L3(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_molden_cart_copy_L3(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_molden_cart_copy_L3(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_molden_cart_copy_L3(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); 
- ALIGNED_FREE(phi_z_tmp); - -} - -void gg_collocation_L4_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 15; - const unsigned long nspherical = 9; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 224 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 480 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 480 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 480 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 480 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - const double xc_pow4 = xc_pow3 * xc[i]; - const double yc_pow4 = yc_pow3 * yc[i]; - const double zc_pow4 = zc_pow3 * zc[i]; - - - // Density AM=4 Component=XXXX - phi_tmp[i] = S0[i] * xc_pow4; - - // Gradient AM=4 Component=XXXX - phi_x_tmp[i] = SX * xc_pow4; - phi_y_tmp[i] = SY * xc_pow4; - phi_z_tmp[i] = SZ * xc_pow4; - AX = 4.0 * xc_pow3; - phi_x_tmp[i] += S0[i] * AX; - - // Density AM=4 Component=XXXY - A = xc_pow3 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Gradient AM=4 Component=XXXY - phi_x_tmp[32 + i] = SX * A; - phi_y_tmp[32 + i] = SY * A; - phi_z_tmp[32 + i] = SZ * A; - AX = 3.0 * xc_pow2 * yc[i]; - phi_x_tmp[32 + i] += S0[i] * AX; - phi_y_tmp[32 + i] += S0[i] * xc_pow3; - - // Density AM=4 Component=XXXZ - A = xc_pow3 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Gradient AM=4 Component=XXXZ - phi_x_tmp[64 + i] = SX * A; - phi_y_tmp[64 + i] = SY * A; - phi_z_tmp[64 + i] = SZ * A; - AX = 3.0 * xc_pow2 * zc[i]; - phi_x_tmp[64 + i] += S0[i] * AX; - phi_z_tmp[64 + i] += S0[i] * xc_pow3; - - // Density AM=4 Component=XXYY - A = xc_pow2 * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Gradient AM=4 Component=XXYY - phi_x_tmp[96 + i] = SX * A; - phi_y_tmp[96 + i] = SY * A; - phi_z_tmp[96 + i] = SZ * A; - AX = 2.0 * xc[i] * yc_pow2; - phi_x_tmp[96 + i] += S0[i] * AX; - AY = 2.0 * xc_pow2 * yc[i]; - phi_y_tmp[96 + i] += S0[i] * AY; - - // Density AM=4 Component=XXYZ - A = xc_pow2 * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Gradient AM=4 Component=XXYZ - phi_x_tmp[128 + i] = SX * A; - phi_y_tmp[128 + i] = SY * A; - phi_z_tmp[128 + i] = SZ * A; - AX = 2.0 * xc[i] * yc[i] * zc[i]; - 
phi_x_tmp[128 + i] += S0[i] * AX; - AY = xc_pow2 * zc[i]; - phi_y_tmp[128 + i] += S0[i] * AY; - AZ = xc_pow2 * yc[i]; - phi_z_tmp[128 + i] += S0[i] * AZ; - - // Density AM=4 Component=XXZZ - A = xc_pow2 * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Gradient AM=4 Component=XXZZ - phi_x_tmp[160 + i] = SX * A; - phi_y_tmp[160 + i] = SY * A; - phi_z_tmp[160 + i] = SZ * A; - AX = 2.0 * xc[i] * zc_pow2; - phi_x_tmp[160 + i] += S0[i] * AX; - AZ = 2.0 * xc_pow2 * zc[i]; - phi_z_tmp[160 + i] += S0[i] * AZ; - - // Density AM=4 Component=XYYY - A = xc[i] * yc_pow3; - phi_tmp[192 + i] = S0[i] * A; - - // Gradient AM=4 Component=XYYY - phi_x_tmp[192 + i] = SX * A; - phi_y_tmp[192 + i] = SY * A; - phi_z_tmp[192 + i] = SZ * A; - phi_x_tmp[192 + i] += S0[i] * yc_pow3; - AY = 3.0 * xc[i] * yc_pow2; - phi_y_tmp[192 + i] += S0[i] * AY; - - // Density AM=4 Component=XYYZ - A = xc[i] * yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Gradient AM=4 Component=XYYZ - phi_x_tmp[224 + i] = SX * A; - phi_y_tmp[224 + i] = SY * A; - phi_z_tmp[224 + i] = SZ * A; - AX = yc_pow2 * zc[i]; - phi_x_tmp[224 + i] += S0[i] * AX; - AY = 2.0 * xc[i] * yc[i] * zc[i]; - phi_y_tmp[224 + i] += S0[i] * AY; - AZ = xc[i] * yc_pow2; - phi_z_tmp[224 + i] += S0[i] * AZ; - - // Density AM=4 Component=XYZZ - A = xc[i] * yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Gradient AM=4 Component=XYZZ - phi_x_tmp[256 + i] = SX * A; - phi_y_tmp[256 + i] = SY * A; - phi_z_tmp[256 + i] = SZ * A; - AX = yc[i] * zc_pow2; - phi_x_tmp[256 + i] += S0[i] * AX; - AY = xc[i] * zc_pow2; - phi_y_tmp[256 + i] += S0[i] * AY; - AZ = 2.0 * xc[i] * yc[i] * zc[i]; - phi_z_tmp[256 + i] += S0[i] * AZ; - - // Density AM=4 Component=XZZZ - A = xc[i] * zc_pow3; - phi_tmp[288 + i] = S0[i] * A; - - // Gradient AM=4 Component=XZZZ - phi_x_tmp[288 + i] = SX * A; - phi_y_tmp[288 + i] = SY * A; - phi_z_tmp[288 + i] = SZ * A; - phi_x_tmp[288 + i] += S0[i] * zc_pow3; - AZ = 3.0 * xc[i] * zc_pow2; - phi_z_tmp[288 + i] += S0[i] * AZ; - - // Density AM=4 Component=YYYY - phi_tmp[320 + i] = S0[i] * yc_pow4; - - // Gradient AM=4 Component=YYYY - phi_x_tmp[320 + i] = SX * yc_pow4; - phi_y_tmp[320 + i] = SY * yc_pow4; - phi_z_tmp[320 + i] = SZ * yc_pow4; - AY = 4.0 * yc_pow3; - phi_y_tmp[320 + i] += S0[i] * AY; - - // Density AM=4 Component=YYYZ - A = yc_pow3 * zc[i]; - phi_tmp[352 + i] = S0[i] * A; - - // Gradient AM=4 Component=YYYZ - phi_x_tmp[352 + i] = SX * A; - phi_y_tmp[352 + i] = SY * A; - phi_z_tmp[352 + i] = SZ * A; - AY = 3.0 * yc_pow2 * zc[i]; - phi_y_tmp[352 + i] += S0[i] * AY; - phi_z_tmp[352 + i] += S0[i] * yc_pow3; - - // Density AM=4 Component=YYZZ - A = yc_pow2 * zc_pow2; - phi_tmp[384 + i] = S0[i] * A; - - // Gradient AM=4 Component=YYZZ - phi_x_tmp[384 + i] = SX * A; - phi_y_tmp[384 + i] = SY * A; - phi_z_tmp[384 + i] = SZ * A; - AY = 2.0 * yc[i] * zc_pow2; - phi_y_tmp[384 + i] += S0[i] * AY; - AZ = 2.0 * yc_pow2 * zc[i]; - phi_z_tmp[384 + i] += S0[i] * AZ; - - // Density AM=4 Component=YZZZ - A = yc[i] * zc_pow3; - phi_tmp[416 + i] = S0[i] * A; - - // Gradient AM=4 Component=YZZZ - phi_x_tmp[416 + i] = SX * A; - phi_y_tmp[416 + i] = SY * A; - phi_z_tmp[416 + i] = SZ * A; - phi_y_tmp[416 + i] += S0[i] * zc_pow3; - AZ = 3.0 * yc[i] * zc_pow2; - phi_z_tmp[416 + i] += S0[i] * AZ; - - // Density AM=4 Component=ZZZZ - phi_tmp[448 + i] = S0[i] * zc_pow4; - - // Gradient AM=4 Component=ZZZZ - phi_x_tmp[448 + i] = SX * zc_pow4; - phi_y_tmp[448 + i] = SY * zc_pow4; - phi_z_tmp[448 + i] = SZ * zc_pow4; - AZ = 4.0 * zc_pow3; - phi_z_tmp[448 + i] += S0[i] * 
AZ; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_to_spherical_L4(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_to_spherical_L4(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_to_spherical_L4(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_gaussian_cart_to_spherical_L4(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_gaussian_cart_to_spherical_L4(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_gaussian_cart_to_spherical_L4(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_copy_L4(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_copy_L4(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_copy_L4(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_molden_cart_copy_L4(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_molden_cart_copy_L4(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_molden_cart_copy_L4(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - -} - -void gg_collocation_L5_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 21; - const unsigned long nspherical = 11; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 224 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 672 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 672 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 672 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 672 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - const double xc_pow4 = xc_pow3 * xc[i]; - const double yc_pow4 = yc_pow3 * yc[i]; - const double zc_pow4 = zc_pow3 * zc[i]; - - const double xc_pow5 = xc_pow4 * xc[i]; - const double yc_pow5 = yc_pow4 * yc[i]; - const double zc_pow5 = zc_pow4 * zc[i]; - - - // Density AM=5 Component=XXXXX - phi_tmp[i] = S0[i] * xc_pow5; - - // Gradient AM=5 Component=XXXXX - phi_x_tmp[i] = SX * xc_pow5; - phi_y_tmp[i] = SY * xc_pow5; - phi_z_tmp[i] = SZ * xc_pow5; - AX = 5.0 * xc_pow4; - phi_x_tmp[i] += S0[i] * AX; - - // Density AM=5 Component=XXXXY - A = xc_pow4 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Gradient AM=5 Component=XXXXY - phi_x_tmp[32 + i] = SX * A; - phi_y_tmp[32 + i] = SY * A; - phi_z_tmp[32 + i] = SZ * A; - AX = 4.0 * xc_pow3 * yc[i]; - phi_x_tmp[32 + i] += S0[i] * AX; - phi_y_tmp[32 + i] += S0[i] * xc_pow4; - - // Density AM=5 Component=XXXXZ - A = xc_pow4 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Gradient AM=5 Component=XXXXZ - phi_x_tmp[64 + i] = SX * A; - phi_y_tmp[64 + i] = SY * A; - phi_z_tmp[64 + i] = SZ * A; - AX = 4.0 * xc_pow3 * zc[i]; - phi_x_tmp[64 + i] += S0[i] * AX; - phi_z_tmp[64 + i] += S0[i] * xc_pow4; - - // Density AM=5 Component=XXXYY - A = xc_pow3 * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Gradient AM=5 Component=XXXYY - phi_x_tmp[96 + i] = SX * A; - phi_y_tmp[96 + i] = SY * A; - phi_z_tmp[96 + i] = SZ * A; - AX = 3.0 * xc_pow2 * yc_pow2; - phi_x_tmp[96 + i] += S0[i] * AX; - AY = 2.0 * xc_pow3 * yc[i]; - phi_y_tmp[96 + i] += S0[i] * AY; - - // Density AM=5 Component=XXXYZ - A = xc_pow3 * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Gradient AM=5 
Component=XXXYZ - phi_x_tmp[128 + i] = SX * A; - phi_y_tmp[128 + i] = SY * A; - phi_z_tmp[128 + i] = SZ * A; - AX = 3.0 * xc_pow2 * yc[i] * zc[i]; - phi_x_tmp[128 + i] += S0[i] * AX; - AY = xc_pow3 * zc[i]; - phi_y_tmp[128 + i] += S0[i] * AY; - AZ = xc_pow3 * yc[i]; - phi_z_tmp[128 + i] += S0[i] * AZ; - - // Density AM=5 Component=XXXZZ - A = xc_pow3 * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Gradient AM=5 Component=XXXZZ - phi_x_tmp[160 + i] = SX * A; - phi_y_tmp[160 + i] = SY * A; - phi_z_tmp[160 + i] = SZ * A; - AX = 3.0 * xc_pow2 * zc_pow2; - phi_x_tmp[160 + i] += S0[i] * AX; - AZ = 2.0 * xc_pow3 * zc[i]; - phi_z_tmp[160 + i] += S0[i] * AZ; - - // Density AM=5 Component=XXYYY - A = xc_pow2 * yc_pow3; - phi_tmp[192 + i] = S0[i] * A; - - // Gradient AM=5 Component=XXYYY - phi_x_tmp[192 + i] = SX * A; - phi_y_tmp[192 + i] = SY * A; - phi_z_tmp[192 + i] = SZ * A; - AX = 2.0 * xc[i] * yc_pow3; - phi_x_tmp[192 + i] += S0[i] * AX; - AY = 3.0 * xc_pow2 * yc_pow2; - phi_y_tmp[192 + i] += S0[i] * AY; - - // Density AM=5 Component=XXYYZ - A = xc_pow2 * yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Gradient AM=5 Component=XXYYZ - phi_x_tmp[224 + i] = SX * A; - phi_y_tmp[224 + i] = SY * A; - phi_z_tmp[224 + i] = SZ * A; - AX = 2.0 * xc[i] * yc_pow2 * zc[i]; - phi_x_tmp[224 + i] += S0[i] * AX; - AY = 2.0 * xc_pow2 * yc[i] * zc[i]; - phi_y_tmp[224 + i] += S0[i] * AY; - AZ = xc_pow2 * yc_pow2; - phi_z_tmp[224 + i] += S0[i] * AZ; - - // Density AM=5 Component=XXYZZ - A = xc_pow2 * yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Gradient AM=5 Component=XXYZZ - phi_x_tmp[256 + i] = SX * A; - phi_y_tmp[256 + i] = SY * A; - phi_z_tmp[256 + i] = SZ * A; - AX = 2.0 * xc[i] * yc[i] * zc_pow2; - phi_x_tmp[256 + i] += S0[i] * AX; - AY = xc_pow2 * zc_pow2; - phi_y_tmp[256 + i] += S0[i] * AY; - AZ = 2.0 * xc_pow2 * yc[i] * zc[i]; - phi_z_tmp[256 + i] += S0[i] * AZ; - - // Density AM=5 Component=XXZZZ - A = xc_pow2 * zc_pow3; - phi_tmp[288 + i] = S0[i] * A; - - // Gradient AM=5 Component=XXZZZ - phi_x_tmp[288 + i] = SX * A; - phi_y_tmp[288 + i] = SY * A; - phi_z_tmp[288 + i] = SZ * A; - AX = 2.0 * xc[i] * zc_pow3; - phi_x_tmp[288 + i] += S0[i] * AX; - AZ = 3.0 * xc_pow2 * zc_pow2; - phi_z_tmp[288 + i] += S0[i] * AZ; - - // Density AM=5 Component=XYYYY - A = xc[i] * yc_pow4; - phi_tmp[320 + i] = S0[i] * A; - - // Gradient AM=5 Component=XYYYY - phi_x_tmp[320 + i] = SX * A; - phi_y_tmp[320 + i] = SY * A; - phi_z_tmp[320 + i] = SZ * A; - phi_x_tmp[320 + i] += S0[i] * yc_pow4; - AY = 4.0 * xc[i] * yc_pow3; - phi_y_tmp[320 + i] += S0[i] * AY; - - // Density AM=5 Component=XYYYZ - A = xc[i] * yc_pow3 * zc[i]; - phi_tmp[352 + i] = S0[i] * A; - - // Gradient AM=5 Component=XYYYZ - phi_x_tmp[352 + i] = SX * A; - phi_y_tmp[352 + i] = SY * A; - phi_z_tmp[352 + i] = SZ * A; - AX = yc_pow3 * zc[i]; - phi_x_tmp[352 + i] += S0[i] * AX; - AY = 3.0 * xc[i] * yc_pow2 * zc[i]; - phi_y_tmp[352 + i] += S0[i] * AY; - AZ = xc[i] * yc_pow3; - phi_z_tmp[352 + i] += S0[i] * AZ; - - // Density AM=5 Component=XYYZZ - A = xc[i] * yc_pow2 * zc_pow2; - phi_tmp[384 + i] = S0[i] * A; - - // Gradient AM=5 Component=XYYZZ - phi_x_tmp[384 + i] = SX * A; - phi_y_tmp[384 + i] = SY * A; - phi_z_tmp[384 + i] = SZ * A; - AX = yc_pow2 * zc_pow2; - phi_x_tmp[384 + i] += S0[i] * AX; - AY = 2.0 * xc[i] * yc[i] * zc_pow2; - phi_y_tmp[384 + i] += S0[i] * AY; - AZ = 2.0 * xc[i] * yc_pow2 * zc[i]; - phi_z_tmp[384 + i] += S0[i] * AZ; - - // Density AM=5 Component=XYZZZ - A = xc[i] * yc[i] * zc_pow3; - phi_tmp[416 + i] = S0[i] * A; - - // 
Gradient AM=5 Component=XYZZZ - phi_x_tmp[416 + i] = SX * A; - phi_y_tmp[416 + i] = SY * A; - phi_z_tmp[416 + i] = SZ * A; - AX = yc[i] * zc_pow3; - phi_x_tmp[416 + i] += S0[i] * AX; - AY = xc[i] * zc_pow3; - phi_y_tmp[416 + i] += S0[i] * AY; - AZ = 3.0 * xc[i] * yc[i] * zc_pow2; - phi_z_tmp[416 + i] += S0[i] * AZ; - - // Density AM=5 Component=XZZZZ - A = xc[i] * zc_pow4; - phi_tmp[448 + i] = S0[i] * A; - - // Gradient AM=5 Component=XZZZZ - phi_x_tmp[448 + i] = SX * A; - phi_y_tmp[448 + i] = SY * A; - phi_z_tmp[448 + i] = SZ * A; - phi_x_tmp[448 + i] += S0[i] * zc_pow4; - AZ = 4.0 * xc[i] * zc_pow3; - phi_z_tmp[448 + i] += S0[i] * AZ; - - // Density AM=5 Component=YYYYY - phi_tmp[480 + i] = S0[i] * yc_pow5; - - // Gradient AM=5 Component=YYYYY - phi_x_tmp[480 + i] = SX * yc_pow5; - phi_y_tmp[480 + i] = SY * yc_pow5; - phi_z_tmp[480 + i] = SZ * yc_pow5; - AY = 5.0 * yc_pow4; - phi_y_tmp[480 + i] += S0[i] * AY; - - // Density AM=5 Component=YYYYZ - A = yc_pow4 * zc[i]; - phi_tmp[512 + i] = S0[i] * A; - - // Gradient AM=5 Component=YYYYZ - phi_x_tmp[512 + i] = SX * A; - phi_y_tmp[512 + i] = SY * A; - phi_z_tmp[512 + i] = SZ * A; - AY = 4.0 * yc_pow3 * zc[i]; - phi_y_tmp[512 + i] += S0[i] * AY; - phi_z_tmp[512 + i] += S0[i] * yc_pow4; - - // Density AM=5 Component=YYYZZ - A = yc_pow3 * zc_pow2; - phi_tmp[544 + i] = S0[i] * A; - - // Gradient AM=5 Component=YYYZZ - phi_x_tmp[544 + i] = SX * A; - phi_y_tmp[544 + i] = SY * A; - phi_z_tmp[544 + i] = SZ * A; - AY = 3.0 * yc_pow2 * zc_pow2; - phi_y_tmp[544 + i] += S0[i] * AY; - AZ = 2.0 * yc_pow3 * zc[i]; - phi_z_tmp[544 + i] += S0[i] * AZ; - - // Density AM=5 Component=YYZZZ - A = yc_pow2 * zc_pow3; - phi_tmp[576 + i] = S0[i] * A; - - // Gradient AM=5 Component=YYZZZ - phi_x_tmp[576 + i] = SX * A; - phi_y_tmp[576 + i] = SY * A; - phi_z_tmp[576 + i] = SZ * A; - AY = 2.0 * yc[i] * zc_pow3; - phi_y_tmp[576 + i] += S0[i] * AY; - AZ = 3.0 * yc_pow2 * zc_pow2; - phi_z_tmp[576 + i] += S0[i] * AZ; - - // Density AM=5 Component=YZZZZ - A = yc[i] * zc_pow4; - phi_tmp[608 + i] = S0[i] * A; - - // Gradient AM=5 Component=YZZZZ - phi_x_tmp[608 + i] = SX * A; - phi_y_tmp[608 + i] = SY * A; - phi_z_tmp[608 + i] = SZ * A; - phi_y_tmp[608 + i] += S0[i] * zc_pow4; - AZ = 4.0 * yc[i] * zc_pow3; - phi_z_tmp[608 + i] += S0[i] * AZ; - - // Density AM=5 Component=ZZZZZ - phi_tmp[640 + i] = S0[i] * zc_pow5; - - // Gradient AM=5 Component=ZZZZZ - phi_x_tmp[640 + i] = SX * zc_pow5; - phi_y_tmp[640 + i] = SY * zc_pow5; - phi_z_tmp[640 + i] = SZ * zc_pow5; - AZ = 5.0 * zc_pow4; - phi_z_tmp[640 + i] += S0[i] * AZ; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_to_spherical_L5(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_to_spherical_L5(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_to_spherical_L5(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_gaussian_cart_to_spherical_L5(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_gaussian_cart_to_spherical_L5(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_gaussian_cart_to_spherical_L5(remain, phi_z_tmp, 32, (phi_z_out + start), 
npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_copy_L5(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_copy_L5(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_copy_L5(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_molden_cart_copy_L5(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_molden_cart_copy_L5(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_molden_cart_copy_L5(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - -} - -void gg_collocation_L6_deriv1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 28; - const unsigned long nspherical = 13; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 224 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 896 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 896 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 896 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 896 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; 
- expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? (npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - const double xc_pow4 = xc_pow3 * xc[i]; - const double yc_pow4 = yc_pow3 * yc[i]; - const double zc_pow4 = zc_pow3 * zc[i]; - - const double xc_pow5 = xc_pow4 * xc[i]; - const double yc_pow5 = yc_pow4 * yc[i]; - const double zc_pow5 = zc_pow4 * zc[i]; - - const double xc_pow6 = xc_pow5 * xc[i]; - const double yc_pow6 = yc_pow5 * yc[i]; - const double zc_pow6 = zc_pow5 * zc[i]; - - - // Density AM=6 Component=XXXXXX - phi_tmp[i] = S0[i] * xc_pow6; - - // Gradient AM=6 Component=XXXXXX - phi_x_tmp[i] = SX * xc_pow6; - phi_y_tmp[i] = SY * xc_pow6; - phi_z_tmp[i] = SZ * xc_pow6; - AX = 6.0 * xc_pow5; - phi_x_tmp[i] += S0[i] * AX; - - // Density AM=6 Component=XXXXXY - A = xc_pow5 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXXXXY - phi_x_tmp[32 + i] = SX * A; - phi_y_tmp[32 + i] = SY * A; - phi_z_tmp[32 + i] = SZ * A; - AX = 5.0 * xc_pow4 * yc[i]; - phi_x_tmp[32 + i] += S0[i] * AX; - phi_y_tmp[32 + i] += S0[i] * xc_pow5; - - // Density AM=6 Component=XXXXXZ - A = xc_pow5 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXXXXZ - phi_x_tmp[64 + i] = SX * A; - phi_y_tmp[64 + i] = SY * A; - phi_z_tmp[64 + i] = SZ * A; - AX = 5.0 * xc_pow4 * zc[i]; - phi_x_tmp[64 + i] += S0[i] * AX; - phi_z_tmp[64 + i] += S0[i] * xc_pow5; - - // Density AM=6 Component=XXXXYY - A = xc_pow4 * yc_pow2; - phi_tmp[96 + i] = 
S0[i] * A; - - // Gradient AM=6 Component=XXXXYY - phi_x_tmp[96 + i] = SX * A; - phi_y_tmp[96 + i] = SY * A; - phi_z_tmp[96 + i] = SZ * A; - AX = 4.0 * xc_pow3 * yc_pow2; - phi_x_tmp[96 + i] += S0[i] * AX; - AY = 2.0 * xc_pow4 * yc[i]; - phi_y_tmp[96 + i] += S0[i] * AY; - - // Density AM=6 Component=XXXXYZ - A = xc_pow4 * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXXXYZ - phi_x_tmp[128 + i] = SX * A; - phi_y_tmp[128 + i] = SY * A; - phi_z_tmp[128 + i] = SZ * A; - AX = 4.0 * xc_pow3 * yc[i] * zc[i]; - phi_x_tmp[128 + i] += S0[i] * AX; - AY = xc_pow4 * zc[i]; - phi_y_tmp[128 + i] += S0[i] * AY; - AZ = xc_pow4 * yc[i]; - phi_z_tmp[128 + i] += S0[i] * AZ; - - // Density AM=6 Component=XXXXZZ - A = xc_pow4 * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXXXZZ - phi_x_tmp[160 + i] = SX * A; - phi_y_tmp[160 + i] = SY * A; - phi_z_tmp[160 + i] = SZ * A; - AX = 4.0 * xc_pow3 * zc_pow2; - phi_x_tmp[160 + i] += S0[i] * AX; - AZ = 2.0 * xc_pow4 * zc[i]; - phi_z_tmp[160 + i] += S0[i] * AZ; - - // Density AM=6 Component=XXXYYY - A = xc_pow3 * yc_pow3; - phi_tmp[192 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXXYYY - phi_x_tmp[192 + i] = SX * A; - phi_y_tmp[192 + i] = SY * A; - phi_z_tmp[192 + i] = SZ * A; - AX = 3.0 * xc_pow2 * yc_pow3; - phi_x_tmp[192 + i] += S0[i] * AX; - AY = 3.0 * xc_pow3 * yc_pow2; - phi_y_tmp[192 + i] += S0[i] * AY; - - // Density AM=6 Component=XXXYYZ - A = xc_pow3 * yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXXYYZ - phi_x_tmp[224 + i] = SX * A; - phi_y_tmp[224 + i] = SY * A; - phi_z_tmp[224 + i] = SZ * A; - AX = 3.0 * xc_pow2 * yc_pow2 * zc[i]; - phi_x_tmp[224 + i] += S0[i] * AX; - AY = 2.0 * xc_pow3 * yc[i] * zc[i]; - phi_y_tmp[224 + i] += S0[i] * AY; - AZ = xc_pow3 * yc_pow2; - phi_z_tmp[224 + i] += S0[i] * AZ; - - // Density AM=6 Component=XXXYZZ - A = xc_pow3 * yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXXYZZ - phi_x_tmp[256 + i] = SX * A; - phi_y_tmp[256 + i] = SY * A; - phi_z_tmp[256 + i] = SZ * A; - AX = 3.0 * xc_pow2 * yc[i] * zc_pow2; - phi_x_tmp[256 + i] += S0[i] * AX; - AY = xc_pow3 * zc_pow2; - phi_y_tmp[256 + i] += S0[i] * AY; - AZ = 2.0 * xc_pow3 * yc[i] * zc[i]; - phi_z_tmp[256 + i] += S0[i] * AZ; - - // Density AM=6 Component=XXXZZZ - A = xc_pow3 * zc_pow3; - phi_tmp[288 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXXZZZ - phi_x_tmp[288 + i] = SX * A; - phi_y_tmp[288 + i] = SY * A; - phi_z_tmp[288 + i] = SZ * A; - AX = 3.0 * xc_pow2 * zc_pow3; - phi_x_tmp[288 + i] += S0[i] * AX; - AZ = 3.0 * xc_pow3 * zc_pow2; - phi_z_tmp[288 + i] += S0[i] * AZ; - - // Density AM=6 Component=XXYYYY - A = xc_pow2 * yc_pow4; - phi_tmp[320 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXYYYY - phi_x_tmp[320 + i] = SX * A; - phi_y_tmp[320 + i] = SY * A; - phi_z_tmp[320 + i] = SZ * A; - AX = 2.0 * xc[i] * yc_pow4; - phi_x_tmp[320 + i] += S0[i] * AX; - AY = 4.0 * xc_pow2 * yc_pow3; - phi_y_tmp[320 + i] += S0[i] * AY; - - // Density AM=6 Component=XXYYYZ - A = xc_pow2 * yc_pow3 * zc[i]; - phi_tmp[352 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXYYYZ - phi_x_tmp[352 + i] = SX * A; - phi_y_tmp[352 + i] = SY * A; - phi_z_tmp[352 + i] = SZ * A; - AX = 2.0 * xc[i] * yc_pow3 * zc[i]; - phi_x_tmp[352 + i] += S0[i] * AX; - AY = 3.0 * xc_pow2 * yc_pow2 * zc[i]; - phi_y_tmp[352 + i] += S0[i] * AY; - AZ = xc_pow2 * yc_pow3; - phi_z_tmp[352 + i] += S0[i] * AZ; - - // Density AM=6 Component=XXYYZZ - A = xc_pow2 * yc_pow2 * zc_pow2; - 
phi_tmp[384 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXYYZZ - phi_x_tmp[384 + i] = SX * A; - phi_y_tmp[384 + i] = SY * A; - phi_z_tmp[384 + i] = SZ * A; - AX = 2.0 * xc[i] * yc_pow2 * zc_pow2; - phi_x_tmp[384 + i] += S0[i] * AX; - AY = 2.0 * xc_pow2 * yc[i] * zc_pow2; - phi_y_tmp[384 + i] += S0[i] * AY; - AZ = 2.0 * xc_pow2 * yc_pow2 * zc[i]; - phi_z_tmp[384 + i] += S0[i] * AZ; - - // Density AM=6 Component=XXYZZZ - A = xc_pow2 * yc[i] * zc_pow3; - phi_tmp[416 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXYZZZ - phi_x_tmp[416 + i] = SX * A; - phi_y_tmp[416 + i] = SY * A; - phi_z_tmp[416 + i] = SZ * A; - AX = 2.0 * xc[i] * yc[i] * zc_pow3; - phi_x_tmp[416 + i] += S0[i] * AX; - AY = xc_pow2 * zc_pow3; - phi_y_tmp[416 + i] += S0[i] * AY; - AZ = 3.0 * xc_pow2 * yc[i] * zc_pow2; - phi_z_tmp[416 + i] += S0[i] * AZ; - - // Density AM=6 Component=XXZZZZ - A = xc_pow2 * zc_pow4; - phi_tmp[448 + i] = S0[i] * A; - - // Gradient AM=6 Component=XXZZZZ - phi_x_tmp[448 + i] = SX * A; - phi_y_tmp[448 + i] = SY * A; - phi_z_tmp[448 + i] = SZ * A; - AX = 2.0 * xc[i] * zc_pow4; - phi_x_tmp[448 + i] += S0[i] * AX; - AZ = 4.0 * xc_pow2 * zc_pow3; - phi_z_tmp[448 + i] += S0[i] * AZ; - - // Density AM=6 Component=XYYYYY - A = xc[i] * yc_pow5; - phi_tmp[480 + i] = S0[i] * A; - - // Gradient AM=6 Component=XYYYYY - phi_x_tmp[480 + i] = SX * A; - phi_y_tmp[480 + i] = SY * A; - phi_z_tmp[480 + i] = SZ * A; - phi_x_tmp[480 + i] += S0[i] * yc_pow5; - AY = 5.0 * xc[i] * yc_pow4; - phi_y_tmp[480 + i] += S0[i] * AY; - - // Density AM=6 Component=XYYYYZ - A = xc[i] * yc_pow4 * zc[i]; - phi_tmp[512 + i] = S0[i] * A; - - // Gradient AM=6 Component=XYYYYZ - phi_x_tmp[512 + i] = SX * A; - phi_y_tmp[512 + i] = SY * A; - phi_z_tmp[512 + i] = SZ * A; - AX = yc_pow4 * zc[i]; - phi_x_tmp[512 + i] += S0[i] * AX; - AY = 4.0 * xc[i] * yc_pow3 * zc[i]; - phi_y_tmp[512 + i] += S0[i] * AY; - AZ = xc[i] * yc_pow4; - phi_z_tmp[512 + i] += S0[i] * AZ; - - // Density AM=6 Component=XYYYZZ - A = xc[i] * yc_pow3 * zc_pow2; - phi_tmp[544 + i] = S0[i] * A; - - // Gradient AM=6 Component=XYYYZZ - phi_x_tmp[544 + i] = SX * A; - phi_y_tmp[544 + i] = SY * A; - phi_z_tmp[544 + i] = SZ * A; - AX = yc_pow3 * zc_pow2; - phi_x_tmp[544 + i] += S0[i] * AX; - AY = 3.0 * xc[i] * yc_pow2 * zc_pow2; - phi_y_tmp[544 + i] += S0[i] * AY; - AZ = 2.0 * xc[i] * yc_pow3 * zc[i]; - phi_z_tmp[544 + i] += S0[i] * AZ; - - // Density AM=6 Component=XYYZZZ - A = xc[i] * yc_pow2 * zc_pow3; - phi_tmp[576 + i] = S0[i] * A; - - // Gradient AM=6 Component=XYYZZZ - phi_x_tmp[576 + i] = SX * A; - phi_y_tmp[576 + i] = SY * A; - phi_z_tmp[576 + i] = SZ * A; - AX = yc_pow2 * zc_pow3; - phi_x_tmp[576 + i] += S0[i] * AX; - AY = 2.0 * xc[i] * yc[i] * zc_pow3; - phi_y_tmp[576 + i] += S0[i] * AY; - AZ = 3.0 * xc[i] * yc_pow2 * zc_pow2; - phi_z_tmp[576 + i] += S0[i] * AZ; - - // Density AM=6 Component=XYZZZZ - A = xc[i] * yc[i] * zc_pow4; - phi_tmp[608 + i] = S0[i] * A; - - // Gradient AM=6 Component=XYZZZZ - phi_x_tmp[608 + i] = SX * A; - phi_y_tmp[608 + i] = SY * A; - phi_z_tmp[608 + i] = SZ * A; - AX = yc[i] * zc_pow4; - phi_x_tmp[608 + i] += S0[i] * AX; - AY = xc[i] * zc_pow4; - phi_y_tmp[608 + i] += S0[i] * AY; - AZ = 4.0 * xc[i] * yc[i] * zc_pow3; - phi_z_tmp[608 + i] += S0[i] * AZ; - - // Density AM=6 Component=XZZZZZ - A = xc[i] * zc_pow5; - phi_tmp[640 + i] = S0[i] * A; - - // Gradient AM=6 Component=XZZZZZ - phi_x_tmp[640 + i] = SX * A; - phi_y_tmp[640 + i] = SY * A; - phi_z_tmp[640 + i] = SZ * A; - phi_x_tmp[640 + i] += S0[i] * zc_pow5; - AZ = 5.0 * xc[i] * 
zc_pow4; - phi_z_tmp[640 + i] += S0[i] * AZ; - - // Density AM=6 Component=YYYYYY - phi_tmp[672 + i] = S0[i] * yc_pow6; - - // Gradient AM=6 Component=YYYYYY - phi_x_tmp[672 + i] = SX * yc_pow6; - phi_y_tmp[672 + i] = SY * yc_pow6; - phi_z_tmp[672 + i] = SZ * yc_pow6; - AY = 6.0 * yc_pow5; - phi_y_tmp[672 + i] += S0[i] * AY; - - // Density AM=6 Component=YYYYYZ - A = yc_pow5 * zc[i]; - phi_tmp[704 + i] = S0[i] * A; - - // Gradient AM=6 Component=YYYYYZ - phi_x_tmp[704 + i] = SX * A; - phi_y_tmp[704 + i] = SY * A; - phi_z_tmp[704 + i] = SZ * A; - AY = 5.0 * yc_pow4 * zc[i]; - phi_y_tmp[704 + i] += S0[i] * AY; - phi_z_tmp[704 + i] += S0[i] * yc_pow5; - - // Density AM=6 Component=YYYYZZ - A = yc_pow4 * zc_pow2; - phi_tmp[736 + i] = S0[i] * A; - - // Gradient AM=6 Component=YYYYZZ - phi_x_tmp[736 + i] = SX * A; - phi_y_tmp[736 + i] = SY * A; - phi_z_tmp[736 + i] = SZ * A; - AY = 4.0 * yc_pow3 * zc_pow2; - phi_y_tmp[736 + i] += S0[i] * AY; - AZ = 2.0 * yc_pow4 * zc[i]; - phi_z_tmp[736 + i] += S0[i] * AZ; - - // Density AM=6 Component=YYYZZZ - A = yc_pow3 * zc_pow3; - phi_tmp[768 + i] = S0[i] * A; - - // Gradient AM=6 Component=YYYZZZ - phi_x_tmp[768 + i] = SX * A; - phi_y_tmp[768 + i] = SY * A; - phi_z_tmp[768 + i] = SZ * A; - AY = 3.0 * yc_pow2 * zc_pow3; - phi_y_tmp[768 + i] += S0[i] * AY; - AZ = 3.0 * yc_pow3 * zc_pow2; - phi_z_tmp[768 + i] += S0[i] * AZ; - - // Density AM=6 Component=YYZZZZ - A = yc_pow2 * zc_pow4; - phi_tmp[800 + i] = S0[i] * A; - - // Gradient AM=6 Component=YYZZZZ - phi_x_tmp[800 + i] = SX * A; - phi_y_tmp[800 + i] = SY * A; - phi_z_tmp[800 + i] = SZ * A; - AY = 2.0 * yc[i] * zc_pow4; - phi_y_tmp[800 + i] += S0[i] * AY; - AZ = 4.0 * yc_pow2 * zc_pow3; - phi_z_tmp[800 + i] += S0[i] * AZ; - - // Density AM=6 Component=YZZZZZ - A = yc[i] * zc_pow5; - phi_tmp[832 + i] = S0[i] * A; - - // Gradient AM=6 Component=YZZZZZ - phi_x_tmp[832 + i] = SX * A; - phi_y_tmp[832 + i] = SY * A; - phi_z_tmp[832 + i] = SZ * A; - phi_y_tmp[832 + i] += S0[i] * zc_pow5; - AZ = 5.0 * yc[i] * zc_pow4; - phi_z_tmp[832 + i] += S0[i] * AZ; - - // Density AM=6 Component=ZZZZZZ - phi_tmp[864 + i] = S0[i] * zc_pow6; - - // Gradient AM=6 Component=ZZZZZZ - phi_x_tmp[864 + i] = SX * zc_pow6; - phi_y_tmp[864 + i] = SY * zc_pow6; - phi_z_tmp[864 + i] = SZ * zc_pow6; - AZ = 6.0 * zc_pow5; - phi_z_tmp[864 + i] += S0[i] * AZ; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_to_spherical_L6(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_to_spherical_L6(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_to_spherical_L6(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_gaussian_cart_to_spherical_L6(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_gaussian_cart_to_spherical_L6(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_gaussian_cart_to_spherical_L6(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - 
gg_cca_cart_copy_L6(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_copy_L6(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_copy_L6(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_molden_cart_copy_L6(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_molden_cart_copy_L6(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_molden_cart_copy_L6(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - -} diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_deriv2.c b/third_party/gauxc/external/gau2grid/generated_source/gau2grid_deriv2.c deleted file mode 100644 index 822f62a..0000000 --- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_deriv2.c +++ /dev/null @@ -1,4549 +0,0 @@ -/* - * This is a Gau2Grid automatically generated C file. - * - * More details can found at the following repo: - * https://github.com/dgasmith/gau2grid - */ - -#include -#if defined(__clang__) && defined(_MSC_VER) -#include -#elif defined __clang__ -#include -#elif defined _MSC_VER -#include -#else -#include -#endif - -#include "gau2grid/gau2grid.h" -#include "gau2grid/gau2grid_utility.h" -#include "gau2grid/gau2grid_pragma.h" - - - -void gg_collocation_L0_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 1; - const unsigned long nspherical = 1; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 256 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - double* PRAGMA_RESTRICT phi_xx_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xx_tmp, 64); - double* PRAGMA_RESTRICT phi_xy_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xy_tmp, 64); - double* PRAGMA_RESTRICT phi_xz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xz_tmp, 64); - double* PRAGMA_RESTRICT phi_yy_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_yy_tmp, 64); - double* PRAGMA_RESTRICT phi_yz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_yz_tmp, 64); - double* PRAGMA_RESTRICT phi_zz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_zz_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Gaussians derivs (Hessians) - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - phi_out[start + i] = S0[i]; - - // Gradient AM=0 Component=0 - phi_x_out[start + i] = SX; - phi_y_out[start + i] = SY; - phi_z_out[start + i] = SZ; - - // Hessian AM=0 Component=0 - phi_xx_out[start + i] = SXX; - phi_yy_out[start + i] = SYY; - phi_zz_out[start + i] = SZZ; - phi_xy_out[start + i] = SXY; - phi_xz_out[start + i] = SXZ; - phi_yz_out[start + i] = SYZ; - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - ALIGNED_FREE(phi_xx_tmp); - ALIGNED_FREE(phi_xy_tmp); - ALIGNED_FREE(phi_xz_tmp); - ALIGNED_FREE(phi_yy_tmp); - ALIGNED_FREE(phi_yz_tmp); - ALIGNED_FREE(phi_zz_tmp); - -} - -void gg_collocation_L1_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) 
? 1 : 0; - const unsigned long ncart = 3; - const unsigned long nspherical = 3; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 256 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - double* PRAGMA_RESTRICT phi_xx_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xx_tmp, 64); - double* PRAGMA_RESTRICT phi_xy_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xy_tmp, 64); - double* PRAGMA_RESTRICT phi_xz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xz_tmp, 64); - double* PRAGMA_RESTRICT phi_yy_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_yy_tmp, 64); - double* PRAGMA_RESTRICT phi_yz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_yz_tmp, 64); - double* PRAGMA_RESTRICT phi_zz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_zz_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Gaussians derivs (Hessians) - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - // Density AM=1 Component=X - phi_tmp[i] = S0[i] * xc[i]; - - // Gradient AM=1 Component=X - phi_x_tmp[i] = SX * xc[i]; - phi_y_tmp[i] = SY * xc[i]; - phi_z_tmp[i] = SZ * xc[i]; - phi_x_tmp[i] += S0[i]; - - // Hessian AM=1 Component=X - phi_xx_tmp[i] = SXX * xc[i]; - phi_xx_tmp[i] += SX; - phi_xx_tmp[i] += SX; - phi_yy_tmp[i] = SYY * xc[i]; - phi_zz_tmp[i] = SZZ * xc[i]; - phi_xy_tmp[i] = SXY * xc[i]; - phi_xy_tmp[i] += SY; - phi_xz_tmp[i] = SXZ * xc[i]; - phi_xz_tmp[i] += SZ; - phi_yz_tmp[i] = SYZ * xc[i]; - - // Density AM=1 Component=Y - phi_tmp[32 + i] = S0[i] * yc[i]; - - // Gradient AM=1 Component=Y - phi_x_tmp[32 + i] = SX * yc[i]; - phi_y_tmp[32 + i] = SY * yc[i]; - phi_z_tmp[32 + i] = SZ * yc[i]; - phi_y_tmp[32 + i] += S0[i]; - - // Hessian AM=1 Component=Y - phi_xx_tmp[32 + i] = SXX * yc[i]; - phi_yy_tmp[32 + i] = SYY * yc[i]; - phi_yy_tmp[32 + i] += SY; - phi_yy_tmp[32 + i] += SY; - phi_zz_tmp[32 + i] = SZZ * yc[i]; - phi_xy_tmp[32 + i] = SXY * yc[i]; - phi_xy_tmp[32 + i] += SX; - phi_xz_tmp[32 + i] = SXZ * yc[i]; - phi_yz_tmp[32 + i] = SYZ * yc[i]; - phi_yz_tmp[32 + i] += SZ; - - // Density AM=1 Component=Z - phi_tmp[64 + i] = S0[i] * zc[i]; - - // Gradient AM=1 Component=Z - phi_x_tmp[64 + i] = SX * zc[i]; - phi_y_tmp[64 + i] = SY * zc[i]; - phi_z_tmp[64 + i] = SZ * zc[i]; - phi_z_tmp[64 + i] += S0[i]; - - // Hessian AM=1 Component=Z - phi_xx_tmp[64 + i] = SXX * zc[i]; - phi_yy_tmp[64 + i] = SYY * zc[i]; - phi_zz_tmp[64 + i] = SZZ * zc[i]; - phi_zz_tmp[64 + i] 
+= SZ; - phi_zz_tmp[64 + i] += SZ; - phi_xy_tmp[64 + i] = SXY * zc[i]; - phi_xz_tmp[64 + i] = SXZ * zc[i]; - phi_xz_tmp[64 + i] += SX; - phi_yz_tmp[64 + i] = SYZ * zc[i]; - phi_yz_tmp[64 + i] += SY; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_to_spherical_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_cca_cart_to_spherical_L1(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_gaussian_cart_to_spherical_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_gaussian_cart_to_spherical_L1(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_copy_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_cca_cart_copy_L1(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_molden_cart_copy_L1(remain, phi_x_tmp, 32, (phi_x_out + 
start), npoints); - gg_molden_cart_copy_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_molden_cart_copy_L1(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - ALIGNED_FREE(phi_xx_tmp); - ALIGNED_FREE(phi_xy_tmp); - ALIGNED_FREE(phi_xz_tmp); - ALIGNED_FREE(phi_yy_tmp); - ALIGNED_FREE(phi_yz_tmp); - ALIGNED_FREE(phi_zz_tmp); - -} - -void gg_collocation_L2_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 6; - const unsigned long nspherical = 5; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 256 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - double* PRAGMA_RESTRICT phi_xx_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_xx_tmp, 64); - double* PRAGMA_RESTRICT phi_xy_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_xy_tmp, 64); - double* PRAGMA_RESTRICT phi_xz_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_xz_tmp, 64); - double* PRAGMA_RESTRICT phi_yy_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_yy_tmp, 64); - double* PRAGMA_RESTRICT phi_yz_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_yz_tmp, 64); - double* PRAGMA_RESTRICT phi_zz_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_zz_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Gaussians derivs (Hessians) - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - - // Density AM=2 Component=XX - phi_tmp[i] = S0[i] * xc_pow2; - - // Gradient AM=2 Component=XX - phi_x_tmp[i] = SX * xc_pow2; - phi_y_tmp[i] = SY * xc_pow2; - phi_z_tmp[i] = SZ * xc_pow2; - AX = 2.0 * xc[i]; - phi_x_tmp[i] += S0[i] * AX; - - // Hessian AM=2 Component=XX - phi_xx_tmp[i] = SXX * xc_pow2; - phi_xx_tmp[i] += SX * AX; - phi_xx_tmp[i] += SX * AX; - phi_xx_tmp[i] += 2.0 * S0[i]; - phi_yy_tmp[i] = SYY * xc_pow2; - phi_zz_tmp[i] = SZZ * xc_pow2; - phi_xy_tmp[i] = SXY * xc_pow2; - phi_xy_tmp[i] += SY * AX; - phi_xz_tmp[i] = SXZ * xc_pow2; - phi_xz_tmp[i] += SZ * AX; - phi_yz_tmp[i] = SYZ * xc_pow2; - - // Density AM=2 Component=XY - A = xc[i] * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Gradient AM=2 Component=XY - phi_x_tmp[32 + i] = SX * A; - phi_y_tmp[32 + i] = SY * A; - phi_z_tmp[32 + i] = SZ * A; - phi_x_tmp[32 + i] += S0[i] * yc[i]; - phi_y_tmp[32 + i] += S0[i] * xc[i]; - - // Hessian AM=2 Component=XY - phi_xx_tmp[32 + i] = SXX * A; - phi_xx_tmp[32 + i] += SX * yc[i]; - phi_xx_tmp[32 + i] += SX * yc[i]; - phi_yy_tmp[32 + i] = SYY * A; - phi_yy_tmp[32 + i] += SY * xc[i]; - phi_yy_tmp[32 + i] += SY * xc[i]; - phi_zz_tmp[32 + i] = SZZ * A; - phi_xy_tmp[32 + i] = SXY * A; - phi_xy_tmp[32 + i] += SX * xc[i]; - phi_xy_tmp[32 + i] += SY * yc[i]; - phi_xy_tmp[32 + i] += S0[i]; - phi_xz_tmp[32 + i] = SXZ * A; - phi_xz_tmp[32 
+ i] += SZ * yc[i]; - phi_yz_tmp[32 + i] = SYZ * A; - phi_yz_tmp[32 + i] += SZ * xc[i]; - - // Density AM=2 Component=XZ - A = xc[i] * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Gradient AM=2 Component=XZ - phi_x_tmp[64 + i] = SX * A; - phi_y_tmp[64 + i] = SY * A; - phi_z_tmp[64 + i] = SZ * A; - phi_x_tmp[64 + i] += S0[i] * zc[i]; - phi_z_tmp[64 + i] += S0[i] * xc[i]; - - // Hessian AM=2 Component=XZ - phi_xx_tmp[64 + i] = SXX * A; - phi_xx_tmp[64 + i] += SX * zc[i]; - phi_xx_tmp[64 + i] += SX * zc[i]; - phi_yy_tmp[64 + i] = SYY * A; - phi_zz_tmp[64 + i] = SZZ * A; - phi_zz_tmp[64 + i] += SZ * xc[i]; - phi_zz_tmp[64 + i] += SZ * xc[i]; - phi_xy_tmp[64 + i] = SXY * A; - phi_xy_tmp[64 + i] += SY * zc[i]; - phi_xz_tmp[64 + i] = SXZ * A; - phi_xz_tmp[64 + i] += SX * xc[i]; - phi_xz_tmp[64 + i] += SZ * zc[i]; - phi_xz_tmp[64 + i] += S0[i]; - phi_yz_tmp[64 + i] = SYZ * A; - phi_yz_tmp[64 + i] += SY * xc[i]; - - // Density AM=2 Component=YY - phi_tmp[96 + i] = S0[i] * yc_pow2; - - // Gradient AM=2 Component=YY - phi_x_tmp[96 + i] = SX * yc_pow2; - phi_y_tmp[96 + i] = SY * yc_pow2; - phi_z_tmp[96 + i] = SZ * yc_pow2; - AY = 2.0 * yc[i]; - phi_y_tmp[96 + i] += S0[i] * AY; - - // Hessian AM=2 Component=YY - phi_xx_tmp[96 + i] = SXX * yc_pow2; - phi_yy_tmp[96 + i] = SYY * yc_pow2; - phi_yy_tmp[96 + i] += SY * AY; - phi_yy_tmp[96 + i] += SY * AY; - phi_yy_tmp[96 + i] += 2.0 * S0[i]; - phi_zz_tmp[96 + i] = SZZ * yc_pow2; - phi_xy_tmp[96 + i] = SXY * yc_pow2; - phi_xy_tmp[96 + i] += SX * AY; - phi_xz_tmp[96 + i] = SXZ * yc_pow2; - phi_yz_tmp[96 + i] = SYZ * yc_pow2; - phi_yz_tmp[96 + i] += SZ * AY; - - // Density AM=2 Component=YZ - A = yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Gradient AM=2 Component=YZ - phi_x_tmp[128 + i] = SX * A; - phi_y_tmp[128 + i] = SY * A; - phi_z_tmp[128 + i] = SZ * A; - phi_y_tmp[128 + i] += S0[i] * zc[i]; - phi_z_tmp[128 + i] += S0[i] * yc[i]; - - // Hessian AM=2 Component=YZ - phi_xx_tmp[128 + i] = SXX * A; - phi_yy_tmp[128 + i] = SYY * A; - phi_yy_tmp[128 + i] += SY * zc[i]; - phi_yy_tmp[128 + i] += SY * zc[i]; - phi_zz_tmp[128 + i] = SZZ * A; - phi_zz_tmp[128 + i] += SZ * yc[i]; - phi_zz_tmp[128 + i] += SZ * yc[i]; - phi_xy_tmp[128 + i] = SXY * A; - phi_xy_tmp[128 + i] += SX * zc[i]; - phi_xz_tmp[128 + i] = SXZ * A; - phi_xz_tmp[128 + i] += SX * yc[i]; - phi_yz_tmp[128 + i] = SYZ * A; - phi_yz_tmp[128 + i] += SY * yc[i]; - phi_yz_tmp[128 + i] += SZ * zc[i]; - phi_yz_tmp[128 + i] += S0[i]; - - // Density AM=2 Component=ZZ - phi_tmp[160 + i] = S0[i] * zc_pow2; - - // Gradient AM=2 Component=ZZ - phi_x_tmp[160 + i] = SX * zc_pow2; - phi_y_tmp[160 + i] = SY * zc_pow2; - phi_z_tmp[160 + i] = SZ * zc_pow2; - AZ = 2.0 * zc[i]; - phi_z_tmp[160 + i] += S0[i] * AZ; - - // Hessian AM=2 Component=ZZ - phi_xx_tmp[160 + i] = SXX * zc_pow2; - phi_yy_tmp[160 + i] = SYY * zc_pow2; - phi_zz_tmp[160 + i] = SZZ * zc_pow2; - phi_zz_tmp[160 + i] += SZ * AZ; - phi_zz_tmp[160 + i] += SZ * AZ; - phi_zz_tmp[160 + i] += 2.0 * S0[i]; - phi_xy_tmp[160 + i] = SXY * zc_pow2; - phi_xz_tmp[160 + i] = SXZ * zc_pow2; - phi_xz_tmp[160 + i] += SX * AZ; - phi_yz_tmp[160 + i] = SYZ * zc_pow2; - phi_yz_tmp[160 + i] += SY * AZ; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_to_spherical_L2(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_to_spherical_L2(remain, 
phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_to_spherical_L2(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_cca_cart_to_spherical_L2(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_cca_cart_to_spherical_L2(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_cca_cart_to_spherical_L2(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_cca_cart_to_spherical_L2(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_cca_cart_to_spherical_L2(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_cca_cart_to_spherical_L2(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_gaussian_cart_to_spherical_L2(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_gaussian_cart_to_spherical_L2(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_gaussian_cart_to_spherical_L2(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_gaussian_cart_to_spherical_L2(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_gaussian_cart_to_spherical_L2(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_gaussian_cart_to_spherical_L2(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_gaussian_cart_to_spherical_L2(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_gaussian_cart_to_spherical_L2(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_gaussian_cart_to_spherical_L2(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_copy_L2(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_copy_L2(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_copy_L2(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_cca_cart_copy_L2(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_cca_cart_copy_L2(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_cca_cart_copy_L2(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_cca_cart_copy_L2(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_cca_cart_copy_L2(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_cca_cart_copy_L2(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_molden_cart_copy_L2(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_molden_cart_copy_L2(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_molden_cart_copy_L2(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_molden_cart_copy_L2(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_molden_cart_copy_L2(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_molden_cart_copy_L2(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_molden_cart_copy_L2(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - 
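/* Annotation (inferred from the call sites, not part of the original file): judging by the
   stride arguments, each gg_*_copy_* / gg_*_cart_to_spherical_* helper reads `remain` values
   per component from the 32-wide block buffers (one component every 32 doubles) and writes
   them to the shell's output arrays with a stride of `npoints`, so each of the `nout`
   components presumably occupies a contiguous run of `npoints` values in the output. */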
gg_molden_cart_copy_L2(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_molden_cart_copy_L2(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - ALIGNED_FREE(phi_xx_tmp); - ALIGNED_FREE(phi_xy_tmp); - ALIGNED_FREE(phi_xz_tmp); - ALIGNED_FREE(phi_yy_tmp); - ALIGNED_FREE(phi_yz_tmp); - ALIGNED_FREE(phi_zz_tmp); - -} - -void gg_collocation_L3_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 10; - const unsigned long nspherical = 7; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 256 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate power temporaries - double* PRAGMA_RESTRICT xc_pow = (double*)ALIGNED_MALLOC(64, 64 * sizeof(double)); - ASSUME_ALIGNED(xc_pow, 64); - double* PRAGMA_RESTRICT yc_pow = (double*)ALIGNED_MALLOC(64, 64 * sizeof(double)); - ASSUME_ALIGNED(yc_pow, 64); - double* PRAGMA_RESTRICT zc_pow = (double*)ALIGNED_MALLOC(64, 64 * sizeof(double)); - ASSUME_ALIGNED(zc_pow, 64); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 320 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const 
unsigned long remain = ((start + 32) > npoints) ? (npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - } - - } - - // Build powers - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - xc_pow[i] = xc[i] * xc[i]; - yc_pow[i] = yc[i] * yc[i]; - zc_pow[i] = zc[i] * zc[i]; - xc_pow[32 + i] = xc_pow[i] * xc[i]; - yc_pow[32 + i] = yc_pow[i] * yc[i]; - zc_pow[32 + i] = zc_pow[i] * zc[i]; - } - // Combine A blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - phi_tmp[i] = xc_pow[32 + i] * S0[i]; - phi_tmp[32 + i] = xc_pow[i] * yc[i] * S0[i]; - phi_tmp[64 + i] = xc_pow[i] * zc[i] * S0[i]; - phi_tmp[96 + i] = xc[i] * yc_pow[i] * S0[i]; - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * S0[i]; - phi_tmp[160 + i] = xc[i] * zc_pow[i] * S0[i]; - phi_tmp[192 + i] = yc_pow[32 + i] * S0[i]; - phi_tmp[224 + i] = yc_pow[i] * zc[i] * S0[i]; - phi_tmp[256 + i] = yc[i] * zc_pow[i] * S0[i]; - phi_tmp[288 + i] = zc_pow[32 + i] * S0[i]; - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - // Combine X blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SX; - phi_tmp[i] += 3.0 * xc_pow[i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SX; - phi_tmp[32 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SX; - phi_tmp[64 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SX; - phi_tmp[160 + 
i] += zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = yc_pow[32 + i] * SX; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } - - // Combine Y blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SY; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SY; - phi_tmp[32 + i] += xc_pow[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += xc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = zc_pow[32 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } - - // Combine Z blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SZ; - phi_tmp[64 + i] += xc_pow[i] * S0[i]; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } - - // Combine XX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const 
double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXX; - phi_tmp[i] += 6.0 * xc_pow[i] * SX; - phi_tmp[i] += 6.0 * xc[i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXX; - phi_tmp[32 + i] += 4.0 * xc[i] * yc[i] * SX; - phi_tmp[32 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXX; - phi_tmp[64 + i] += 4.0 * xc[i] * zc[i] * SX; - phi_tmp[64 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 2.0 * yc_pow[i] * SX; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 2.0 * yc[i] * zc[i] * SX; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 2.0 * zc_pow[i] * SX; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXX; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } - - // Combine XY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXY; - phi_tmp[i] += 3.0 * xc_pow[i] * SY; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[i] * SX; - phi_tmp[32 + i] += 2.0 * xc[i] * yc[i] * SY; - phi_tmp[32 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXY; - phi_tmp[64 + i] += 2.0 * xc[i] * zc[i] * SY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc[i] * yc[i] * SX; - phi_tmp[96 + i] += yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc[i] * zc[i] * SX; - phi_tmp[128 + i] += yc[i] * zc[i] * SY; - phi_tmp[128 + i] += zc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += zc_pow[i] * SY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * SX; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * SX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += zc_pow[i] * SX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } - - // Combine XZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXZ; - phi_tmp[i] += 3.0 * xc_pow[i] * SZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXZ; - phi_tmp[32 + i] += 
2.0 * xc[i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[i] * SX; - phi_tmp[64 + i] += 2.0 * xc[i] * zc[i] * SZ; - phi_tmp[64 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc[i] * yc[i] * SX; - phi_tmp[128 + i] += yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += yc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc[i] * SX; - phi_tmp[160 + i] += zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += yc_pow[i] * SX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * SX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } - - // Combine YY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - - phi_tmp[i] = xc_pow[32 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * xc_pow[i] * SY; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 4.0 * xc[i] * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc[i] * zc[i] * SY; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * yc_pow[i] * SY; - phi_tmp[192 + i] += 6.0 * yc[i] * S0[i]; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 4.0 * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * zc_pow[i] * SY; - - phi_tmp[288 + i] = zc_pow[32 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } - - // Combine YZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[i] * SZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[i] * SY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc[i] * yc[i] * SZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * 
zc[i] * SYZ; - phi_tmp[128 + i] += xc[i] * yc[i] * SY; - phi_tmp[128 + i] += xc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc[i] * SY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * SZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[288 + i] = zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } - - // Combine ZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - phi_tmp[i] = xc_pow[32 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[i] * SZ; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 4.0 * xc[i] * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[192 + i] = yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * SZ; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 4.0 * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[288 + i] = zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * zc_pow[i] * SZ; - phi_tmp[288 + i] += 6.0 * zc[i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free Power temporaries - ALIGNED_FREE(xc_pow); - ALIGNED_FREE(yc_pow); - ALIGNED_FREE(zc_pow); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L4_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* 
PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 15; - const unsigned long nspherical = 9; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 256 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate power temporaries - double* PRAGMA_RESTRICT xc_pow = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(xc_pow, 64); - double* PRAGMA_RESTRICT yc_pow = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(yc_pow, 64); - double* PRAGMA_RESTRICT zc_pow = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(zc_pow, 64); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 480 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - } - - } - - // Build powers - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - xc_pow[i] = xc[i] * xc[i]; - yc_pow[i] = yc[i] * yc[i]; - zc_pow[i] = zc[i] * zc[i]; - xc_pow[32 + i] = xc_pow[i] * xc[i]; - yc_pow[32 + i] = yc_pow[i] * yc[i]; - zc_pow[32 + i] = zc_pow[i] * zc[i]; - xc_pow[64 + i] = xc_pow[32 + i] * xc[i]; - yc_pow[64 + i] = yc_pow[32 + i] * yc[i]; - zc_pow[64 + i] = zc_pow[32 + i] * zc[i]; - } - // Combine A blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - phi_tmp[i] = xc_pow[64 + i] * S0[i]; - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * S0[i]; - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * S0[i]; - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * S0[i]; - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * S0[i]; - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * S0[i]; - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * S0[i]; - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * S0[i]; - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * S0[i]; - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[320 + i] = yc_pow[64 + i] * S0[i]; - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * S0[i]; - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * S0[i]; - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[448 + i] = zc_pow[64 + i] * S0[i]; - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - // Combine X blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SX; - phi_tmp[i] += 4.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SX; 
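/* Annotation (not part of the original file): L = 4, Cartesian component x^3*y, x-derivative
   by the product rule. The assignment above is the radial term x^3*y*(dS/dx), with
   dS/dx = S1*x = SX; the += below adds the polynomial term 3*x^2*y*S0. */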
- phi_tmp[32 + i] += 3.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SX; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 2.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 2.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SX; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } - - // Combine Y blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SY; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SY; - phi_tmp[32 + i] += xc_pow[32 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } - - // Combine Z blocks - 
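/* Annotation (not part of the original file): this block follows the same product-rule
   pattern as the X and Y blocks above. For a Cartesian component x^l*y^m*z^n the
   z-derivative is x^l*y^m*z^n*(dS/dz) + n*x^l*y^m*z^(n-1)*S0, with dS/dz = S1*z = SZ.
   The radial sums accumulated in the exponential loop are
     S0 = sum_p c_p*exp(-a_p*r^2),
     S1 = -2*sum_p a_p*c_p*exp(-a_p*r^2),
     S2 =  4*sum_p a_p^2*c_p*exp(-a_p*r^2),
   so that d2S/dxdy = S2*x*y and d2S/dx2 = S2*x^2 + S1, as used in the second-derivative
   blocks that follow. */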
PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[64 + i] += xc_pow[32 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } - - // Combine XX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXX; - phi_tmp[i] += 8.0 * xc_pow[32 + i] * SX; - phi_tmp[i] += 12.0 * xc_pow[i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXX; - phi_tmp[32 + i] += 6.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[32 + i] += 6.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXX; - phi_tmp[64 + i] += 6.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[64 + i] += 6.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 4.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 2.0 * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 4.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 4.0 * xc[i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 2.0 * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXX; - phi_tmp[192 + i] += 2.0 * yc_pow[32 + i] * SX; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 2.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXX; - phi_tmp[288 + i] += 2.0 * zc_pow[32 + i] * SX; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXX; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 
+ i] * SXX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } - - // Combine XY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXY; - phi_tmp[i] += 4.0 * xc_pow[32 + i] * SY; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[32 + i] * SX; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXY; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[96 + i] += 2.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 4.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc_pow[i] * zc[i] * SX; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += 2.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[192 + i] += yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[224 + i] += yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += xc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXY; - phi_tmp[288 + i] += zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXY; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * SX; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += zc_pow[32 + i] * SX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } - - // Combine XZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXZ; - phi_tmp[i] 
+= 4.0 * xc_pow[32 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXZ; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[32 + i] * SX; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += 2.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[i] * yc[i] * SX; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[160 + i] += 2.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 4.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXZ; - phi_tmp[192 + i] += yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc[i] * yc_pow[i] * SX; - phi_tmp[224 + i] += yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[256 + i] += yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc[i] * zc_pow[i] * SX; - phi_tmp[288 + i] += zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[352 + i] += yc_pow[32 + i] * SX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } - - // Combine YY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - - phi_tmp[i] = xc_pow[64 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * xc_pow[32 + i] * SY; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 4.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[192 + i] += 6.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 4.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 
2.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SYY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SYY; - phi_tmp[320 + i] += 8.0 * yc_pow[32 + i] * SY; - phi_tmp[320 + i] += 12.0 * yc_pow[i] * S0[i]; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[352 + i] += 6.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 4.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[416 + i] += 2.0 * zc_pow[32 + i] * SY; - - phi_tmp[448 + i] = zc_pow[64 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } - - // Combine YZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[32 + i] * SZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[32 + i] * SY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[i] * yc[i] * SY; - phi_tmp[128 + i] += xc_pow[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += xc[i] * yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += xc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SYZ; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * SZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[352 + i] += yc_pow[32 + i] * SY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 4.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[416 + i] += zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SYZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } 
else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } - - // Combine ZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - phi_tmp[i] = xc_pow[64 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[32 + i] * SZ; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 4.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 4.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[288 + i] += 6.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SZZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * SZ; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 4.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SZZ; - phi_tmp[448 + i] += 8.0 * zc_pow[32 + i] * SZ; - phi_tmp[448 + i] += 12.0 * zc_pow[i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free Power temporaries - ALIGNED_FREE(xc_pow); - ALIGNED_FREE(yc_pow); - ALIGNED_FREE(zc_pow); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L5_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out) { - - // Sizing - 
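Every gg_collocation_L*_deriv2 kernel in this removed file follows the same skeleton: walk the grid in 32-point blocks, form the displacements (xc, yc, zc) and r^2 per point, contract the primitives into the radial sums S0, S1, S2, build small tables of Cartesian powers, and only then assemble phi and its derivatives before dispatching to the requested Cartesian or spherical ordering. A minimal sketch of that skeleton, reduced to an s shell with value and gradient only (the name collocation_s_deriv1 and the interleaved xyz layout are illustrative, not taken from this file):

/* Simplified sketch of the collocation pattern above: s shell, value + gradient.
 * Not part of the removed file; layout and names are illustrative only. */
#include <math.h>
#include <stdio.h>

static void collocation_s_deriv1(unsigned long npoints, const double *xyz,
                                 int nprim, const double *coeffs,
                                 const double *exponents, const double *center,
                                 double *phi, double *phi_x, double *phi_y,
                                 double *phi_z) {
    for (unsigned long p = 0; p < npoints; p++) {
        const double xc = xyz[3 * p + 0] - center[0];
        const double yc = xyz[3 * p + 1] - center[1];
        const double zc = xyz[3 * p + 2] - center[2];
        const double r2 = xc * xc + yc * yc + zc * zc;

        /* Contract the primitives into the radial value S0 and the
         * shared first-derivative factor S1 (sum of -2*alpha*c*exp(-alpha*r^2)). */
        double S0 = 0.0, S1 = 0.0;
        for (int n = 0; n < nprim; n++) {
            const double t = coeffs[n] * exp(-exponents[n] * r2);
            S0 += t;
            S1 += -2.0 * exponents[n] * t;
        }

        phi[p]   = S0;       /* s shell: angular factor is 1 */
        phi_x[p] = xc * S1;  /* d/dx of exp(-alpha*r^2) brings down -2*alpha*xc */
        phi_y[p] = yc * S1;
        phi_z[p] = zc * S1;
    }
}

int main(void) {
    const double xyz[3] = {0.1, 0.2, 0.3};
    const double coeffs[1] = {1.0}, exponents[1] = {0.5}, center[3] = {0.0, 0.0, 0.0};
    double phi, px, py, pz;
    collocation_s_deriv1(1, xyz, 1, coeffs, exponents, center, &phi, &px, &py, &pz);
    printf("phi = %.6f  grad = (%.6f, %.6f, %.6f)\n", phi, px, py, pz);
    return 0;
}

On top of this skeleton the removed kernels add the 32-point blocking, support for planar and strided coordinate layouts, the S2 term needed for second derivatives, the unrolled angular factors for each L, and the Cartesian-to-spherical (or Molden/CCA copy) dispatch.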
unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 21; - const unsigned long nspherical = 11; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 256 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate power temporaries - double* PRAGMA_RESTRICT xc_pow = (double*)ALIGNED_MALLOC(64, 128 * sizeof(double)); - ASSUME_ALIGNED(xc_pow, 64); - double* PRAGMA_RESTRICT yc_pow = (double*)ALIGNED_MALLOC(64, 128 * sizeof(double)); - ASSUME_ALIGNED(yc_pow, 64); - double* PRAGMA_RESTRICT zc_pow = (double*)ALIGNED_MALLOC(64, 128 * sizeof(double)); - ASSUME_ALIGNED(zc_pow, 64); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 672 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - } - - } - - // Build powers - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - xc_pow[i] = xc[i] * xc[i]; - yc_pow[i] = yc[i] * yc[i]; - zc_pow[i] = zc[i] * zc[i]; - xc_pow[32 + i] = xc_pow[i] * xc[i]; - yc_pow[32 + i] = yc_pow[i] * yc[i]; - zc_pow[32 + i] = zc_pow[i] * zc[i]; - xc_pow[64 + i] = xc_pow[32 + i] * xc[i]; - yc_pow[64 + i] = yc_pow[32 + i] * yc[i]; - zc_pow[64 + i] = zc_pow[32 + i] * zc[i]; - xc_pow[96 + i] = xc_pow[64 + i] * xc[i]; - yc_pow[96 + i] = yc_pow[64 + i] * yc[i]; - zc_pow[96 + i] = zc_pow[64 + i] * zc[i]; - } - // Combine A blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - phi_tmp[i] = xc_pow[96 + i] * S0[i]; - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * S0[i]; - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * S0[i]; - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * S0[i]; - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * S0[i]; - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * S0[i]; - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * S0[i]; - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * S0[i]; - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * S0[i]; - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * S0[i]; - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * S0[i]; - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * S0[i]; - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * S0[i]; - phi_tmp[480 + i] = yc_pow[96 + i] * S0[i]; - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * S0[i]; - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * S0[i]; - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * S0[i]; - phi_tmp[640 + i] = zc_pow[96 + i] * S0[i]; - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_out + 
start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - // Combine X blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SX; - phi_tmp[i] += 5.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SX; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SX; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 3.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 3.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += 2.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += 2.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SX; - phi_tmp[320 + i] += yc_pow[64 + i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SX; - phi_tmp[352 + i] += yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SX; - phi_tmp[448 + i] += zc_pow[64 + i] * S0[i]; - - phi_tmp[480 + i] = yc_pow[96 + i] * SX; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } - - // Combine Y blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SY; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SY; - phi_tmp[32 + i] += xc_pow[64 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[i] * 
S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SY; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SY; - phi_tmp[608 + i] += zc_pow[64 + i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } - - // Combine Z blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SZ; - phi_tmp[64 + i] += xc_pow[64 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[480 + i] = yc_pow[96 + i] * SZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[512 + i] += yc_pow[64 + 
i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } - - // Combine XX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXX; - phi_tmp[i] += 10.0 * xc_pow[64 + i] * SX; - phi_tmp[i] += 20.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXX; - phi_tmp[32 + i] += 8.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[32 + i] += 12.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXX; - phi_tmp[64 + i] += 8.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[64 + i] += 12.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 6.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 6.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 6.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 6.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 6.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXX; - phi_tmp[192 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += 2.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[224 + i] += 4.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 4.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 2.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXX; - phi_tmp[288 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += 2.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXX; - phi_tmp[320 + i] += 2.0 * yc_pow[64 + i] * SX; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXX; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXX; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXX; - phi_tmp[416 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXX; - phi_tmp[448 + i] += 2.0 * zc_pow[64 + i] * SX; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXX; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXX; - - 
phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } - - // Combine XY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXY; - phi_tmp[i] += 5.0 * xc_pow[64 + i] * SY; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[64 + i] * SX; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXY; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[96 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 6.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc_pow[32 + i] * zc[i] * SX; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[192 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 6.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 4.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += xc_pow[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += 2.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[288 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXY; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[320 + i] += yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[352 + i] += yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += xc[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXY; - phi_tmp[448 + i] += zc_pow[64 + i] * SY; - - phi_tmp[480 + i] = yc_pow[96 
+ i] * SXY; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * SX; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXY; - phi_tmp[608 + i] += zc_pow[64 + i] * SX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } - - // Combine XZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXZ; - phi_tmp[i] += 5.0 * xc_pow[64 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXZ; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[64 + i] * SX; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[32 + i] * yc[i] * SX; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[160 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 6.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXZ; - phi_tmp[192 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc_pow[i] * yc_pow[i] * SX; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 4.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[288 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 6.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXZ; - phi_tmp[320 + i] += yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[352 + i] += xc[i] * yc_pow[32 + i] * SX; - phi_tmp[352 + i] += yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[384 + i] += yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + 
i] = xc[i] * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[416 + i] += yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[448 + i] += zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[512 + i] += yc_pow[64 + i] * SX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } - - // Combine YY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - - phi_tmp[i] = xc_pow[96 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * xc_pow[64 + i] * SY; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[192 + i] += 6.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 4.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SYY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SYY; - phi_tmp[320 + i] += 8.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[320 + i] += 12.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 4.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[416 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SYY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SYY; - phi_tmp[480 + i] += 10.0 * yc_pow[64 + i] * SY; - phi_tmp[480 + i] += 20.0 * yc_pow[32 + i] * S0[i]; 
- - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SYY; - phi_tmp[512 + i] += 8.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[512 + i] += 12.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SYY; - phi_tmp[544 + i] += 6.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 6.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SYY; - phi_tmp[576 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SYY; - phi_tmp[608 + i] += 2.0 * zc_pow[64 + i] * SY; - - phi_tmp[640 + i] = zc_pow[96 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } - - // Combine YZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[64 + i] * SZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[64 + i] * SY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[32 + i] * yc[i] * SY; - phi_tmp[128 + i] += xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[32 + i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += xc_pow[i] * yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SYZ; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[352 + i] += xc[i] * yc_pow[32 + i] * SY; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[384 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += 3.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[416 + i] += xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SYZ; - 
phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SYZ; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * SZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[512 + i] += yc_pow[64 + i] * SY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 6.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 6.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[608 + i] += zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SYZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } - - // Combine ZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - phi_tmp[i] = xc_pow[96 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[64 + i] * SZ; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 4.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[288 + i] += 6.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SZZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 4.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[448 + i] += 8.0 * xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[448 + i] += 12.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[480 + i] = 
yc_pow[96 + i] * SZZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * SZ; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[544 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[576 + i] += 6.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[576 + i] += 6.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[608 + i] += 8.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[608 + i] += 12.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SZZ; - phi_tmp[640 + i] += 10.0 * zc_pow[64 + i] * SZ; - phi_tmp[640 + i] += 20.0 * zc_pow[32 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free Power temporaries - ALIGNED_FREE(xc_pow); - ALIGNED_FREE(yc_pow); - ALIGNED_FREE(zc_pow); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L6_deriv2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
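The integer prefactors hard-coded in these unrolled derivative loops come from differentiating a single Cartesian term phi_{lmn} = x^l y^m z^n S, where S is the contracted radial part and, as in the code, S0 = sum_k c_k e^{-alpha_k r^2}, S1 = sum_k (-2 alpha_k) c_k e^{-alpha_k r^2}, S2 = sum_k 4 alpha_k^2 c_k e^{-alpha_k r^2}, with SX = x S1, SXX = x^2 S2 + S1, SXY = x y S2, and so on. In LaTeX form:

\partial_x \phi_{lmn} = x^l y^m z^n \,(x S_1) + l\, x^{l-1} y^m z^n \, S_0

\partial_x^2 \phi_{lmn} = x^l y^m z^n \,(x^2 S_2 + S_1) + 2l\, x^{l-1} y^m z^n \,(x S_1) + l(l-1)\, x^{l-2} y^m z^n \, S_0

\partial_x \partial_y \phi_{lmn} = x^l y^m z^n \,(x y S_2) + l\, x^{l-1} y^m z^n \,(y S_1) + m\, x^l y^{m-1} z^n \,(x S_1) + l m\, x^{l-1} y^{m-1} z^n \, S_0

For the pure x^5 entry of the L = 5 kernel this reproduces the 5.0 (= l), 10.0 (= 2l), and 20.0 (= l(l-1)) factors seen above; every other entry follows the same pattern with its own (l, m, n).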
1 : 0; - const unsigned long ncart = 28; - const unsigned long nspherical = 13; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 256 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate power temporaries - double* PRAGMA_RESTRICT xc_pow = (double*)ALIGNED_MALLOC(64, 160 * sizeof(double)); - ASSUME_ALIGNED(xc_pow, 64); - double* PRAGMA_RESTRICT yc_pow = (double*)ALIGNED_MALLOC(64, 160 * sizeof(double)); - ASSUME_ALIGNED(yc_pow, 64); - double* PRAGMA_RESTRICT zc_pow = (double*)ALIGNED_MALLOC(64, 160 * sizeof(double)); - ASSUME_ALIGNED(zc_pow, 64); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 896 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - } - - } - - // Build powers - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - xc_pow[i] = xc[i] * xc[i]; - yc_pow[i] = yc[i] * yc[i]; - zc_pow[i] = zc[i] * zc[i]; - xc_pow[32 + i] = xc_pow[i] * xc[i]; - yc_pow[32 + i] = yc_pow[i] * yc[i]; - zc_pow[32 + i] = zc_pow[i] * zc[i]; - xc_pow[64 + i] = xc_pow[32 + i] * xc[i]; - yc_pow[64 + i] = yc_pow[32 + i] * yc[i]; - zc_pow[64 + i] = zc_pow[32 + i] * zc[i]; - xc_pow[96 + i] = xc_pow[64 + i] * xc[i]; - yc_pow[96 + i] = yc_pow[64 + i] * yc[i]; - zc_pow[96 + i] = zc_pow[64 + i] * zc[i]; - xc_pow[128 + i] = xc_pow[96 + i] * xc[i]; - yc_pow[128 + i] = yc_pow[96 + i] * yc[i]; - zc_pow[128 + i] = zc_pow[96 + i] * zc[i]; - } - // Combine A blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - phi_tmp[i] = xc_pow[128 + i] * S0[i]; - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * S0[i]; - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * S0[i]; - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * S0[i]; - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * S0[i]; - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * S0[i]; - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * S0[i]; - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * S0[i]; - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * S0[i]; - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * S0[i]; - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * S0[i]; - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * S0[i]; - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * S0[i]; - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * S0[i]; - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * S0[i]; - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * S0[i]; - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * S0[i]; - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * S0[i]; - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * S0[i]; - phi_tmp[672 + i] 
= yc_pow[128 + i] * S0[i]; - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * S0[i]; - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * S0[i]; - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * S0[i]; - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * S0[i]; - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * S0[i]; - phi_tmp[864 + i] = zc_pow[128 + i] * S0[i]; - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - // Combine X blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SX; - phi_tmp[i] += 6.0 * xc_pow[96 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SX; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SX; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SX; - phi_tmp[320 + i] += 2.0 * xc[i] * yc_pow[64 + i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SX; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SX; - phi_tmp[448 + i] += 2.0 * xc[i] * zc_pow[64 + i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SX; - phi_tmp[480 + i] += yc_pow[96 + i] * S0[i]; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SX; - phi_tmp[512 + i] += yc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SX; - phi_tmp[544 + i] += yc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SX; - phi_tmp[576 + i] += yc_pow[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SX; - phi_tmp[608 + i] += yc[i] * zc_pow[64 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SX; - phi_tmp[640 + i] += zc_pow[96 + i] * S0[i]; - - phi_tmp[672 + i] = yc_pow[128 + i] * SX; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SX; - - 
phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } - - // Combine Y blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SY; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SY; - phi_tmp[32 + i] += xc_pow[96 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += xc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += xc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 4.0 * xc_pow[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += xc_pow[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SY; - phi_tmp[480 + i] += 5.0 * xc[i] * yc_pow[64 + i] * S0[i]; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SY; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SY; - phi_tmp[608 + i] += xc[i] * zc_pow[64 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SY; - phi_tmp[672 + i] += 6.0 * yc_pow[96 + i] * S0[i]; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SY; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SY; - phi_tmp[736 + i] += 4.0 * yc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SY; - phi_tmp[768 + i] += 3.0 * yc_pow[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SY; - phi_tmp[800 
+ i] += 2.0 * yc[i] * zc_pow[64 + i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SY; - phi_tmp[832 + i] += zc_pow[96 + i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } - - // Combine Z blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SZ; - phi_tmp[64 + i] += xc_pow[96 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[64 + i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += xc_pow[32 + i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += xc_pow[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 4.0 * xc_pow[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[512 + i] += xc[i] * yc_pow[64 + i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SZ; - phi_tmp[640 + i] += 5.0 * xc[i] * zc_pow[64 + i] * S0[i]; - - phi_tmp[672 + i] = yc_pow[128 + i] * SZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SZ; - phi_tmp[704 + i] += yc_pow[96 + i] * S0[i]; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SZ; - phi_tmp[768 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SZ; - phi_tmp[800 + i] += 4.0 * yc_pow[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SZ; - phi_tmp[832 + i] += 5.0 * yc[i] * 
zc_pow[64 + i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SZ; - phi_tmp[864 + i] += 6.0 * zc_pow[96 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } - - // Combine XX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXX; - phi_tmp[i] += 12.0 * xc_pow[96 + i] * SX; - phi_tmp[i] += 30.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXX; - phi_tmp[32 + i] += 10.0 * xc_pow[64 + i] * yc[i] * SX; - phi_tmp[32 + i] += 20.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXX; - phi_tmp[64 + i] += 10.0 * xc_pow[64 + i] * zc[i] * SX; - phi_tmp[64 + i] += 20.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 8.0 * xc_pow[32 + i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 12.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 8.0 * xc_pow[32 + i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 12.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 8.0 * xc_pow[32 + i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 12.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXX; - phi_tmp[192 + i] += 6.0 * xc_pow[i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += 6.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[224 + i] += 6.0 * xc_pow[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 6.0 * xc_pow[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXX; - phi_tmp[288 + i] += 6.0 * xc_pow[i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += 6.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXX; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[64 + i] * SX; - phi_tmp[320 + i] += 2.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXX; - phi_tmp[352 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SX; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXX; - phi_tmp[384 + i] += 4.0 * xc[i] * yc_pow[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXX; - phi_tmp[416 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += 2.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXX; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[64 + i] * SX; - phi_tmp[448 + i] += 2.0 * zc_pow[64 + i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXX; - phi_tmp[480 + i] += 2.0 * yc_pow[96 + 
i] * SX; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXX; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SX; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXX; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc_pow[i] * SX; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXX; - phi_tmp[576 + i] += 2.0 * yc_pow[i] * zc_pow[32 + i] * SX; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXX; - phi_tmp[608 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SX; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXX; - phi_tmp[640 + i] += 2.0 * zc_pow[96 + i] * SX; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXX; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } - - // Combine XY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXY; - phi_tmp[i] += 6.0 * xc_pow[96 + i] * SY; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[96 + i] * SX; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * yc[i] * SY; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXY; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SX; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 8.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc_pow[64 + i] * zc[i] * SX; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * SX; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 9.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SX; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += xc_pow[32 + i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXY; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = 
xc_pow[i] * yc_pow[64 + i] * SXY; - phi_tmp[320 + i] += 4.0 * xc_pow[i] * yc_pow[32 + i] * SX; - phi_tmp[320 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 8.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 4.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += xc_pow[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += 2.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXY; - phi_tmp[448 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXY; - phi_tmp[480 + i] += 5.0 * xc[i] * yc_pow[64 + i] * SX; - phi_tmp[480 + i] += yc_pow[96 + i] * SY; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXY; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SX; - phi_tmp[512 + i] += yc_pow[64 + i] * zc[i] * SY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[544 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SX; - phi_tmp[544 + i] += yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[576 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SX; - phi_tmp[576 + i] += yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXY; - phi_tmp[608 + i] += xc[i] * zc_pow[64 + i] * SX; - phi_tmp[608 + i] += yc[i] * zc_pow[64 + i] * SY; - phi_tmp[608 + i] += zc_pow[64 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXY; - phi_tmp[640 + i] += zc_pow[96 + i] * SY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXY; - phi_tmp[672 + i] += 6.0 * yc_pow[96 + i] * SX; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXY; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * zc[i] * SX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXY; - phi_tmp[736 + i] += 4.0 * yc_pow[32 + i] * zc_pow[i] * SX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXY; - phi_tmp[768 + i] += 3.0 * yc_pow[i] * zc_pow[32 + i] * SX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXY; - phi_tmp[800 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXY; - phi_tmp[832 + i] += zc_pow[96 + i] * SX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } - - // Combine XZ blocks - 
PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXZ; - phi_tmp[i] += 6.0 * xc_pow[96 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXZ; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[96 + i] * SX; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * zc[i] * SZ; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[64 + i] * yc[i] * SX; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SX; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 8.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXZ; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc_pow[32 + i] * yc_pow[i] * SX; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SX; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * SX; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 9.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXZ; - phi_tmp[320 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[352 + i] += xc_pow[i] * yc_pow[32 + i] * SX; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[416 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXZ; - phi_tmp[448 + i] += 4.0 * xc_pow[i] * zc_pow[32 + i] * SX; - phi_tmp[448 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 8.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXZ; - phi_tmp[480 + i] += yc_pow[96 + i] * SZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[512 + i] += xc[i] * yc_pow[64 + i] * SX; - phi_tmp[512 + i] += yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[512 + i] += yc_pow[64 + i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * 
zc[i] * SX; - phi_tmp[544 + i] += yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[576 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SX; - phi_tmp[576 + i] += yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[608 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SX; - phi_tmp[608 + i] += yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXZ; - phi_tmp[640 + i] += 5.0 * xc[i] * zc_pow[64 + i] * SX; - phi_tmp[640 + i] += zc_pow[96 + i] * SZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * S0[i]; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXZ; - phi_tmp[704 + i] += yc_pow[96 + i] * SX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXZ; - phi_tmp[768 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * SX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXZ; - phi_tmp[800 + i] += 4.0 * yc_pow[i] * zc_pow[32 + i] * SX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXZ; - phi_tmp[832 + i] += 5.0 * yc[i] * zc_pow[64 + i] * SX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXZ; - phi_tmp[864 + i] += 6.0 * zc_pow[96 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } - - // Combine YY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - - phi_tmp[i] = xc_pow[128 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * xc_pow[96 + i] * SY; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 4.0 * xc_pow[64 + i] * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SY; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * xc_pow[32 + i] * yc_pow[i] * SY; - phi_tmp[192 + i] += 6.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * zc_pow[i] * SY; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SYY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SYY; - phi_tmp[320 + i] += 8.0 * xc_pow[i] * yc_pow[32 + i] * SY; - phi_tmp[320 + i] += 12.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[352 + i] += 
6.0 * xc_pow[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 4.0 * xc_pow[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[416 + i] += 2.0 * xc_pow[i] * zc_pow[32 + i] * SY; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SYY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SYY; - phi_tmp[480 + i] += 10.0 * xc[i] * yc_pow[64 + i] * SY; - phi_tmp[480 + i] += 20.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SYY; - phi_tmp[512 + i] += 8.0 * xc[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[512 + i] += 12.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SYY; - phi_tmp[544 + i] += 6.0 * xc[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SYY; - phi_tmp[576 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SYY; - phi_tmp[608 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SY; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SYY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SYY; - phi_tmp[672 + i] += 12.0 * yc_pow[96 + i] * SY; - phi_tmp[672 + i] += 30.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SYY; - phi_tmp[704 + i] += 10.0 * yc_pow[64 + i] * zc[i] * SY; - phi_tmp[704 + i] += 20.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SYY; - phi_tmp[736 + i] += 8.0 * yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[736 + i] += 12.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SYY; - phi_tmp[768 + i] += 6.0 * yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[768 + i] += 6.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SYY; - phi_tmp[800 + i] += 4.0 * yc[i] * zc_pow[64 + i] * SY; - phi_tmp[800 + i] += 2.0 * zc_pow[64 + i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SYY; - phi_tmp[832 + i] += 2.0 * zc_pow[96 + i] * SY; - - phi_tmp[864 + i] = zc_pow[128 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } - - // Combine YZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[96 + i] * SZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[96 + i] * SY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[64 + i] * yc[i] * SY; - 
phi_tmp[128 + i] += xc_pow[64 + i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[64 + i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * SZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += xc_pow[32 + i] * yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += xc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * SY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SYZ; - phi_tmp[320 + i] += 4.0 * xc_pow[i] * yc_pow[32 + i] * SZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[352 + i] += xc_pow[i] * yc_pow[32 + i] * SY; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[416 + i] += xc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SYZ; - phi_tmp[448 + i] += 4.0 * xc_pow[i] * zc_pow[32 + i] * SY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SYZ; - phi_tmp[480 + i] += 5.0 * xc[i] * yc_pow[64 + i] * SZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[512 + i] += xc[i] * yc_pow[64 + i] * SY; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[544 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[576 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[576 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[608 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[608 + i] += xc[i] * zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SYZ; - phi_tmp[640 + i] += 5.0 * xc[i] * zc_pow[64 + i] * SY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SYZ; - phi_tmp[672 + i] += 6.0 * yc_pow[96 + i] * SZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SYZ; - phi_tmp[704 + i] += yc_pow[96 + i] * SY; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SYZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SY; - 
phi_tmp[736 + i] += 4.0 * yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[736 + i] += 8.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SYZ; - phi_tmp[768 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[768 + i] += 3.0 * yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[768 + i] += 9.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SYZ; - phi_tmp[800 + i] += 4.0 * yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[800 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[800 + i] += 8.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SYZ; - phi_tmp[832 + i] += 5.0 * yc[i] * zc_pow[64 + i] * SY; - phi_tmp[832 + i] += zc_pow[96 + i] * SZ; - phi_tmp[832 + i] += 5.0 * zc_pow[64 + i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SYZ; - phi_tmp[864 + i] += 6.0 * zc_pow[96 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } - - // Combine ZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - phi_tmp[i] = xc_pow[128 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[96 + i] * SZ; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 4.0 * xc_pow[64 + i] * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc_pow[i] * SZ; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * xc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[288 + i] += 6.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SZZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * xc_pow[i] * yc_pow[32 + i] * SZ; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 4.0 * xc_pow[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * xc_pow[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SZZ; - phi_tmp[448 + i] += 8.0 * xc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[448 + i] += 12.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SZZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[512 + i] += 2.0 * 
xc[i] * yc_pow[64 + i] * SZ; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[544 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[576 + i] += 6.0 * xc[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[576 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[608 + i] += 8.0 * xc[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[608 + i] += 12.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SZZ; - phi_tmp[640 + i] += 10.0 * xc[i] * zc_pow[64 + i] * SZ; - phi_tmp[640 + i] += 20.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[672 + i] = yc_pow[128 + i] * SZZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SZZ; - phi_tmp[704 + i] += 2.0 * yc_pow[96 + i] * SZ; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SZZ; - phi_tmp[736 + i] += 4.0 * yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SZZ; - phi_tmp[768 + i] += 6.0 * yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[768 + i] += 6.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SZZ; - phi_tmp[800 + i] += 8.0 * yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[800 + i] += 12.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SZZ; - phi_tmp[832 + i] += 10.0 * yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[832 + i] += 20.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SZZ; - phi_tmp[864 + i] += 12.0 * zc_pow[96 + i] * SZ; - phi_tmp[864 + i] += 30.0 * zc_pow[64 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free Power temporaries - ALIGNED_FREE(xc_pow); - ALIGNED_FREE(yc_pow); - ALIGNED_FREE(zc_pow); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_deriv3.c b/third_party/gauxc/external/gau2grid/generated_source/gau2grid_deriv3.c deleted file mode 100644 index c38afc7..0000000 --- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_deriv3.c +++ /dev/null @@ -1,9321 +0,0 @@ -/* - * This is a Gau2Grid automatically generated C file. 
- * - * More details can found at the following repo: - * https://github.com/dgasmith/gau2grid - */ - -#include -#if defined(__clang__) && defined(_MSC_VER) -#include -#elif defined __clang__ -#include -#elif defined _MSC_VER -#include -#else -#include -#endif - -#include "gau2grid/gau2grid.h" -#include "gau2grid/gau2grid_utility.h" -#include "gau2grid/gau2grid_pragma.h" - - - -void gg_collocation_L0_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 1; - const unsigned long nspherical = 1; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 288 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - double* PRAGMA_RESTRICT S3 = cache_data + 256; - ASSUME_ALIGNED(S3, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - double* PRAGMA_RESTRICT phi_xx_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xx_tmp, 64); - double* PRAGMA_RESTRICT phi_xy_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xy_tmp, 64); - double* PRAGMA_RESTRICT phi_xz_tmp = 
(double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xz_tmp, 64); - double* PRAGMA_RESTRICT phi_yy_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_yy_tmp, 64); - double* PRAGMA_RESTRICT phi_yz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_yz_tmp, 64); - double* PRAGMA_RESTRICT phi_zz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_zz_tmp, 64); - double* PRAGMA_RESTRICT phi_xxx_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xxx_tmp, 64); - double* PRAGMA_RESTRICT phi_xxy_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xxy_tmp, 64); - double* PRAGMA_RESTRICT phi_xxz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xxz_tmp, 64); - double* PRAGMA_RESTRICT phi_xyy_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xyy_tmp, 64); - double* PRAGMA_RESTRICT phi_xyz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xyz_tmp, 64); - double* PRAGMA_RESTRICT phi_xzz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_xzz_tmp, 64); - double* PRAGMA_RESTRICT phi_yyy_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_yyy_tmp, 64); - double* PRAGMA_RESTRICT phi_yyz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_yyz_tmp, 64); - double* PRAGMA_RESTRICT phi_yzz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_yzz_tmp, 64); - double* PRAGMA_RESTRICT phi_zzz_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_zzz_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - const double T4 = alpha_n2 * T3; - S3[i] += T4; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Gaussians derivs (Hessians) - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - // Gaussians 3rd derivs) - const double SXXX = S3[i] * xc[i] * xc[i] * xc[i] + 3 * xc[i] * S2[i]; - const double SXXY = S3[i] * xc[i] * xc[i] * yc[i] + yc[i] * S2[i]; - const double SXXZ = S3[i] * xc[i] * xc[i] * zc[i] + zc[i] * S2[i]; - const double SXYY = S3[i] * xc[i] * yc[i] * yc[i] + xc[i] * S2[i]; - const double SXYZ = S3[i] * xc[i] * yc[i] * zc[i]; - const double SXZZ = S3[i] * xc[i] * zc[i] * zc[i] + xc[i] * S2[i]; - const double SYYY = S3[i] * yc[i] * yc[i] * yc[i] + 3 * yc[i] * S2[i]; - const double SYYZ = S3[i] * yc[i] * yc[i] * zc[i] + zc[i] * S2[i]; - const double SYZZ = S3[i] * yc[i] * zc[i] * zc[i] + yc[i] * S2[i]; - const double SZZZ = S3[i] * zc[i] * zc[i] * zc[i] + 3 * zc[i] * S2[i]; - phi_out[start + i] = S0[i]; - - // Gradient AM=0 Component=0 - phi_x_out[start + i] = SX; - phi_y_out[start + i] = SY; - phi_z_out[start + i] = SZ; - - // Hessian AM=0 Component=0 - phi_xx_out[start + i] = SXX; - phi_yy_out[start + i] = SYY; - phi_zz_out[start + i] = SZZ; - phi_xy_out[start + i] = SXY; - phi_xz_out[start + i] = SXZ; - phi_yz_out[start + i] = SYZ; - - // Der3 AM=0 Component=0 - phi_xxx_out[start + i] = SXXX; - phi_xxy_out[start + i] = SXXY; - phi_xxz_out[start + i] = SXXZ; - phi_xyy_out[start + i] = SXYY; - phi_xyz_out[start + i] = SXYZ; - phi_xzz_out[start + i] = SXZZ; - phi_yyy_out[start + i] = SYYY; - phi_yyz_out[start + i] = SYYZ; - phi_yzz_out[start + i] = SYZZ; - 
phi_zzz_out[start + i] = SZZZ; - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - ALIGNED_FREE(phi_xx_tmp); - ALIGNED_FREE(phi_xy_tmp); - ALIGNED_FREE(phi_xz_tmp); - ALIGNED_FREE(phi_yy_tmp); - ALIGNED_FREE(phi_yz_tmp); - ALIGNED_FREE(phi_zz_tmp); - ALIGNED_FREE(phi_xxx_tmp); - ALIGNED_FREE(phi_xxy_tmp); - ALIGNED_FREE(phi_xxz_tmp); - ALIGNED_FREE(phi_xyy_tmp); - ALIGNED_FREE(phi_xyz_tmp); - ALIGNED_FREE(phi_xzz_tmp); - ALIGNED_FREE(phi_yyy_tmp); - ALIGNED_FREE(phi_yyz_tmp); - ALIGNED_FREE(phi_yzz_tmp); - ALIGNED_FREE(phi_zzz_tmp); - -} - -void gg_collocation_L1_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 3; - const unsigned long nspherical = 3; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 288 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - double* PRAGMA_RESTRICT S3 = cache_data + 256; - ASSUME_ALIGNED(S3, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - double* PRAGMA_RESTRICT phi_x_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_x_tmp, 64); - double* PRAGMA_RESTRICT phi_y_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_y_tmp, 64); - double* PRAGMA_RESTRICT phi_z_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_z_tmp, 64); - double* PRAGMA_RESTRICT phi_xx_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xx_tmp, 64); - double* PRAGMA_RESTRICT phi_xy_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xy_tmp, 64); - double* PRAGMA_RESTRICT phi_xz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xz_tmp, 64); - double* PRAGMA_RESTRICT phi_yy_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_yy_tmp, 64); - double* PRAGMA_RESTRICT phi_yz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_yz_tmp, 64); - double* PRAGMA_RESTRICT phi_zz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_zz_tmp, 64); - double* PRAGMA_RESTRICT phi_xxx_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xxx_tmp, 64); - double* PRAGMA_RESTRICT phi_xxy_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xxy_tmp, 64); - double* PRAGMA_RESTRICT phi_xxz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xxz_tmp, 64); - double* PRAGMA_RESTRICT phi_xyy_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xyy_tmp, 64); - double* PRAGMA_RESTRICT phi_xyz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xyz_tmp, 64); - double* PRAGMA_RESTRICT phi_xzz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_xzz_tmp, 64); - double* PRAGMA_RESTRICT phi_yyy_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_yyy_tmp, 64); - double* PRAGMA_RESTRICT phi_yyz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_yyz_tmp, 64); - double* PRAGMA_RESTRICT 
phi_yzz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_yzz_tmp, 64); - double* PRAGMA_RESTRICT phi_zzz_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_zzz_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? (npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - const double T4 = alpha_n2 * T3; - S3[i] += T4; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - // Gaussian derivs (gradients) - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - - // Gaussians derivs (Hessians) - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - // Gaussians 3rd derivs) - const double SXXX = S3[i] * xc[i] * xc[i] * xc[i] + 3 * xc[i] * S2[i]; - const double SXXY = S3[i] * xc[i] * xc[i] * yc[i] + yc[i] * S2[i]; - const double SXXZ = S3[i] * xc[i] * xc[i] * zc[i] + zc[i] * S2[i]; - const double SXYY = S3[i] * xc[i] * yc[i] * yc[i] + xc[i] * S2[i]; - const double SXYZ = S3[i] * xc[i] * yc[i] * zc[i]; - const double SXZZ = S3[i] * xc[i] * zc[i] * zc[i] + xc[i] * S2[i]; - const double SYYY = S3[i] * yc[i] * yc[i] * yc[i] + 3 * yc[i] * S2[i]; - const double SYYZ = S3[i] * yc[i] * yc[i] * zc[i] + 
zc[i] * S2[i]; - const double SYZZ = S3[i] * yc[i] * zc[i] * zc[i] + yc[i] * S2[i]; - const double SZZZ = S3[i] * zc[i] * zc[i] * zc[i] + 3 * zc[i] * S2[i]; - - // Density AM=1 Component=X - phi_tmp[i] = S0[i] * xc[i]; - - // Gradient AM=1 Component=X - phi_x_tmp[i] = SX * xc[i]; - phi_y_tmp[i] = SY * xc[i]; - phi_z_tmp[i] = SZ * xc[i]; - phi_x_tmp[i] += S0[i]; - - // Hessian AM=1 Component=X - phi_xx_tmp[i] = SXX * xc[i]; - phi_xx_tmp[i] += SX; - phi_xx_tmp[i] += SX; - phi_yy_tmp[i] = SYY * xc[i]; - phi_zz_tmp[i] = SZZ * xc[i]; - phi_xy_tmp[i] = SXY * xc[i]; - phi_xy_tmp[i] += SY; - phi_xz_tmp[i] = SXZ * xc[i]; - phi_xz_tmp[i] += SZ; - phi_yz_tmp[i] = SYZ * xc[i]; - phi_xyz_tmp[i] = SXYZ * xc[i]; - phi_xyz_tmp[i] += SYZ; - phi_xxy_tmp[i] = SXXY * xc[i]; - phi_xxy_tmp[i] += 2.0 * SXY; - phi_xxz_tmp[i] = SXXZ * xc[i]; - phi_xxz_tmp[i] += 2.0 * SXZ; - phi_xyy_tmp[i] = SXYY * xc[i]; - phi_xyy_tmp[i] += SYY; - phi_xzz_tmp[i] = SXZZ * xc[i]; - phi_xzz_tmp[i] += SZZ; - phi_yyz_tmp[i] = SYYZ * xc[i]; - phi_yzz_tmp[i] = SYZZ * xc[i]; - phi_xxx_tmp[i] = SXXX * xc[i]; - phi_xxx_tmp[i] += 3.0 * SXX; - phi_yyy_tmp[i] = SYYY * xc[i]; - phi_zzz_tmp[i] = SZZZ * xc[i]; - - - // Density AM=1 Component=Y - phi_tmp[32 + i] = S0[i] * yc[i]; - - // Gradient AM=1 Component=Y - phi_x_tmp[32 + i] = SX * yc[i]; - phi_y_tmp[32 + i] = SY * yc[i]; - phi_z_tmp[32 + i] = SZ * yc[i]; - phi_y_tmp[32 + i] += S0[i]; - - // Hessian AM=1 Component=Y - phi_xx_tmp[32 + i] = SXX * yc[i]; - phi_yy_tmp[32 + i] = SYY * yc[i]; - phi_yy_tmp[32 + i] += SY; - phi_yy_tmp[32 + i] += SY; - phi_zz_tmp[32 + i] = SZZ * yc[i]; - phi_xy_tmp[32 + i] = SXY * yc[i]; - phi_xy_tmp[32 + i] += SX; - phi_xz_tmp[32 + i] = SXZ * yc[i]; - phi_yz_tmp[32 + i] = SYZ * yc[i]; - phi_yz_tmp[32 + i] += SZ; - phi_xyz_tmp[32 + i] = SXYZ * yc[i]; - phi_xyz_tmp[32 + i] += SXZ; - phi_xxy_tmp[32 + i] = SXXY * yc[i]; - phi_xxy_tmp[32 + i] += SXX; - phi_xxz_tmp[32 + i] = SXXZ * yc[i]; - phi_xyy_tmp[32 + i] = SXYY * yc[i]; - phi_xyy_tmp[32 + i] += 2.0 * SXY; - phi_xzz_tmp[32 + i] = SXZZ * yc[i]; - phi_yyz_tmp[32 + i] = SYYZ * yc[i]; - phi_yyz_tmp[32 + i] += 2.0 * SYZ; - phi_yzz_tmp[32 + i] = SYZZ * yc[i]; - phi_yzz_tmp[32 + i] += SZZ; - phi_xxx_tmp[32 + i] = SXXX * yc[i]; - phi_yyy_tmp[32 + i] = SYYY * yc[i]; - phi_yyy_tmp[32 + i] += 3.0 * SYY; - phi_zzz_tmp[32 + i] = SZZZ * yc[i]; - - - // Density AM=1 Component=Z - phi_tmp[64 + i] = S0[i] * zc[i]; - - // Gradient AM=1 Component=Z - phi_x_tmp[64 + i] = SX * zc[i]; - phi_y_tmp[64 + i] = SY * zc[i]; - phi_z_tmp[64 + i] = SZ * zc[i]; - phi_z_tmp[64 + i] += S0[i]; - - // Hessian AM=1 Component=Z - phi_xx_tmp[64 + i] = SXX * zc[i]; - phi_yy_tmp[64 + i] = SYY * zc[i]; - phi_zz_tmp[64 + i] = SZZ * zc[i]; - phi_zz_tmp[64 + i] += SZ; - phi_zz_tmp[64 + i] += SZ; - phi_xy_tmp[64 + i] = SXY * zc[i]; - phi_xz_tmp[64 + i] = SXZ * zc[i]; - phi_xz_tmp[64 + i] += SX; - phi_yz_tmp[64 + i] = SYZ * zc[i]; - phi_yz_tmp[64 + i] += SY; - phi_xyz_tmp[64 + i] = SXYZ * zc[i]; - phi_xyz_tmp[64 + i] += SXY; - phi_xxy_tmp[64 + i] = SXXY * zc[i]; - phi_xxz_tmp[64 + i] = SXXZ * zc[i]; - phi_xxz_tmp[64 + i] += SXX; - phi_xyy_tmp[64 + i] = SXYY * zc[i]; - phi_xzz_tmp[64 + i] = SXZZ * zc[i]; - phi_xzz_tmp[64 + i] += 2.0 * SXZ; - phi_yyz_tmp[64 + i] = SYYZ * zc[i]; - phi_yyz_tmp[64 + i] += SYY; - phi_yzz_tmp[64 + i] = SYZZ * zc[i]; - phi_yzz_tmp[64 + i] += 2.0 * SYZ; - phi_xxx_tmp[64 + i] = SXXX * zc[i]; - phi_yyy_tmp[64 + i] = SYYY * zc[i]; - phi_zzz_tmp[64 + i] = SZZZ * zc[i]; - phi_zzz_tmp[64 + i] += 3.0 * SZZ; - - - } - - // Copy data back into 
outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_to_spherical_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_cca_cart_to_spherical_L1(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xxx_tmp, 32, (phi_xxx_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xxy_tmp, 32, (phi_xxy_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xxz_tmp, 32, (phi_xxz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xyy_tmp, 32, (phi_xyy_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xyz_tmp, 32, (phi_xyz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_xzz_tmp, 32, (phi_xzz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_yyy_tmp, 32, (phi_yyy_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_yyz_tmp, 32, (phi_yyz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_yzz_tmp, 32, (phi_yzz_out + start), npoints); - gg_cca_cart_to_spherical_L1(remain, phi_zzz_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_gaussian_cart_to_spherical_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_gaussian_cart_to_spherical_L1(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xxx_tmp, 32, (phi_xxx_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xxy_tmp, 32, (phi_xxy_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xxz_tmp, 32, (phi_xxz_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xyy_tmp, 32, (phi_xyy_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xyz_tmp, 32, (phi_xyz_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_xzz_tmp, 32, (phi_xzz_out + start), npoints); - 
gg_gaussian_cart_to_spherical_L1(remain, phi_yyy_tmp, 32, (phi_yyy_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_yyz_tmp, 32, (phi_yyz_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_yzz_tmp, 32, (phi_yzz_out + start), npoints); - gg_gaussian_cart_to_spherical_L1(remain, phi_zzz_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_cca_cart_copy_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_cca_cart_copy_L1(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xxx_tmp, 32, (phi_xxx_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xxy_tmp, 32, (phi_xxy_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xxz_tmp, 32, (phi_xxz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xyy_tmp, 32, (phi_xyy_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xyz_tmp, 32, (phi_xyz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_xzz_tmp, 32, (phi_xzz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_yyy_tmp, 32, (phi_yyy_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_yyz_tmp, 32, (phi_yyz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_yzz_tmp, 32, (phi_yzz_out + start), npoints); - gg_cca_cart_copy_L1(remain, phi_zzz_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - - // Gradient, transform data to outer temps - gg_molden_cart_copy_L1(remain, phi_x_tmp, 32, (phi_x_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_y_tmp, 32, (phi_y_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_z_tmp, 32, (phi_z_out + start), npoints); - - // Hessian, transform data to outer temps - gg_molden_cart_copy_L1(remain, phi_xx_tmp, 32, (phi_xx_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xy_tmp, 32, (phi_xy_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xz_tmp, 32, (phi_xz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_yy_tmp, 32, (phi_yy_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_yz_tmp, 32, (phi_yz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_zz_tmp, 32, (phi_zz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xxx_tmp, 32, (phi_xxx_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xxy_tmp, 32, (phi_xxy_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xxz_tmp, 32, (phi_xxz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xyy_tmp, 32, (phi_xyy_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xyz_tmp, 32, (phi_xyz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_xzz_tmp, 32, 
(phi_xzz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_yyy_tmp, 32, (phi_yyy_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_yyz_tmp, 32, (phi_yyz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_yzz_tmp, 32, (phi_yzz_out + start), npoints); - gg_molden_cart_copy_L1(remain, phi_zzz_tmp, 32, (phi_zzz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - ALIGNED_FREE(phi_x_tmp); - ALIGNED_FREE(phi_y_tmp); - ALIGNED_FREE(phi_z_tmp); - ALIGNED_FREE(phi_xx_tmp); - ALIGNED_FREE(phi_xy_tmp); - ALIGNED_FREE(phi_xz_tmp); - ALIGNED_FREE(phi_yy_tmp); - ALIGNED_FREE(phi_yz_tmp); - ALIGNED_FREE(phi_zz_tmp); - ALIGNED_FREE(phi_xxx_tmp); - ALIGNED_FREE(phi_xxy_tmp); - ALIGNED_FREE(phi_xxz_tmp); - ALIGNED_FREE(phi_xyy_tmp); - ALIGNED_FREE(phi_xyz_tmp); - ALIGNED_FREE(phi_xzz_tmp); - ALIGNED_FREE(phi_yyy_tmp); - ALIGNED_FREE(phi_yyz_tmp); - ALIGNED_FREE(phi_yzz_tmp); - ALIGNED_FREE(phi_zzz_tmp); - -} - -void gg_collocation_L2_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 6; - const unsigned long nspherical = 5; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 288 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - double* PRAGMA_RESTRICT S3 = cache_data + 256; - ASSUME_ALIGNED(S3, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate power temporaries - double* PRAGMA_RESTRICT xc_pow = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(xc_pow, 64); - double* PRAGMA_RESTRICT yc_pow = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(yc_pow, 64); - double* PRAGMA_RESTRICT zc_pow = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(zc_pow, 64); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - const double T4 = alpha_n2 * T3; - S3[i] += T4; - } - - } - - // Build powers - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - xc_pow[i] = xc[i] * xc[i]; - yc_pow[i] = yc[i] * yc[i]; - zc_pow[i] = zc[i] * zc[i]; - - } - // Combine A blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - phi_tmp[i] = xc_pow[i] * S0[i]; - phi_tmp[32 + i] = xc[i] * yc[i] * S0[i]; - phi_tmp[64 + i] = xc[i] * zc[i] * S0[i]; - phi_tmp[96 + i] = yc_pow[i] * S0[i]; - phi_tmp[128 + i] = yc[i] * zc[i] * S0[i]; - phi_tmp[160 + i] = zc_pow[i] * S0[i]; - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - // Combine X blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - - phi_tmp[i] = xc_pow[i] * SX; - phi_tmp[i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[32 + i] = xc[i] * yc[i] * SX; - phi_tmp[32 + i] += yc[i] * S0[i]; - - phi_tmp[64 + i] = xc[i] * zc[i] * SX; - phi_tmp[64 + i] += zc[i] * S0[i]; - - phi_tmp[96 + i] = yc_pow[i] * SX; - - phi_tmp[128 + i] = yc[i] * zc[i] * SX; - - phi_tmp[160 + i] = zc_pow[i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, 
(phi_x_out + start), npoints); - } - - // Combine Y blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - - phi_tmp[i] = xc_pow[i] * SY; - - phi_tmp[32 + i] = xc[i] * yc[i] * SY; - phi_tmp[32 + i] += xc[i] * S0[i]; - - phi_tmp[64 + i] = xc[i] * zc[i] * SY; - - phi_tmp[96 + i] = yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[128 + i] = yc[i] * zc[i] * SY; - phi_tmp[128 + i] += zc[i] * S0[i]; - - phi_tmp[160 + i] = zc_pow[i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } - - // Combine Z blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - - phi_tmp[i] = xc_pow[i] * SZ; - - phi_tmp[32 + i] = xc[i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SZ; - phi_tmp[64 + i] += xc[i] * S0[i]; - - phi_tmp[96 + i] = yc_pow[i] * SZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += yc[i] * S0[i]; - - phi_tmp[160 + i] = zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * zc[i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } - - // Combine XX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - - phi_tmp[i] = xc_pow[i] * SXX; - phi_tmp[i] += 4.0 * xc[i] * SX; - phi_tmp[i] += 2.0 * S0[i]; - - phi_tmp[32 + i] = xc[i] * yc[i] * SXX; - phi_tmp[32 + i] += 2.0 * yc[i] * SX; - - phi_tmp[64 + i] = xc[i] * zc[i] * SXX; - phi_tmp[64 + i] += 2.0 * zc[i] * SX; - - phi_tmp[96 + i] = yc_pow[i] * SXX; - - phi_tmp[128 + i] = yc[i] * zc[i] * SXX; - - phi_tmp[160 + i] = zc_pow[i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } - - // Combine XY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - - phi_tmp[i] = xc_pow[i] * SXY; - phi_tmp[i] += 2.0 * xc[i] * SY; - - phi_tmp[32 + i] = xc[i] * yc[i] * SXY; - phi_tmp[32 + i] += xc[i] * SX; - phi_tmp[32 + i] += yc[i] * SY; - phi_tmp[32 + i] += 1 * S0[i]; - - phi_tmp[64 + i] = xc[i] * zc[i] * SXY; - phi_tmp[64 + i] += zc[i] * SY; - - 
phi_tmp[96 + i] = yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * yc[i] * SX; - - phi_tmp[128 + i] = yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += zc[i] * SX; - - phi_tmp[160 + i] = zc_pow[i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } - - // Combine XZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - - phi_tmp[i] = xc_pow[i] * SXZ; - phi_tmp[i] += 2.0 * xc[i] * SZ; - - phi_tmp[32 + i] = xc[i] * yc[i] * SXZ; - phi_tmp[32 + i] += yc[i] * SZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc[i] * SX; - phi_tmp[64 + i] += zc[i] * SZ; - phi_tmp[64 + i] += 1 * S0[i]; - - phi_tmp[96 + i] = yc_pow[i] * SXZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += yc[i] * SX; - - phi_tmp[160 + i] = zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * zc[i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } - - // Combine YY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - - phi_tmp[i] = xc_pow[i] * SYY; - - phi_tmp[32 + i] = xc[i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * xc[i] * SY; - - phi_tmp[64 + i] = xc[i] * zc[i] * SYY; - - phi_tmp[96 + i] = yc_pow[i] * SYY; - phi_tmp[96 + i] += 4.0 * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * S0[i]; - - phi_tmp[128 + i] = yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * zc[i] * SY; - - phi_tmp[160 + i] = zc_pow[i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } - - // Combine YZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[i] * SYZ; - - phi_tmp[32 + i] = xc[i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc[i] * SZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc[i] * SY; - - phi_tmp[96 + i] = yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * yc[i] * SZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += yc[i] * SY; - phi_tmp[128 + i] += zc[i] * SZ; - 
phi_tmp[128 + i] += 1 * S0[i]; - - phi_tmp[160 + i] = zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * zc[i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } - - // Combine ZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - phi_tmp[i] = xc_pow[i] * SZZ; - - phi_tmp[32 + i] = xc[i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * xc[i] * SZ; - - phi_tmp[96 + i] = yc_pow[i] * SZZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * yc[i] * SZ; - - phi_tmp[160 + i] = zc_pow[i] * SZZ; - phi_tmp[160 + i] += 4.0 * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } - - // Combine XXX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXX = S3[i] * xc[i] * xc[i] * xc[i] + 3 * xc[i] * S2[i]; - - phi_tmp[i] = xc_pow[i] * SXXX; - phi_tmp[i] += 3.0 * 2.0 * xc[i] * SXX; - phi_tmp[i] += 3.0 * 2.0 * SX; - - phi_tmp[32 + i] = xc[i] * yc[i] * SXXX; - phi_tmp[32 + i] += 3.0 * yc[i] * SXX; - - phi_tmp[64 + i] = xc[i] * zc[i] * SXXX; - phi_tmp[64 + i] += 3.0 * zc[i] * SXX; - - phi_tmp[96 + i] = yc_pow[i] * SXXX; - - phi_tmp[128 + i] = yc[i] * zc[i] * SXXX; - - phi_tmp[160 + i] = zc_pow[i] * SXXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } - - // Combine XXY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXY = S3[i] * xc[i] * xc[i] * yc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[i] * SXXY; - phi_tmp[i] += 2.0 * 2.0 * xc[i] * SXY; - phi_tmp[i] += 2.0 * SY; - - phi_tmp[32 + i] = xc[i] * yc[i] * SXXY; - phi_tmp[32 + i] += 2.0 * yc[i] * SXY; - phi_tmp[32 + i] += xc[i] * SXX; - phi_tmp[32 + i] += 2.0 * 1 * SX; - - phi_tmp[64 + i] = xc[i] * zc[i] * SXXY; - phi_tmp[64 + i] += 2.0 * zc[i] * SXY; - - phi_tmp[96 + i] = 
yc_pow[i] * SXXY; - phi_tmp[96 + i] += 2.0 * yc[i] * SXX; - - phi_tmp[128 + i] = yc[i] * zc[i] * SXXY; - phi_tmp[128 + i] += zc[i] * SXX; - - phi_tmp[160 + i] = zc_pow[i] * SXXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } - - // Combine XXZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXZ = S3[i] * xc[i] * xc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[i] * SXXZ; - phi_tmp[i] += 2.0 * 2.0 * xc[i] * SXZ; - phi_tmp[i] += 2.0 * SZ; - - phi_tmp[32 + i] = xc[i] * yc[i] * SXXZ; - phi_tmp[32 + i] += 2.0 * yc[i] * SXZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SXXZ; - phi_tmp[64 + i] += 2.0 * zc[i] * SXZ; - phi_tmp[64 + i] += xc[i] * SXX; - phi_tmp[64 + i] += 2.0 * 1 * SX; - - phi_tmp[96 + i] = yc_pow[i] * SXXZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SXXZ; - phi_tmp[128 + i] += yc[i] * SXX; - - phi_tmp[160 + i] = zc_pow[i] * SXXZ; - phi_tmp[160 + i] += 2.0 * zc[i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } - - // Combine XYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SXYY = S3[i] * xc[i] * yc[i] * yc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[i] * SXYY; - phi_tmp[i] += 2.0 * xc[i] * SYY; - - phi_tmp[32 + i] = xc[i] * yc[i] * SXYY; - phi_tmp[32 + i] += 2.0 * xc[i] * SXY; - phi_tmp[32 + i] += yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * 1 * SY; - - phi_tmp[64 + i] = xc[i] * zc[i] * SXYY; - phi_tmp[64 + i] += zc[i] * SYY; - - phi_tmp[96 + i] = yc_pow[i] * SXYY; - phi_tmp[96 + i] += 2.0 * 2.0 * yc[i] * SXY; - phi_tmp[96 + i] += 2.0 * SX; - - phi_tmp[128 + i] = yc[i] * zc[i] * SXYY; - phi_tmp[128 + i] += 2.0 * zc[i] * SXY; - - phi_tmp[160 + i] = zc_pow[i] * SXYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } - - // Combine XYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double 
SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXYZ = S3[i] * xc[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[i] * SXYZ; - phi_tmp[i] += 2.0 * xc[i] * SYZ; - - phi_tmp[32 + i] = xc[i] * yc[i] * SXYZ; - phi_tmp[32 + i] += yc[i] * SYZ; - phi_tmp[32 + i] += xc[i] * SXZ; - phi_tmp[32 + i] += 1 * SZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SXYZ; - phi_tmp[64 + i] += zc[i] * SYZ; - phi_tmp[64 + i] += xc[i] * SXY; - phi_tmp[64 + i] += 1 * SY; - - phi_tmp[96 + i] = yc_pow[i] * SXYZ; - phi_tmp[96 + i] += 2.0 * yc[i] * SXZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SXYZ; - phi_tmp[128 + i] += zc[i] * SXZ; - phi_tmp[128 + i] += yc[i] * SXY; - phi_tmp[128 + i] += 1 * SX; - - phi_tmp[160 + i] = zc_pow[i] * SXYZ; - phi_tmp[160 + i] += 2.0 * zc[i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } - - // Combine XZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SXZZ = S3[i] * xc[i] * zc[i] * zc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[i] * SXZZ; - phi_tmp[i] += 2.0 * xc[i] * SZZ; - - phi_tmp[32 + i] = xc[i] * yc[i] * SXZZ; - phi_tmp[32 + i] += yc[i] * SZZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SXZZ; - phi_tmp[64 + i] += 2.0 * xc[i] * SXZ; - phi_tmp[64 + i] += zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * 1 * SZ; - - phi_tmp[96 + i] = yc_pow[i] * SXZZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SXZZ; - phi_tmp[128 + i] += 2.0 * yc[i] * SXZ; - - phi_tmp[160 + i] = zc_pow[i] * SXZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * zc[i] * SXZ; - phi_tmp[160 + i] += 2.0 * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } - - // Combine YYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SYYY = S3[i] * yc[i] * yc[i] * yc[i] + 3 * yc[i] * S2[i]; - - phi_tmp[i] = xc_pow[i] * SYYY; - - phi_tmp[32 + i] = xc[i] * yc[i] * SYYY; - phi_tmp[32 + i] += 3.0 * xc[i] * SYY; - - phi_tmp[64 + i] = xc[i] * zc[i] * SYYY; - - phi_tmp[96 + i] = yc_pow[i] * SYYY; - phi_tmp[96 + i] += 3.0 * 2.0 * yc[i] * SYY; - phi_tmp[96 + i] += 3.0 * 2.0 * SY; - - phi_tmp[128 + i] = yc[i] * zc[i] * SYYY; - phi_tmp[128 + i] += 3.0 * zc[i] * SYY; - - phi_tmp[160 + i] = zc_pow[i] * SYYY; - - } - - if (order == GG_SPHERICAL_CCA) { - 
gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } - - // Combine YYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SYYZ = S3[i] * yc[i] * yc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[i] * SYYZ; - - phi_tmp[32 + i] = xc[i] * yc[i] * SYYZ; - phi_tmp[32 + i] += 2.0 * xc[i] * SYZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SYYZ; - phi_tmp[64 + i] += xc[i] * SYY; - - phi_tmp[96 + i] = yc_pow[i] * SYYZ; - phi_tmp[96 + i] += 2.0 * 2.0 * yc[i] * SYZ; - phi_tmp[96 + i] += 2.0 * SZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SYYZ; - phi_tmp[128 + i] += 2.0 * zc[i] * SYZ; - phi_tmp[128 + i] += yc[i] * SYY; - phi_tmp[128 + i] += 2.0 * 1 * SY; - - phi_tmp[160 + i] = zc_pow[i] * SYYZ; - phi_tmp[160 + i] += 2.0 * zc[i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } - - // Combine YZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SYZZ = S3[i] * yc[i] * zc[i] * zc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[i] * SYZZ; - - phi_tmp[32 + i] = xc[i] * yc[i] * SYZZ; - phi_tmp[32 + i] += xc[i] * SZZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SYZZ; - phi_tmp[64 + i] += 2.0 * xc[i] * SYZ; - - phi_tmp[96 + i] = yc_pow[i] * SYZZ; - phi_tmp[96 + i] += 2.0 * yc[i] * SZZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SYZZ; - phi_tmp[128 + i] += 2.0 * yc[i] * SYZ; - phi_tmp[128 + i] += zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * 1 * SZ; - - phi_tmp[160 + i] = zc_pow[i] * SYZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * zc[i] * SYZ; - phi_tmp[160 + i] += 2.0 * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } - - // Combine ZZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SZZZ = S3[i] * zc[i] * zc[i] * zc[i] + 3 * zc[i] * S2[i]; - - phi_tmp[i] = xc_pow[i] * SZZZ; - - phi_tmp[32 + i] = 
xc[i] * yc[i] * SZZZ; - - phi_tmp[64 + i] = xc[i] * zc[i] * SZZZ; - phi_tmp[64 + i] += 3.0 * xc[i] * SZZ; - - phi_tmp[96 + i] = yc_pow[i] * SZZZ; - - phi_tmp[128 + i] = yc[i] * zc[i] * SZZZ; - phi_tmp[128 + i] += 3.0 * yc[i] * SZZ; - - phi_tmp[160 + i] = zc_pow[i] * SZZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * zc[i] * SZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * SZ; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free Power temporaries - ALIGNED_FREE(xc_pow); - ALIGNED_FREE(yc_pow); - ALIGNED_FREE(zc_pow); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L3_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 10; - const unsigned long nspherical = 7; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 288 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - double* PRAGMA_RESTRICT S3 = cache_data + 256; - ASSUME_ALIGNED(S3, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate power temporaries - double* PRAGMA_RESTRICT xc_pow = (double*)ALIGNED_MALLOC(64, 64 * sizeof(double)); - ASSUME_ALIGNED(xc_pow, 64); - double* PRAGMA_RESTRICT yc_pow = (double*)ALIGNED_MALLOC(64, 64 * sizeof(double)); - ASSUME_ALIGNED(yc_pow, 64); - double* PRAGMA_RESTRICT zc_pow = (double*)ALIGNED_MALLOC(64, 64 * sizeof(double)); - ASSUME_ALIGNED(zc_pow, 64); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 320 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - const double T4 = alpha_n2 * T3; - S3[i] += T4; - } - - } - - // Build powers - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - xc_pow[i] = xc[i] * xc[i]; - yc_pow[i] = yc[i] * yc[i]; - zc_pow[i] = zc[i] * zc[i]; - xc_pow[32 + i] = xc_pow[i] * xc[i]; - yc_pow[32 + i] = yc_pow[i] * yc[i]; - zc_pow[32 + i] = zc_pow[i] * zc[i]; - } - // Combine A blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - phi_tmp[i] = xc_pow[32 + i] * S0[i]; - phi_tmp[32 + i] = xc_pow[i] * yc[i] * S0[i]; - phi_tmp[64 + i] = xc_pow[i] * zc[i] * S0[i]; - phi_tmp[96 + i] = xc[i] * yc_pow[i] * S0[i]; - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * S0[i]; - phi_tmp[160 + i] = xc[i] * zc_pow[i] * S0[i]; - phi_tmp[192 + i] = yc_pow[32 + i] * S0[i]; - phi_tmp[224 + i] = yc_pow[i] * zc[i] * S0[i]; - phi_tmp[256 + i] = yc[i] * zc_pow[i] * S0[i]; - phi_tmp[288 + i] = zc_pow[32 + i] * S0[i]; - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - // Combine X blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SX; - phi_tmp[i] += 3.0 * xc_pow[i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SX; - phi_tmp[32 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SX; - phi_tmp[64 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * 
zc_pow[i] * SX; - phi_tmp[160 + i] += zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = yc_pow[32 + i] * SX; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } - - // Combine Y blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SY; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SY; - phi_tmp[32 + i] += xc_pow[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += xc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = zc_pow[32 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } - - // Combine Z blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SZ; - phi_tmp[64 + i] += xc_pow[i] * S0[i]; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } - - // Combine XX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const 
double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXX; - phi_tmp[i] += 6.0 * xc_pow[i] * SX; - phi_tmp[i] += 6.0 * xc[i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXX; - phi_tmp[32 + i] += 4.0 * xc[i] * yc[i] * SX; - phi_tmp[32 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXX; - phi_tmp[64 + i] += 4.0 * xc[i] * zc[i] * SX; - phi_tmp[64 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 2.0 * yc_pow[i] * SX; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 2.0 * yc[i] * zc[i] * SX; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 2.0 * zc_pow[i] * SX; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXX; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } - - // Combine XY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXY; - phi_tmp[i] += 3.0 * xc_pow[i] * SY; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[i] * SX; - phi_tmp[32 + i] += 2.0 * xc[i] * yc[i] * SY; - phi_tmp[32 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXY; - phi_tmp[64 + i] += 2.0 * xc[i] * zc[i] * SY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc[i] * yc[i] * SX; - phi_tmp[96 + i] += yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc[i] * zc[i] * SX; - phi_tmp[128 + i] += yc[i] * zc[i] * SY; - phi_tmp[128 + i] += zc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += zc_pow[i] * SY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * SX; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * SX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += zc_pow[i] * SX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } - - // Combine XZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXZ; - phi_tmp[i] += 3.0 * xc_pow[i] * SZ; - - phi_tmp[32 + i] = xc_pow[i] 
* yc[i] * SXZ; - phi_tmp[32 + i] += 2.0 * xc[i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[i] * SX; - phi_tmp[64 + i] += 2.0 * xc[i] * zc[i] * SZ; - phi_tmp[64 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc[i] * yc[i] * SX; - phi_tmp[128 + i] += yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += yc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc[i] * SX; - phi_tmp[160 + i] += zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += yc_pow[i] * SX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * SX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } - - // Combine YY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - - phi_tmp[i] = xc_pow[32 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * xc_pow[i] * SY; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 4.0 * xc[i] * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc[i] * zc[i] * SY; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * yc_pow[i] * SY; - phi_tmp[192 + i] += 6.0 * yc[i] * S0[i]; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 4.0 * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * zc_pow[i] * SY; - - phi_tmp[288 + i] = zc_pow[32 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } - - // Combine YZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[i] * SZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[i] * SY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc[i] * yc[i] * SZ; - - 
phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc[i] * yc[i] * SY; - phi_tmp[128 + i] += xc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc[i] * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc[i] * SY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * SZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[288 + i] = zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } - - // Combine ZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - phi_tmp[i] = xc_pow[32 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[i] * SZ; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 4.0 * xc[i] * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[192 + i] = yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * SZ; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 4.0 * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[288 + i] = zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * zc_pow[i] * SZ; - phi_tmp[288 + i] += 6.0 * zc[i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } - - // Combine XXX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXX = S3[i] * xc[i] * xc[i] * xc[i] + 3 * xc[i] * S2[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXXX; - phi_tmp[i] += 3.0 * 3.0 * xc_pow[i] * SXX; - phi_tmp[i] += 3.0 * 6.0 * xc[i] * SX; - phi_tmp[i] += 6.0 * S0[i]; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXXX; - phi_tmp[32 + i] += 3.0 * 2.0 * xc[i] * yc[i] * SXX; - phi_tmp[32 + i] += 3.0 * 2.0 * yc[i] * SX; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXXX; - phi_tmp[64 + i] += 3.0 * 2.0 * xc[i] * zc[i] * SXX; - phi_tmp[64 + i] += 3.0 * 2.0 * zc[i] * SX; 
- - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXXX; - phi_tmp[96 + i] += 3.0 * yc_pow[i] * SXX; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXXX; - phi_tmp[128 + i] += 3.0 * yc[i] * zc[i] * SXX; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXXX; - phi_tmp[160 + i] += 3.0 * zc_pow[i] * SXX; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXXX; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXXX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXXX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } - - // Combine XXY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXY = S3[i] * xc[i] * xc[i] * yc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXXY; - phi_tmp[i] += 2.0 * 3.0 * xc_pow[i] * SXY; - phi_tmp[i] += 6.0 * xc[i] * SY; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXXY; - phi_tmp[32 + i] += 2.0 * 2.0 * xc[i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[i] * SXX; - phi_tmp[32 + i] += 2.0 * yc[i] * SY; - phi_tmp[32 + i] += 2.0 * 2.0 * xc[i] * SX; - phi_tmp[32 + i] += 2.0 * S0[i]; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXXY; - phi_tmp[64 + i] += 2.0 * 2.0 * xc[i] * zc[i] * SXY; - phi_tmp[64 + i] += 2.0 * zc[i] * SY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXXY; - phi_tmp[96 + i] += 2.0 * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc[i] * yc[i] * SXX; - phi_tmp[96 + i] += 2.0 * 2.0 * yc[i] * SX; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXXY; - phi_tmp[128 + i] += 2.0 * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 2.0 * zc[i] * SX; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXXY; - phi_tmp[160 + i] += 2.0 * zc_pow[i] * SXY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXXY; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * SXX; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXXY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * SXX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXXY; - phi_tmp[256 + i] += zc_pow[i] * SXX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } - - // Combine XXZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXZ = S3[i] * xc[i] * xc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXXZ; - phi_tmp[i] += 2.0 * 3.0 * xc_pow[i] * SXZ; - phi_tmp[i] 
+= 6.0 * xc[i] * SZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXXZ; - phi_tmp[32 + i] += 2.0 * 2.0 * xc[i] * yc[i] * SXZ; - phi_tmp[32 + i] += 2.0 * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXXZ; - phi_tmp[64 + i] += 2.0 * 2.0 * xc[i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[i] * SXX; - phi_tmp[64 + i] += 2.0 * zc[i] * SZ; - phi_tmp[64 + i] += 2.0 * 2.0 * xc[i] * SX; - phi_tmp[64 + i] += 2.0 * S0[i]; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXXZ; - phi_tmp[96 + i] += 2.0 * yc_pow[i] * SXZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXXZ; - phi_tmp[128 + i] += 2.0 * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc[i] * yc[i] * SXX; - phi_tmp[128 + i] += 2.0 * yc[i] * SX; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXXZ; - phi_tmp[160 + i] += 2.0 * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc[i] * SXX; - phi_tmp[160 + i] += 2.0 * 2.0 * zc[i] * SX; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXXZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXXZ; - phi_tmp[224 + i] += yc_pow[i] * SXX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXXZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * SXX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXXZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } - - // Combine XYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SXYY = S3[i] * xc[i] * yc[i] * yc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXYY; - phi_tmp[i] += 3.0 * xc_pow[i] * SYY; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXYY; - phi_tmp[32 + i] += 2.0 * xc_pow[i] * SXY; - phi_tmp[32 + i] += 2.0 * xc[i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * 2.0 * xc[i] * SY; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXYY; - phi_tmp[64 + i] += 2.0 * xc[i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXYY; - phi_tmp[96 + i] += 2.0 * 2.0 * xc[i] * yc[i] * SXY; - phi_tmp[96 + i] += yc_pow[i] * SYY; - phi_tmp[96 + i] += 2.0 * xc[i] * SX; - phi_tmp[96 + i] += 2.0 * 2.0 * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * S0[i]; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXYY; - phi_tmp[128 + i] += 2.0 * xc[i] * zc[i] * SXY; - phi_tmp[128 + i] += yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * zc[i] * SY; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXYY; - phi_tmp[160 + i] += zc_pow[i] * SYY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXYY; - phi_tmp[192 + i] += 2.0 * 3.0 * yc_pow[i] * SXY; - phi_tmp[192 + i] += 6.0 * yc[i] * SX; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXYY; - phi_tmp[224 + i] += 2.0 * 2.0 * yc[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * zc[i] * SX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXYY; - phi_tmp[256 + i] += 2.0 * zc_pow[i] * SXY; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - 
gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } - - // Combine XYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXYZ = S3[i] * xc[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXYZ; - phi_tmp[i] += 3.0 * xc_pow[i] * SYZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXYZ; - phi_tmp[32 + i] += 2.0 * xc[i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[i] * SXZ; - phi_tmp[32 + i] += 2.0 * xc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXYZ; - phi_tmp[64 + i] += 2.0 * xc[i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[i] * SXY; - phi_tmp[64 + i] += 2.0 * xc[i] * SY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SXYZ; - phi_tmp[96 + i] += yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc[i] * yc[i] * SXZ; - phi_tmp[96 + i] += 2.0 * yc[i] * SZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXYZ; - phi_tmp[128 + i] += yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc[i] * yc[i] * SXY; - phi_tmp[128 + i] += zc[i] * SZ; - phi_tmp[128 + i] += yc[i] * SY; - phi_tmp[128 + i] += xc[i] * SX; - phi_tmp[128 + i] += 1 * S0[i]; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXYZ; - phi_tmp[160 + i] += zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc[i] * SXY; - phi_tmp[160 + i] += 2.0 * zc[i] * SY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXYZ; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * SXZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXYZ; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * SXZ; - phi_tmp[224 + i] += yc_pow[i] * SXY; - phi_tmp[224 + i] += 2.0 * yc[i] * SX; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXYZ; - phi_tmp[256 + i] += zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * SXY; - phi_tmp[256 + i] += 2.0 * zc[i] * SX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXYZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } - - // Combine XZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SXZZ = S3[i] * xc[i] * zc[i] * zc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SXZZ; - phi_tmp[i] += 3.0 * xc_pow[i] * SZZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SXZZ; - phi_tmp[32 + i] += 2.0 * xc[i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SXZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[i] * SXZ; - phi_tmp[64 + i] += 2.0 * xc[i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * 2.0 * xc[i] * SZ; - - phi_tmp[96 
+ i] = xc[i] * yc_pow[i] * SXZZ; - phi_tmp[96 + i] += yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SXZZ; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * SXZ; - phi_tmp[128 + i] += yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * yc[i] * SZ; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SXZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * xc[i] * zc[i] * SXZ; - phi_tmp[160 + i] += zc_pow[i] * SZZ; - phi_tmp[160 + i] += 2.0 * xc[i] * SX; - phi_tmp[160 + i] += 2.0 * 2.0 * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * S0[i]; - - phi_tmp[192 + i] = yc_pow[32 + i] * SXZZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SXZZ; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * SXZ; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SXZZ; - phi_tmp[256 + i] += 2.0 * 2.0 * yc[i] * zc[i] * SXZ; - phi_tmp[256 + i] += 2.0 * yc[i] * SX; - - phi_tmp[288 + i] = zc_pow[32 + i] * SXZZ; - phi_tmp[288 + i] += 2.0 * 3.0 * zc_pow[i] * SXZ; - phi_tmp[288 + i] += 6.0 * zc[i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } - - // Combine YYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SYYY = S3[i] * yc[i] * yc[i] * yc[i] + 3 * yc[i] * S2[i]; - - phi_tmp[i] = xc_pow[32 + i] * SYYY; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SYYY; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * SYY; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SYYY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SYYY; - phi_tmp[96 + i] += 3.0 * 2.0 * xc[i] * yc[i] * SYY; - phi_tmp[96 + i] += 3.0 * 2.0 * xc[i] * SY; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SYYY; - phi_tmp[128 + i] += 3.0 * xc[i] * zc[i] * SYY; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SYYY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SYYY; - phi_tmp[192 + i] += 3.0 * 3.0 * yc_pow[i] * SYY; - phi_tmp[192 + i] += 3.0 * 6.0 * yc[i] * SY; - phi_tmp[192 + i] += 6.0 * S0[i]; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SYYY; - phi_tmp[224 + i] += 3.0 * 2.0 * yc[i] * zc[i] * SYY; - phi_tmp[224 + i] += 3.0 * 2.0 * zc[i] * SY; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SYYY; - phi_tmp[256 + i] += 3.0 * zc_pow[i] * SYY; - - phi_tmp[288 + i] = zc_pow[32 + i] * SYYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } - - // Combine YYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SYYZ = S3[i] * yc[i] * yc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SYYZ; - - phi_tmp[32 + i] = 
xc_pow[i] * yc[i] * SYYZ; - phi_tmp[32 + i] += 2.0 * xc_pow[i] * SYZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SYYZ; - phi_tmp[64 + i] += xc_pow[i] * SYY; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SYYZ; - phi_tmp[96 + i] += 2.0 * 2.0 * xc[i] * yc[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc[i] * SZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SYYZ; - phi_tmp[128 + i] += 2.0 * xc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc[i] * yc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc[i] * SY; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SYYZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc[i] * SYY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SYYZ; - phi_tmp[192 + i] += 2.0 * 3.0 * yc_pow[i] * SYZ; - phi_tmp[192 + i] += 6.0 * yc[i] * SZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SYYZ; - phi_tmp[224 + i] += 2.0 * 2.0 * yc[i] * zc[i] * SYZ; - phi_tmp[224 + i] += yc_pow[i] * SYY; - phi_tmp[224 + i] += 2.0 * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * 2.0 * yc[i] * SY; - phi_tmp[224 + i] += 2.0 * S0[i]; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SYYZ; - phi_tmp[256 + i] += 2.0 * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * SYY; - phi_tmp[256 + i] += 2.0 * 2.0 * zc[i] * SY; - - phi_tmp[288 + i] = zc_pow[32 + i] * SYYZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } - - // Combine YZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SYZZ = S3[i] * yc[i] * zc[i] * zc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[32 + i] * SYZZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SYZZ; - phi_tmp[32 + i] += xc_pow[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SYZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[i] * SYZ; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SYZZ; - phi_tmp[96 + i] += 2.0 * xc[i] * yc[i] * SZZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SYZZ; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * SYZ; - phi_tmp[128 + i] += xc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc[i] * SZ; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SYZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * xc[i] * zc[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc[i] * SY; - - phi_tmp[192 + i] = yc_pow[32 + i] * SYZZ; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * SZZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SYZZ; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * SYZ; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * 2.0 * yc[i] * SZ; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SYZZ; - phi_tmp[256 + i] += 2.0 * 2.0 * yc[i] * zc[i] * SYZ; - phi_tmp[256 + i] += zc_pow[i] * SZZ; - phi_tmp[256 + i] += 2.0 * yc[i] * SY; - phi_tmp[256 + i] += 2.0 * 2.0 * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * S0[i]; - - phi_tmp[288 + i] = zc_pow[32 + i] * SYZZ; - phi_tmp[288 + i] += 2.0 * 3.0 * zc_pow[i] * SYZ; - phi_tmp[288 + i] += 6.0 * zc[i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } 
else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } - - // Combine ZZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SZZZ = S3[i] * zc[i] * zc[i] * zc[i] + 3 * zc[i] * S2[i]; - - phi_tmp[i] = xc_pow[32 + i] * SZZZ; - - phi_tmp[32 + i] = xc_pow[i] * yc[i] * SZZZ; - - phi_tmp[64 + i] = xc_pow[i] * zc[i] * SZZZ; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * SZZ; - - phi_tmp[96 + i] = xc[i] * yc_pow[i] * SZZZ; - - phi_tmp[128 + i] = xc[i] * yc[i] * zc[i] * SZZZ; - phi_tmp[128 + i] += 3.0 * xc[i] * yc[i] * SZZ; - - phi_tmp[160 + i] = xc[i] * zc_pow[i] * SZZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * xc[i] * zc[i] * SZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * xc[i] * SZ; - - phi_tmp[192 + i] = yc_pow[32 + i] * SZZZ; - - phi_tmp[224 + i] = yc_pow[i] * zc[i] * SZZZ; - phi_tmp[224 + i] += 3.0 * yc_pow[i] * SZZ; - - phi_tmp[256 + i] = yc[i] * zc_pow[i] * SZZZ; - phi_tmp[256 + i] += 3.0 * 2.0 * yc[i] * zc[i] * SZZ; - phi_tmp[256 + i] += 3.0 * 2.0 * yc[i] * SZ; - - phi_tmp[288 + i] = zc_pow[32 + i] * SZZZ; - phi_tmp[288 + i] += 3.0 * 3.0 * zc_pow[i] * SZZ; - phi_tmp[288 + i] += 3.0 * 6.0 * zc[i] * SZ; - phi_tmp[288 + i] += 6.0 * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free Power temporaries - ALIGNED_FREE(xc_pow); - ALIGNED_FREE(yc_pow); - ALIGNED_FREE(zc_pow); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L4_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 15; - const unsigned long nspherical = 9; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 288 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - double* PRAGMA_RESTRICT S3 = cache_data + 256; - ASSUME_ALIGNED(S3, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate power temporaries - double* PRAGMA_RESTRICT xc_pow = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(xc_pow, 64); - double* PRAGMA_RESTRICT yc_pow = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(yc_pow, 64); - double* PRAGMA_RESTRICT zc_pow = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(zc_pow, 64); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 480 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - const double T4 = alpha_n2 * T3; - S3[i] += T4; - } - - } - - // Build powers - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - xc_pow[i] = xc[i] * xc[i]; - yc_pow[i] = yc[i] * yc[i]; - zc_pow[i] = zc[i] * zc[i]; - xc_pow[32 + i] = xc_pow[i] * xc[i]; - yc_pow[32 + i] = yc_pow[i] * yc[i]; - zc_pow[32 + i] = zc_pow[i] * zc[i]; - xc_pow[64 + i] = xc_pow[32 + i] * xc[i]; - yc_pow[64 + i] = yc_pow[32 + i] * yc[i]; - zc_pow[64 + i] = zc_pow[32 + i] * zc[i]; - } - // Combine A blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - phi_tmp[i] = xc_pow[64 + i] * S0[i]; - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * S0[i]; - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * S0[i]; - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * S0[i]; - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * S0[i]; - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * S0[i]; - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * S0[i]; - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * S0[i]; - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * S0[i]; - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[320 + i] = yc_pow[64 + i] * S0[i]; - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * S0[i]; - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * S0[i]; - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[448 + i] = zc_pow[64 + i] * S0[i]; - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - // Combine X blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SX; - phi_tmp[i] += 
4.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SX; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SX; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 2.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 2.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SX; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } - - // Combine Y blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SY; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SY; - phi_tmp[32 + i] += xc_pow[32 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - 
gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } - - // Combine Z blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[64 + i] += xc_pow[32 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } - - // Combine XX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXX; - phi_tmp[i] += 8.0 * xc_pow[32 + i] * SX; - phi_tmp[i] += 12.0 * xc_pow[i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXX; - phi_tmp[32 + i] += 6.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[32 + i] += 6.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXX; - phi_tmp[64 + i] += 6.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[64 + i] += 6.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 4.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 2.0 * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 4.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 4.0 * xc[i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 2.0 * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXX; - phi_tmp[192 + i] += 2.0 * yc_pow[32 + i] * SX; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 2.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXX; - phi_tmp[288 + i] += 2.0 * zc_pow[32 + i] * SX; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXX; - - phi_tmp[352 + i] = yc_pow[32 + i] * 
zc[i] * SXX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } - - // Combine XY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXY; - phi_tmp[i] += 4.0 * xc_pow[32 + i] * SY; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[32 + i] * SX; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXY; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[96 + i] += 2.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 4.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc_pow[i] * zc[i] * SX; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += 2.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[192 + i] += yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[224 + i] += yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += xc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXY; - phi_tmp[288 + i] += zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXY; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * SX; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += zc_pow[32 + i] * SX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } - - // Combine XZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] 
* zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXZ; - phi_tmp[i] += 4.0 * xc_pow[32 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXZ; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[32 + i] * SX; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += 2.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[i] * yc[i] * SX; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[160 + i] += 2.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 4.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXZ; - phi_tmp[192 + i] += yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc[i] * yc_pow[i] * SX; - phi_tmp[224 + i] += yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[256 + i] += yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc[i] * zc_pow[i] * SX; - phi_tmp[288 + i] += zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[352 + i] += yc_pow[32 + i] * SX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } - - // Combine YY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - - phi_tmp[i] = xc_pow[64 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * xc_pow[32 + i] * SY; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 4.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[192 + i] += 6.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 4.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 
2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SYY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SYY; - phi_tmp[320 + i] += 8.0 * yc_pow[32 + i] * SY; - phi_tmp[320 + i] += 12.0 * yc_pow[i] * S0[i]; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[352 + i] += 6.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 4.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[416 + i] += 2.0 * zc_pow[32 + i] * SY; - - phi_tmp[448 + i] = zc_pow[64 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } - - // Combine YZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[32 + i] * SZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[32 + i] * SY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[i] * yc[i] * SY; - phi_tmp[128 + i] += xc_pow[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += xc[i] * yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += xc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SYZ; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * SZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[352 + i] += yc_pow[32 + i] * SY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 4.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[416 + i] += zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SYZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * SY; - - } - - if (order == 
GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } - - // Combine ZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - phi_tmp[i] = xc_pow[64 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[32 + i] * SZ; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 4.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 4.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[288 + i] += 6.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SZZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * SZ; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 4.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SZZ; - phi_tmp[448 + i] += 8.0 * zc_pow[32 + i] * SZ; - phi_tmp[448 + i] += 12.0 * zc_pow[i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } - - // Combine XXX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXX = S3[i] * xc[i] * xc[i] * xc[i] + 3 * xc[i] * S2[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXXX; - phi_tmp[i] += 3.0 * 4.0 * xc_pow[32 + i] * SXX; - phi_tmp[i] += 3.0 * 12.0 * xc_pow[i] * SX; - phi_tmp[i] += 24.0 * xc[i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXXX; - phi_tmp[32 + i] += 3.0 * 3.0 * xc_pow[i] * yc[i] * SXX; - phi_tmp[32 + i] += 3.0 * 6.0 * xc[i] * yc[i] * SX; - phi_tmp[32 + i] += 6.0 * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXXX; - phi_tmp[64 + i] += 3.0 * 3.0 * xc_pow[i] * zc[i] * SXX; - phi_tmp[64 + i] += 3.0 * 6.0 * xc[i] * zc[i] * SX; - phi_tmp[64 + i] 
+= 6.0 * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXXX; - phi_tmp[96 + i] += 3.0 * 2.0 * xc[i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 3.0 * 2.0 * yc_pow[i] * SX; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXXX; - phi_tmp[128 + i] += 3.0 * 2.0 * xc[i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 3.0 * 2.0 * yc[i] * zc[i] * SX; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXXX; - phi_tmp[160 + i] += 3.0 * 2.0 * xc[i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 3.0 * 2.0 * zc_pow[i] * SX; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXXX; - phi_tmp[192 + i] += 3.0 * yc_pow[32 + i] * SXX; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXXX; - phi_tmp[224 + i] += 3.0 * yc_pow[i] * zc[i] * SXX; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXXX; - phi_tmp[256 + i] += 3.0 * yc[i] * zc_pow[i] * SXX; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXXX; - phi_tmp[288 + i] += 3.0 * zc_pow[32 + i] * SXX; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXXX; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXXX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXXX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXXX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } - - // Combine XXY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXY = S3[i] * xc[i] * xc[i] * yc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXXY; - phi_tmp[i] += 2.0 * 4.0 * xc_pow[32 + i] * SXY; - phi_tmp[i] += 12.0 * xc_pow[i] * SY; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXXY; - phi_tmp[32 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[32 + i] * SXX; - phi_tmp[32 + i] += 6.0 * xc[i] * yc[i] * SY; - phi_tmp[32 + i] += 2.0 * 3.0 * xc_pow[i] * SX; - phi_tmp[32 + i] += 6.0 * xc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXXY; - phi_tmp[64 + i] += 2.0 * 3.0 * xc_pow[i] * zc[i] * SXY; - phi_tmp[64 + i] += 6.0 * xc[i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXXY; - phi_tmp[96 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * yc[i] * SXX; - phi_tmp[96 + i] += 2.0 * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * 4.0 * xc[i] * yc[i] * SX; - phi_tmp[96 + i] += 4.0 * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXXY; - phi_tmp[128 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc_pow[i] * zc[i] * SXX; - phi_tmp[128 + i] += 2.0 * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += 2.0 * 2.0 * xc[i] * zc[i] * SX; - phi_tmp[128 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXXY; - phi_tmp[160 + i] += 2.0 * 2.0 * xc[i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += 2.0 * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXXY; - phi_tmp[192 + i] += 2.0 * yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * xc[i] * yc_pow[i] * SXX; - phi_tmp[192 
+ i] += 2.0 * 3.0 * yc_pow[i] * SX; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXXY; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SXX; - phi_tmp[224 + i] += 2.0 * 2.0 * yc[i] * zc[i] * SX; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXXY; - phi_tmp[256 + i] += 2.0 * yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += xc[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 2.0 * zc_pow[i] * SX; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXXY; - phi_tmp[288 + i] += 2.0 * zc_pow[32 + i] * SXY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXXY; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * SXX; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXXY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * SXX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXXY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * SXX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXXY; - phi_tmp[416 + i] += zc_pow[32 + i] * SXX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } - - // Combine XXZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXZ = S3[i] * xc[i] * xc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXXZ; - phi_tmp[i] += 2.0 * 4.0 * xc_pow[32 + i] * SXZ; - phi_tmp[i] += 12.0 * xc_pow[i] * SZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXXZ; - phi_tmp[32 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * SXZ; - phi_tmp[32 + i] += 6.0 * xc[i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXXZ; - phi_tmp[64 + i] += 2.0 * 3.0 * xc_pow[i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[32 + i] * SXX; - phi_tmp[64 + i] += 6.0 * xc[i] * zc[i] * SZ; - phi_tmp[64 + i] += 2.0 * 3.0 * xc_pow[i] * SX; - phi_tmp[64 + i] += 6.0 * xc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXXZ; - phi_tmp[96 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += 2.0 * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXXZ; - phi_tmp[128 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[i] * yc[i] * SXX; - phi_tmp[128 + i] += 2.0 * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 2.0 * 2.0 * xc[i] * yc[i] * SX; - phi_tmp[128 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXXZ; - phi_tmp[160 + i] += 2.0 * 2.0 * xc[i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * zc[i] * SXX; - phi_tmp[160 + i] += 2.0 * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * 4.0 * xc[i] * zc[i] * SX; - phi_tmp[160 + i] += 4.0 * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXXZ; - phi_tmp[192 + i] += 2.0 * yc_pow[32 + i] * SXZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXXZ; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc[i] * yc_pow[i] * SXX; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * SX; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] 
* SXXZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SXX; - phi_tmp[256 + i] += 2.0 * 2.0 * yc[i] * zc[i] * SX; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXXZ; - phi_tmp[288 + i] += 2.0 * zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc[i] * zc_pow[i] * SXX; - phi_tmp[288 + i] += 2.0 * 3.0 * zc_pow[i] * SX; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXXZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXXZ; - phi_tmp[352 + i] += yc_pow[32 + i] * SXX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXXZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * SXX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXXZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * SXX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXXZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } - - // Combine XYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SXYY = S3[i] * xc[i] * yc[i] * yc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXYY; - phi_tmp[i] += 4.0 * xc_pow[32 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXYY; - phi_tmp[32 + i] += 2.0 * xc_pow[32 + i] * SXY; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * 3.0 * xc_pow[i] * SY; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXYY; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXYY; - phi_tmp[96 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc[i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * SX; - phi_tmp[96 + i] += 2.0 * 4.0 * xc[i] * yc[i] * SY; - phi_tmp[96 + i] += 4.0 * xc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXYY; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * zc[i] * SXY; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * 2.0 * xc[i] * zc[i] * SY; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXYY; - phi_tmp[160 + i] += 2.0 * xc[i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXYY; - phi_tmp[192 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * SXY; - phi_tmp[192 + i] += yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * xc[i] * yc[i] * SX; - phi_tmp[192 + i] += 2.0 * 3.0 * yc_pow[i] * SY; - phi_tmp[192 + i] += 6.0 * yc[i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXYY; - phi_tmp[224 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc[i] * SXY; - phi_tmp[224 + i] += yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 2.0 * xc[i] * zc[i] * SX; - phi_tmp[224 + i] += 2.0 * 2.0 * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXYY; - phi_tmp[256 + i] += 2.0 * xc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * zc_pow[i] * SY; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXYY; - 
phi_tmp[288 + i] += zc_pow[32 + i] * SYY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXYY; - phi_tmp[320 + i] += 2.0 * 4.0 * yc_pow[32 + i] * SXY; - phi_tmp[320 + i] += 12.0 * yc_pow[i] * SX; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXYY; - phi_tmp[352 + i] += 2.0 * 3.0 * yc_pow[i] * zc[i] * SXY; - phi_tmp[352 + i] += 6.0 * yc[i] * zc[i] * SX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXYY; - phi_tmp[384 + i] += 2.0 * 2.0 * yc[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * zc_pow[i] * SX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXYY; - phi_tmp[416 + i] += 2.0 * zc_pow[32 + i] * SXY; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } - - // Combine XYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXYZ = S3[i] * xc[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXYZ; - phi_tmp[i] += 4.0 * xc_pow[32 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXYZ; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[32 + i] * SXZ; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * SZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXYZ; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[32 + i] * SXY; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * SY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXYZ; - phi_tmp[96 + i] += 2.0 * xc[i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * yc[i] * SXZ; - phi_tmp[96 + i] += 4.0 * xc[i] * yc[i] * SZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXYZ; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[i] * yc[i] * SXY; - phi_tmp[128 + i] += 2.0 * xc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * SY; - phi_tmp[128 + i] += xc_pow[i] * SX; - phi_tmp[128 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXYZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * zc[i] * SXY; - phi_tmp[160 + i] += 4.0 * xc[i] * zc[i] * SY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXYZ; - phi_tmp[192 + i] += yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * xc[i] * yc_pow[i] * SXZ; - phi_tmp[192 + i] += 3.0 * yc_pow[i] * SZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXYZ; - phi_tmp[224 + i] += yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc[i] * yc_pow[i] * SXY; - phi_tmp[224 + i] += 2.0 * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * SX; - phi_tmp[224 + i] += 4.0 * yc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXYZ; - phi_tmp[256 + i] += yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += xc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + 
i] += 2.0 * xc[i] * yc[i] * zc[i] * SXY; - phi_tmp[256 + i] += zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += 2.0 * xc[i] * zc[i] * SX; - phi_tmp[256 + i] += zc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXYZ; - phi_tmp[288 + i] += zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * xc[i] * zc_pow[i] * SXY; - phi_tmp[288 + i] += 3.0 * zc_pow[i] * SY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXYZ; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * SXZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXYZ; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * SXZ; - phi_tmp[352 + i] += yc_pow[32 + i] * SXY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * SX; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXYZ; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * SXY; - phi_tmp[384 + i] += 4.0 * yc[i] * zc[i] * SX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXYZ; - phi_tmp[416 + i] += zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * SXY; - phi_tmp[416 + i] += 3.0 * zc_pow[i] * SX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXYZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } - - // Combine XZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SXZZ = S3[i] * xc[i] * zc[i] * zc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SXZZ; - phi_tmp[i] += 4.0 * xc_pow[32 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SXZZ; - phi_tmp[32 + i] += 3.0 * xc_pow[i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SXZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[32 + i] * SXZ; - phi_tmp[64 + i] += 3.0 * xc_pow[i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * 3.0 * xc_pow[i] * SZ; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SXZZ; - phi_tmp[96 + i] += 2.0 * xc[i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SXZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * yc[i] * SXZ; - phi_tmp[128 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * 2.0 * xc[i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SXZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * xc_pow[i] * zc[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc[i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * SX; - phi_tmp[160 + i] += 2.0 * 4.0 * xc[i] * zc[i] * SZ; - phi_tmp[160 + i] += 4.0 * xc[i] * S0[i]; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SXZZ; - phi_tmp[192 + i] += yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SXZZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * SXZ; - phi_tmp[224 + i] += yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * SZ; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SXZZ; - phi_tmp[256 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc[i] * SXZ; - phi_tmp[256 + i] += yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 2.0 * xc[i] 
* yc[i] * SX; - phi_tmp[256 + i] += 2.0 * 2.0 * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * yc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SXZZ; - phi_tmp[288 + i] += 2.0 * 3.0 * xc[i] * zc_pow[i] * SXZ; - phi_tmp[288 + i] += zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * xc[i] * zc[i] * SX; - phi_tmp[288 + i] += 2.0 * 3.0 * zc_pow[i] * SZ; - phi_tmp[288 + i] += 6.0 * zc[i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SXZZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SXZZ; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * SXZ; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SXZZ; - phi_tmp[384 + i] += 2.0 * 2.0 * yc_pow[i] * zc[i] * SXZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * SX; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SXZZ; - phi_tmp[416 + i] += 2.0 * 3.0 * yc[i] * zc_pow[i] * SXZ; - phi_tmp[416 + i] += 6.0 * yc[i] * zc[i] * SX; - - phi_tmp[448 + i] = zc_pow[64 + i] * SXZZ; - phi_tmp[448 + i] += 2.0 * 4.0 * zc_pow[32 + i] * SXZ; - phi_tmp[448 + i] += 12.0 * zc_pow[i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } - - // Combine YYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SYYY = S3[i] * yc[i] * yc[i] * yc[i] + 3 * yc[i] * S2[i]; - - phi_tmp[i] = xc_pow[64 + i] * SYYY; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SYYY; - phi_tmp[32 + i] += 3.0 * xc_pow[32 + i] * SYY; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SYYY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SYYY; - phi_tmp[96 + i] += 3.0 * 2.0 * xc_pow[i] * yc[i] * SYY; - phi_tmp[96 + i] += 3.0 * 2.0 * xc_pow[i] * SY; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SYYY; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * zc[i] * SYY; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SYYY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SYYY; - phi_tmp[192 + i] += 3.0 * 3.0 * xc[i] * yc_pow[i] * SYY; - phi_tmp[192 + i] += 3.0 * 6.0 * xc[i] * yc[i] * SY; - phi_tmp[192 + i] += 6.0 * xc[i] * S0[i]; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SYYY; - phi_tmp[224 + i] += 3.0 * 2.0 * xc[i] * yc[i] * zc[i] * SYY; - phi_tmp[224 + i] += 3.0 * 2.0 * xc[i] * zc[i] * SY; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SYYY; - phi_tmp[256 + i] += 3.0 * xc[i] * zc_pow[i] * SYY; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SYYY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SYYY; - phi_tmp[320 + i] += 3.0 * 4.0 * yc_pow[32 + i] * SYY; - phi_tmp[320 + i] += 3.0 * 12.0 * yc_pow[i] * SY; - phi_tmp[320 + i] += 24.0 * yc[i] * S0[i]; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SYYY; - phi_tmp[352 + i] += 3.0 * 3.0 * yc_pow[i] * zc[i] * SYY; - phi_tmp[352 + i] += 3.0 * 6.0 * yc[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * zc[i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SYYY; - phi_tmp[384 + i] += 3.0 * 2.0 * yc[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 3.0 * 2.0 * zc_pow[i] * SY; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SYYY; - phi_tmp[416 + i] += 3.0 * zc_pow[32 + i] * SYY; - - phi_tmp[448 + i] = zc_pow[64 
+ i] * SYYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } - - // Combine YYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SYYZ = S3[i] * yc[i] * yc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SYYZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SYYZ; - phi_tmp[32 + i] += 2.0 * xc_pow[32 + i] * SYZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SYYZ; - phi_tmp[64 + i] += xc_pow[32 + i] * SYY; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SYYZ; - phi_tmp[96 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SYYZ; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[i] * yc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * SY; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SYYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * zc[i] * SYY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SYYZ; - phi_tmp[192 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * SYZ; - phi_tmp[192 + i] += 6.0 * xc[i] * yc[i] * SZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SYYZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc[i] * SYZ; - phi_tmp[224 + i] += xc[i] * yc_pow[i] * SYY; - phi_tmp[224 + i] += 2.0 * xc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc[i] * yc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SYYZ; - phi_tmp[256 + i] += 2.0 * xc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SYY; - phi_tmp[256 + i] += 2.0 * 2.0 * xc[i] * zc[i] * SY; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SYYZ; - phi_tmp[288 + i] += 3.0 * xc[i] * zc_pow[i] * SYY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SYYZ; - phi_tmp[320 + i] += 2.0 * 4.0 * yc_pow[32 + i] * SYZ; - phi_tmp[320 + i] += 12.0 * yc_pow[i] * SZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SYYZ; - phi_tmp[352 + i] += 2.0 * 3.0 * yc_pow[i] * zc[i] * SYZ; - phi_tmp[352 + i] += yc_pow[32 + i] * SYY; - phi_tmp[352 + i] += 6.0 * yc[i] * zc[i] * SZ; - phi_tmp[352 + i] += 2.0 * 3.0 * yc_pow[i] * SY; - phi_tmp[352 + i] += 6.0 * yc[i] * S0[i]; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SYYZ; - phi_tmp[384 + i] += 2.0 * 2.0 * yc[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * SYY; - phi_tmp[384 + i] += 2.0 * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * 4.0 * yc[i] * zc[i] * SY; - phi_tmp[384 + i] += 4.0 * zc[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SYYZ; - phi_tmp[416 + i] += 2.0 * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * SYY; - phi_tmp[416 + i] += 2.0 * 3.0 * zc_pow[i] * SY; - - phi_tmp[448 + i] = zc_pow[64 + i] * SYYZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == 
GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } - - // Combine YZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SYZZ = S3[i] * yc[i] * zc[i] * zc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[64 + i] * SYZZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SYZZ; - phi_tmp[32 + i] += xc_pow[32 + i] * SZZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SYZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[32 + i] * SYZ; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SYZZ; - phi_tmp[96 + i] += 2.0 * xc_pow[i] * yc[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SYZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * yc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[i] * SZ; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SYZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * xc_pow[i] * zc[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[i] * SY; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SYZZ; - phi_tmp[192 + i] += 3.0 * xc[i] * yc_pow[i] * SZZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SYZZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * SYZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc[i] * yc[i] * SZ; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SYZZ; - phi_tmp[256 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc[i] * SYZ; - phi_tmp[256 + i] += xc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * SY; - phi_tmp[256 + i] += 2.0 * 2.0 * xc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc[i] * S0[i]; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SYZZ; - phi_tmp[288 + i] += 2.0 * 3.0 * xc[i] * zc_pow[i] * SYZ; - phi_tmp[288 + i] += 6.0 * xc[i] * zc[i] * SY; - - phi_tmp[320 + i] = yc_pow[64 + i] * SYZZ; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * SZZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SYZZ; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * SYZ; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * 3.0 * yc_pow[i] * SZ; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SYZZ; - phi_tmp[384 + i] += 2.0 * 2.0 * yc_pow[i] * zc[i] * SYZ; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * 4.0 * yc[i] * zc[i] * SZ; - phi_tmp[384 + i] += 4.0 * yc[i] * S0[i]; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SYZZ; - phi_tmp[416 + i] += 2.0 * 3.0 * yc[i] * zc_pow[i] * SYZ; - phi_tmp[416 + i] += zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * yc[i] * zc[i] * SY; - phi_tmp[416 + i] += 2.0 * 3.0 * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * zc[i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SYZZ; - phi_tmp[448 + i] += 2.0 * 4.0 * zc_pow[32 + i] * SYZ; - phi_tmp[448 + i] += 12.0 * zc_pow[i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - 
gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } - - // Combine ZZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SZZZ = S3[i] * zc[i] * zc[i] * zc[i] + 3 * zc[i] * S2[i]; - - phi_tmp[i] = xc_pow[64 + i] * SZZZ; - - phi_tmp[32 + i] = xc_pow[32 + i] * yc[i] * SZZZ; - - phi_tmp[64 + i] = xc_pow[32 + i] * zc[i] * SZZZ; - phi_tmp[64 + i] += 3.0 * xc_pow[32 + i] * SZZ; - - phi_tmp[96 + i] = xc_pow[i] * yc_pow[i] * SZZZ; - - phi_tmp[128 + i] = xc_pow[i] * yc[i] * zc[i] * SZZZ; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * SZZ; - - phi_tmp[160 + i] = xc_pow[i] * zc_pow[i] * SZZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * xc_pow[i] * zc[i] * SZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * xc_pow[i] * SZ; - - phi_tmp[192 + i] = xc[i] * yc_pow[32 + i] * SZZZ; - - phi_tmp[224 + i] = xc[i] * yc_pow[i] * zc[i] * SZZZ; - phi_tmp[224 + i] += 3.0 * xc[i] * yc_pow[i] * SZZ; - - phi_tmp[256 + i] = xc[i] * yc[i] * zc_pow[i] * SZZZ; - phi_tmp[256 + i] += 3.0 * 2.0 * xc[i] * yc[i] * zc[i] * SZZ; - phi_tmp[256 + i] += 3.0 * 2.0 * xc[i] * yc[i] * SZ; - - phi_tmp[288 + i] = xc[i] * zc_pow[32 + i] * SZZZ; - phi_tmp[288 + i] += 3.0 * 3.0 * xc[i] * zc_pow[i] * SZZ; - phi_tmp[288 + i] += 3.0 * 6.0 * xc[i] * zc[i] * SZ; - phi_tmp[288 + i] += 6.0 * xc[i] * S0[i]; - - phi_tmp[320 + i] = yc_pow[64 + i] * SZZZ; - - phi_tmp[352 + i] = yc_pow[32 + i] * zc[i] * SZZZ; - phi_tmp[352 + i] += 3.0 * yc_pow[32 + i] * SZZ; - - phi_tmp[384 + i] = yc_pow[i] * zc_pow[i] * SZZZ; - phi_tmp[384 + i] += 3.0 * 2.0 * yc_pow[i] * zc[i] * SZZ; - phi_tmp[384 + i] += 3.0 * 2.0 * yc_pow[i] * SZ; - - phi_tmp[416 + i] = yc[i] * zc_pow[32 + i] * SZZZ; - phi_tmp[416 + i] += 3.0 * 3.0 * yc[i] * zc_pow[i] * SZZ; - phi_tmp[416 + i] += 3.0 * 6.0 * yc[i] * zc[i] * SZ; - phi_tmp[416 + i] += 6.0 * yc[i] * S0[i]; - - phi_tmp[448 + i] = zc_pow[64 + i] * SZZZ; - phi_tmp[448 + i] += 3.0 * 4.0 * zc_pow[32 + i] * SZZ; - phi_tmp[448 + i] += 3.0 * 12.0 * zc_pow[i] * SZ; - phi_tmp[448 + i] += 24.0 * zc[i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free Power temporaries - ALIGNED_FREE(xc_pow); - ALIGNED_FREE(yc_pow); - ALIGNED_FREE(zc_pow); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L5_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* 
PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 21; - const unsigned long nspherical = 11; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 288 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - double* PRAGMA_RESTRICT S1 = cache_data + 192; - ASSUME_ALIGNED(S1, 64); - double* PRAGMA_RESTRICT S2 = cache_data + 224; - ASSUME_ALIGNED(S2, 64); - double* PRAGMA_RESTRICT S3 = cache_data + 256; - ASSUME_ALIGNED(S3, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate power temporaries - double* PRAGMA_RESTRICT xc_pow = (double*)ALIGNED_MALLOC(64, 128 * sizeof(double)); - ASSUME_ALIGNED(xc_pow, 64); - double* PRAGMA_RESTRICT yc_pow = (double*)ALIGNED_MALLOC(64, 128 * sizeof(double)); - ASSUME_ALIGNED(yc_pow, 64); - double* PRAGMA_RESTRICT zc_pow = (double*)ALIGNED_MALLOC(64, 128 * sizeof(double)); - ASSUME_ALIGNED(zc_pow, 64); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 672 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - double AX, AY, AZ; - double AXX, AXY, AXZ, AYY, AYZ, AZZ; - double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - expn2[i] = -2.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - S1[i] = 0.0; - S2[i] = 0.0; - S3[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - const double alpha_n2 = expn2[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - const double T2 = alpha_n2 * T1; - S1[i] += T2; - const double T3 = alpha_n2 * T2; - S2[i] += T3; - const double T4 = alpha_n2 * T3; - S3[i] += T4; - } - - } - - // Build powers - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - xc_pow[i] = xc[i] * xc[i]; - yc_pow[i] = yc[i] * yc[i]; - zc_pow[i] = zc[i] * zc[i]; - xc_pow[32 + i] = xc_pow[i] * xc[i]; - yc_pow[32 + i] = yc_pow[i] * yc[i]; - zc_pow[32 + i] = zc_pow[i] * zc[i]; - xc_pow[64 + i] = xc_pow[32 + i] * xc[i]; - yc_pow[64 + i] = yc_pow[32 + i] * yc[i]; - zc_pow[64 + i] = zc_pow[32 + i] * zc[i]; - xc_pow[96 + i] = xc_pow[64 + i] * xc[i]; - yc_pow[96 + i] = yc_pow[64 + i] * yc[i]; - zc_pow[96 + i] = zc_pow[64 + i] * zc[i]; - } - // Combine A blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - phi_tmp[i] = xc_pow[96 + i] * S0[i]; - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * S0[i]; - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * S0[i]; - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * S0[i]; - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * S0[i]; - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * S0[i]; - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * S0[i]; - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * S0[i]; - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * S0[i]; - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * S0[i]; - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * S0[i]; - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * S0[i]; - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * S0[i]; - phi_tmp[480 + i] = yc_pow[96 + i] * S0[i]; - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * S0[i]; - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * S0[i]; - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * S0[i]; - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * S0[i]; - phi_tmp[640 + i] = zc_pow[96 + i] * S0[i]; - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == 
GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - // Combine X blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SX; - phi_tmp[i] += 5.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SX; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SX; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 3.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 3.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += 2.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += 2.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SX; - phi_tmp[320 + i] += yc_pow[64 + i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SX; - phi_tmp[352 + i] += yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SX; - phi_tmp[448 + i] += zc_pow[64 + i] * S0[i]; - - phi_tmp[480 + i] = yc_pow[96 + i] * SX; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } - - // Combine Y blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SY; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SY; - phi_tmp[32 + i] += xc_pow[64 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SY; - - phi_tmp[192 
+ i] = xc_pow[i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SY; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SY; - phi_tmp[608 + i] += zc_pow[64 + i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } - - // Combine Z blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SZ; - phi_tmp[64 + i] += xc_pow[64 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[480 + i] = yc_pow[96 + i] 
* SZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[512 + i] += yc_pow[64 + i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } - - // Combine XX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXX; - phi_tmp[i] += 10.0 * xc_pow[64 + i] * SX; - phi_tmp[i] += 20.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXX; - phi_tmp[32 + i] += 8.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[32 + i] += 12.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXX; - phi_tmp[64 + i] += 8.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[64 + i] += 12.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 6.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 6.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 6.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 6.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 6.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXX; - phi_tmp[192 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += 2.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[224 + i] += 4.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 4.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 2.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXX; - phi_tmp[288 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += 2.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXX; - phi_tmp[320 + i] += 2.0 * yc_pow[64 + i] * SX; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXX; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXX; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXX; - phi_tmp[416 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXX; - phi_tmp[448 + i] += 2.0 * zc_pow[64 + i] * SX; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXX; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXX; - - phi_tmp[544 + i] = 
yc_pow[32 + i] * zc_pow[i] * SXX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } - - // Combine XY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXY; - phi_tmp[i] += 5.0 * xc_pow[64 + i] * SY; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[64 + i] * SX; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXY; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[96 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 6.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc_pow[32 + i] * zc[i] * SX; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[192 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 6.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 4.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += xc_pow[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += 2.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[288 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXY; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[320 + i] += yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[352 + i] += yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += xc[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * 
zc_pow[64 + i] * SXY; - phi_tmp[448 + i] += zc_pow[64 + i] * SY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXY; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * SX; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXY; - phi_tmp[608 + i] += zc_pow[64 + i] * SX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } - - // Combine XZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXZ; - phi_tmp[i] += 5.0 * xc_pow[64 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXZ; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[64 + i] * SX; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[32 + i] * yc[i] * SX; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[160 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 6.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXZ; - phi_tmp[192 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc_pow[i] * yc_pow[i] * SX; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 4.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[288 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 6.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXZ; - phi_tmp[320 + i] += yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[352 + i] += xc[i] * yc_pow[32 + i] * SX; - phi_tmp[352 + i] += yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[384 + i] += 
yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[416 + i] += yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[448 + i] += zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[512 + i] += yc_pow[64 + i] * SX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } - - // Combine YY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - - phi_tmp[i] = xc_pow[96 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * xc_pow[64 + i] * SY; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[192 + i] += 6.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 4.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SYY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SYY; - phi_tmp[320 + i] += 8.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[320 + i] += 12.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 4.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[416 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SYY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SYY; - 
phi_tmp[480 + i] += 10.0 * yc_pow[64 + i] * SY; - phi_tmp[480 + i] += 20.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SYY; - phi_tmp[512 + i] += 8.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[512 + i] += 12.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SYY; - phi_tmp[544 + i] += 6.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 6.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SYY; - phi_tmp[576 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SYY; - phi_tmp[608 + i] += 2.0 * zc_pow[64 + i] * SY; - - phi_tmp[640 + i] = zc_pow[96 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } - - // Combine YZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[64 + i] * SZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[64 + i] * SY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[32 + i] * yc[i] * SY; - phi_tmp[128 + i] += xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[32 + i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += xc_pow[i] * yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SYZ; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[352 + i] += xc[i] * yc_pow[32 + i] * SY; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[384 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += 3.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[416 + i] += xc[i] * zc_pow[32 + i] * SZ; - 
phi_tmp[416 + i] += 3.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SYZ; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * SZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[512 + i] += yc_pow[64 + i] * SY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 6.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 6.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[608 + i] += zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SYZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } - - // Combine ZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - phi_tmp[i] = xc_pow[96 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[64 + i] * SZ; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 4.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[288 + i] += 6.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SZZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 4.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[448 + i] += 8.0 * xc[i] 
* zc_pow[32 + i] * SZ; - phi_tmp[448 + i] += 12.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[480 + i] = yc_pow[96 + i] * SZZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * SZ; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[544 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[576 + i] += 6.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[576 + i] += 6.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[608 + i] += 8.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[608 + i] += 12.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SZZ; - phi_tmp[640 + i] += 10.0 * zc_pow[64 + i] * SZ; - phi_tmp[640 + i] += 20.0 * zc_pow[32 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } - - // Combine XXX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXX = S3[i] * xc[i] * xc[i] * xc[i] + 3 * xc[i] * S2[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXXX; - phi_tmp[i] += 3.0 * 5.0 * xc_pow[64 + i] * SXX; - phi_tmp[i] += 3.0 * 20.0 * xc_pow[32 + i] * SX; - phi_tmp[i] += 60.0 * xc_pow[i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXXX; - phi_tmp[32 + i] += 3.0 * 4.0 * xc_pow[32 + i] * yc[i] * SXX; - phi_tmp[32 + i] += 3.0 * 12.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[32 + i] += 24.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXXX; - phi_tmp[64 + i] += 3.0 * 4.0 * xc_pow[32 + i] * zc[i] * SXX; - phi_tmp[64 + i] += 3.0 * 12.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[64 + i] += 24.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXXX; - phi_tmp[96 + i] += 3.0 * 3.0 * xc_pow[i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 3.0 * 6.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 6.0 * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXXX; - phi_tmp[128 + i] += 3.0 * 3.0 * xc_pow[i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 3.0 * 6.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 6.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXXX; - phi_tmp[160 + i] += 3.0 * 3.0 * xc_pow[i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 3.0 * 6.0 * xc[i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 6.0 * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXXX; - phi_tmp[192 + i] += 3.0 * 2.0 * xc[i] * yc_pow[32 + i] * SXX; - phi_tmp[192 + i] += 3.0 * 2.0 * yc_pow[32 + i] * SX; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXXX; - phi_tmp[224 + i] += 3.0 * 2.0 * xc[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[224 + i] += 3.0 * 2.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXXX; - phi_tmp[256 + i] += 3.0 * 2.0 * xc[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 3.0 * 2.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * 
SXXX; - phi_tmp[288 + i] += 3.0 * 2.0 * xc[i] * zc_pow[32 + i] * SXX; - phi_tmp[288 + i] += 3.0 * 2.0 * zc_pow[32 + i] * SX; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXXX; - phi_tmp[320 + i] += 3.0 * yc_pow[64 + i] * SXX; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXXX; - phi_tmp[352 + i] += 3.0 * yc_pow[32 + i] * zc[i] * SXX; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXXX; - phi_tmp[384 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SXX; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXXX; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[32 + i] * SXX; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXXX; - phi_tmp[448 + i] += 3.0 * zc_pow[64 + i] * SXX; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXXX; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXXX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXXX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXXX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXXX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } - - // Combine XXY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXY = S3[i] * xc[i] * xc[i] * yc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXXY; - phi_tmp[i] += 2.0 * 5.0 * xc_pow[64 + i] * SXY; - phi_tmp[i] += 20.0 * xc_pow[32 + i] * SY; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXXY; - phi_tmp[32 + i] += 2.0 * 4.0 * xc_pow[32 + i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[64 + i] * SXX; - phi_tmp[32 + i] += 12.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[32 + i] += 2.0 * 4.0 * xc_pow[32 + i] * SX; - phi_tmp[32 + i] += 12.0 * xc_pow[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXXY; - phi_tmp[64 + i] += 2.0 * 4.0 * xc_pow[32 + i] * zc[i] * SXY; - phi_tmp[64 + i] += 12.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXXY; - phi_tmp[96 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SXX; - phi_tmp[96 + i] += 6.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * 6.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[96 + i] += 12.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXXY; - phi_tmp[128 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc_pow[32 + i] * zc[i] * SXX; - phi_tmp[128 + i] += 6.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += 2.0 * 3.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[128 + i] += 6.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXXY; - phi_tmp[160 + i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += 6.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXXY; - phi_tmp[192 + i] += 2.0 * 2.0 * xc[i] * yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SXX; - phi_tmp[192 + i] += 2.0 * yc_pow[32 + i] * SY; - phi_tmp[192 + 
i] += 2.0 * 6.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[192 + i] += 6.0 * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXXY; - phi_tmp[224 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SXX; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[224 + i] += 4.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXXY; - phi_tmp[256 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += xc_pow[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 2.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += 2.0 * 2.0 * xc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 2.0 * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXXY; - phi_tmp[288 + i] += 2.0 * 2.0 * xc[i] * zc_pow[32 + i] * SXY; - phi_tmp[288 + i] += 2.0 * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXXY; - phi_tmp[320 + i] += 2.0 * yc_pow[64 + i] * SXY; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SXX; - phi_tmp[320 + i] += 2.0 * 4.0 * yc_pow[32 + i] * SX; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXXY; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[352 + i] += 2.0 * 3.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXXY; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[384 + i] += 2.0 * 2.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXXY; - phi_tmp[416 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += xc[i] * zc_pow[32 + i] * SXX; - phi_tmp[416 + i] += 2.0 * zc_pow[32 + i] * SX; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXXY; - phi_tmp[448 + i] += 2.0 * zc_pow[64 + i] * SXY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXXY; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * SXX; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXXY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SXX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXXY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SXX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXXY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SXX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXXY; - phi_tmp[608 + i] += zc_pow[64 + i] * SXX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } - - // Combine XXZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXZ = S3[i] * xc[i] * xc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXXZ; - phi_tmp[i] += 2.0 * 5.0 * xc_pow[64 + i] * SXZ; - phi_tmp[i] += 20.0 * xc_pow[32 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[64 + 
i] * yc[i] * SXXZ; - phi_tmp[32 + i] += 2.0 * 4.0 * xc_pow[32 + i] * yc[i] * SXZ; - phi_tmp[32 + i] += 12.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXXZ; - phi_tmp[64 + i] += 2.0 * 4.0 * xc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[64 + i] * SXX; - phi_tmp[64 + i] += 12.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[64 + i] += 2.0 * 4.0 * xc_pow[32 + i] * SX; - phi_tmp[64 + i] += 12.0 * xc_pow[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXXZ; - phi_tmp[96 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += 6.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXXZ; - phi_tmp[128 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[32 + i] * yc[i] * SXX; - phi_tmp[128 + i] += 6.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[128 + i] += 6.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXXZ; - phi_tmp[160 + i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SXX; - phi_tmp[160 + i] += 6.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * 6.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[160 + i] += 12.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXXZ; - phi_tmp[192 + i] += 2.0 * 2.0 * xc[i] * yc_pow[32 + i] * SXZ; - phi_tmp[192 + i] += 2.0 * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXXZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc_pow[i] * yc_pow[i] * SXX; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[224 + i] += 2.0 * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXXZ; - phi_tmp[256 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SXX; - phi_tmp[256 + i] += 2.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[256 + i] += 4.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXXZ; - phi_tmp[288 + i] += 2.0 * 2.0 * xc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SXX; - phi_tmp[288 + i] += 2.0 * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 2.0 * 6.0 * xc[i] * zc_pow[i] * SX; - phi_tmp[288 + i] += 6.0 * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXXZ; - phi_tmp[320 + i] += 2.0 * yc_pow[64 + i] * SXZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXXZ; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[352 + i] += xc[i] * yc_pow[32 + i] * SXX; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * SX; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXXZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[384 + i] += 2.0 * 2.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXXZ; - phi_tmp[416 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * xc[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[416 + i] += 2.0 * 3.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXXZ; - phi_tmp[448 + i] += 2.0 * zc_pow[64 + i] * SXZ; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SXX; - phi_tmp[448 + i] += 2.0 * 4.0 * zc_pow[32 + i] * SX; - - phi_tmp[480 + i] = yc_pow[96 + i] * 
SXXZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXXZ; - phi_tmp[512 + i] += yc_pow[64 + i] * SXX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXXZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SXX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXXZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SXX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXXZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SXX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXXZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } - - // Combine XYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SXYY = S3[i] * xc[i] * yc[i] * yc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXYY; - phi_tmp[i] += 5.0 * xc_pow[64 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXYY; - phi_tmp[32 + i] += 2.0 * xc_pow[64 + i] * SXY; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * 4.0 * xc_pow[32 + i] * SY; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXYY; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXYY; - phi_tmp[96 + i] += 2.0 * 2.0 * xc_pow[32 + i] * yc[i] * SXY; - phi_tmp[96 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * SX; - phi_tmp[96 + i] += 2.0 * 6.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[96 + i] += 6.0 * xc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXYY; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SXY; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * 3.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXYY; - phi_tmp[160 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXYY; - phi_tmp[192 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * SXY; - phi_tmp[192 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[192 + i] += 2.0 * 6.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[192 + i] += 12.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXYY; - phi_tmp[224 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 4.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXYY; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * 2.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXYY; - phi_tmp[288 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SYY; - - phi_tmp[320 + i] = xc[i] * 
yc_pow[64 + i] * SXYY; - phi_tmp[320 + i] += 2.0 * 4.0 * xc[i] * yc_pow[32 + i] * SXY; - phi_tmp[320 + i] += yc_pow[64 + i] * SYY; - phi_tmp[320 + i] += 12.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[320 + i] += 2.0 * 4.0 * yc_pow[32 + i] * SY; - phi_tmp[320 + i] += 12.0 * yc_pow[i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXYY; - phi_tmp[352 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[352 + i] += yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[352 + i] += 2.0 * 3.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXYY; - phi_tmp[384 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 2.0 * xc[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += 2.0 * 2.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXYY; - phi_tmp[416 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[416 + i] += 2.0 * zc_pow[32 + i] * SY; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXYY; - phi_tmp[448 + i] += zc_pow[64 + i] * SYY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXYY; - phi_tmp[480 + i] += 2.0 * 5.0 * yc_pow[64 + i] * SXY; - phi_tmp[480 + i] += 20.0 * yc_pow[32 + i] * SX; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXYY; - phi_tmp[512 + i] += 2.0 * 4.0 * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[512 + i] += 12.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXYY; - phi_tmp[544 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[544 + i] += 6.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXYY; - phi_tmp[576 + i] += 2.0 * 2.0 * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[576 + i] += 2.0 * zc_pow[32 + i] * SX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXYY; - phi_tmp[608 + i] += 2.0 * zc_pow[64 + i] * SXY; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } - - // Combine XYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXYZ = S3[i] * xc[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXYZ; - phi_tmp[i] += 5.0 * xc_pow[64 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXYZ; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[64 + i] * SXZ; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * SZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXYZ; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[64 + i] * SXY; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * SY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXYZ; - phi_tmp[96 + 
i] += 3.0 * xc_pow[i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SXZ; - phi_tmp[96 + i] += 6.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXYZ; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[32 + i] * yc[i] * SXY; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[128 + i] += xc_pow[32 + i] * SX; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXYZ; - phi_tmp[160 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SXY; - phi_tmp[160 + i] += 6.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXYZ; - phi_tmp[192 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SXZ; - phi_tmp[192 + i] += 6.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXYZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc_pow[i] * yc_pow[i] * SXY; - phi_tmp[224 + i] += 4.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[224 + i] += 8.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXYZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += xc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SXY; - phi_tmp[256 + i] += 2.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 4.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[256 + i] += 2.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXYZ; - phi_tmp[288 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SXY; - phi_tmp[288 + i] += 6.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXYZ; - phi_tmp[320 + i] += yc_pow[64 + i] * SYZ; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SXZ; - phi_tmp[320 + i] += 4.0 * yc_pow[32 + i] * SZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXYZ; - phi_tmp[352 + i] += yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[352 + i] += xc[i] * yc_pow[32 + i] * SXY; - phi_tmp[352 + i] += 3.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[352 + i] += yc_pow[32 + i] * SY; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[352 + i] += 9.0 * yc_pow[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXYZ; - phi_tmp[384 + i] += yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[384 + i] += 2.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[384 + i] += 4.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[384 + i] += 4.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXYZ; - phi_tmp[416 + i] += yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += xc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * xc[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[416 + i] += zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[416 + i] += 3.0 
* xc[i] * zc_pow[i] * SX; - phi_tmp[416 + i] += zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXYZ; - phi_tmp[448 + i] += zc_pow[64 + i] * SYZ; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SXY; - phi_tmp[448 + i] += 4.0 * zc_pow[32 + i] * SY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXYZ; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * SXZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXYZ; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[512 + i] += yc_pow[64 + i] * SXY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * SX; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXYZ; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[544 + i] += 6.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXYZ; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[576 + i] += 6.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXYZ; - phi_tmp[608 + i] += zc_pow[64 + i] * SXZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[608 + i] += 4.0 * zc_pow[32 + i] * SX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXYZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } - - // Combine XZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SXZZ = S3[i] * xc[i] * zc[i] * zc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SXZZ; - phi_tmp[i] += 5.0 * xc_pow[64 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SXZZ; - phi_tmp[32 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SXZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[64 + i] * SXZ; - phi_tmp[64 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * 4.0 * xc_pow[32 + i] * SZ; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SXZZ; - phi_tmp[96 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SXZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SXZ; - phi_tmp[128 + i] += 3.0 * xc_pow[i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SXZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * xc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[160 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * SX; - phi_tmp[160 + i] += 2.0 * 6.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[160 + i] += 6.0 * xc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SXZZ; - phi_tmp[192 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SXZZ; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc_pow[i] * SXZ; - phi_tmp[224 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SZZ; - 
phi_tmp[224 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SXZZ; - phi_tmp[256 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * zc[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * SX; - phi_tmp[256 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 4.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SXZZ; - phi_tmp[288 + i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[288 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * xc_pow[i] * zc[i] * SX; - phi_tmp[288 + i] += 2.0 * 6.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[288 + i] += 12.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SXZZ; - phi_tmp[320 + i] += yc_pow[64 + i] * SZZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SXZZ; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SXZ; - phi_tmp[352 + i] += yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * SZ; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SXZZ; - phi_tmp[384 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[384 + i] += yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * SX; - phi_tmp[384 + i] += 2.0 * 2.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SXZZ; - phi_tmp[416 + i] += 2.0 * 3.0 * xc[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[416 + i] += yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * zc[i] * SX; - phi_tmp[416 + i] += 2.0 * 3.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SXZZ; - phi_tmp[448 + i] += 2.0 * 4.0 * xc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[448 + i] += zc_pow[64 + i] * SZZ; - phi_tmp[448 + i] += 12.0 * xc[i] * zc_pow[i] * SX; - phi_tmp[448 + i] += 2.0 * 4.0 * zc_pow[32 + i] * SZ; - phi_tmp[448 + i] += 12.0 * zc_pow[i] * S0[i]; - - phi_tmp[480 + i] = yc_pow[96 + i] * SXZZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SXZZ; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * SXZ; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SXZZ; - phi_tmp[544 + i] += 2.0 * 2.0 * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * SX; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SXZZ; - phi_tmp[576 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[576 + i] += 6.0 * yc_pow[i] * zc[i] * SX; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SXZZ; - phi_tmp[608 + i] += 2.0 * 4.0 * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[608 + i] += 12.0 * yc[i] * zc_pow[i] * SX; - - phi_tmp[640 + i] = zc_pow[96 + i] * SXZZ; - phi_tmp[640 + i] += 2.0 * 5.0 * zc_pow[64 + i] * SXZ; - phi_tmp[640 + i] += 20.0 * zc_pow[32 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } - - // Combine YYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * 
yc[i] * yc[i] + S1[i]; - const double SYYY = S3[i] * yc[i] * yc[i] * yc[i] + 3 * yc[i] * S2[i]; - - phi_tmp[i] = xc_pow[96 + i] * SYYY; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SYYY; - phi_tmp[32 + i] += 3.0 * xc_pow[64 + i] * SYY; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SYYY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SYYY; - phi_tmp[96 + i] += 3.0 * 2.0 * xc_pow[32 + i] * yc[i] * SYY; - phi_tmp[96 + i] += 3.0 * 2.0 * xc_pow[32 + i] * SY; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SYYY; - phi_tmp[128 + i] += 3.0 * xc_pow[32 + i] * zc[i] * SYY; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SYYY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SYYY; - phi_tmp[192 + i] += 3.0 * 3.0 * xc_pow[i] * yc_pow[i] * SYY; - phi_tmp[192 + i] += 3.0 * 6.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[192 + i] += 6.0 * xc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SYYY; - phi_tmp[224 + i] += 3.0 * 2.0 * xc_pow[i] * yc[i] * zc[i] * SYY; - phi_tmp[224 + i] += 3.0 * 2.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SYYY; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SYY; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SYYY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SYYY; - phi_tmp[320 + i] += 3.0 * 4.0 * xc[i] * yc_pow[32 + i] * SYY; - phi_tmp[320 + i] += 3.0 * 12.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[320 + i] += 24.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SYYY; - phi_tmp[352 + i] += 3.0 * 3.0 * xc[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[352 + i] += 3.0 * 6.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SYYY; - phi_tmp[384 + i] += 3.0 * 2.0 * xc[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 3.0 * 2.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SYYY; - phi_tmp[416 + i] += 3.0 * xc[i] * zc_pow[32 + i] * SYY; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SYYY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SYYY; - phi_tmp[480 + i] += 3.0 * 5.0 * yc_pow[64 + i] * SYY; - phi_tmp[480 + i] += 3.0 * 20.0 * yc_pow[32 + i] * SY; - phi_tmp[480 + i] += 60.0 * yc_pow[i] * S0[i]; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SYYY; - phi_tmp[512 + i] += 3.0 * 4.0 * yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[512 + i] += 3.0 * 12.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[512 + i] += 24.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SYYY; - phi_tmp[544 + i] += 3.0 * 3.0 * yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[544 + i] += 3.0 * 6.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 6.0 * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SYYY; - phi_tmp[576 + i] += 3.0 * 2.0 * yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[576 + i] += 3.0 * 2.0 * zc_pow[32 + i] * SY; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SYYY; - phi_tmp[608 + i] += 3.0 * zc_pow[64 + i] * SYY; - - phi_tmp[640 + i] = zc_pow[96 + i] * SYYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); 
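/* Illustrative note, not from the removed gau2grid/GauXC source: in this
 * L = 5 kernel each "Combine ..." loop fills phi_tmp with the 21 Cartesian
 * components of the shell (one component per stride of 32 points), and the
 * if/else chain that follows each loop either transforms them to the 11 real
 * spherical harmonics (CCA or Gaussian ordering) or copies them out in
 * Cartesian CCA/Molden ordering. */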
- } - - // Combine YYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SYYZ = S3[i] * yc[i] * yc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SYYZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SYYZ; - phi_tmp[32 + i] += 2.0 * xc_pow[64 + i] * SYZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SYYZ; - phi_tmp[64 + i] += xc_pow[64 + i] * SYY; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SYYZ; - phi_tmp[96 + i] += 2.0 * 2.0 * xc_pow[32 + i] * yc[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * SZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SYYZ; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[32 + i] * yc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * SY; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SYYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SYY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SYYZ; - phi_tmp[192 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * SYZ; - phi_tmp[192 + i] += 6.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SYYZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * zc[i] * SYZ; - phi_tmp[224 + i] += xc_pow[i] * yc_pow[i] * SYY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SYYZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SYY; - phi_tmp[256 + i] += 2.0 * 2.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SYYZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SYY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SYYZ; - phi_tmp[320 + i] += 2.0 * 4.0 * xc[i] * yc_pow[32 + i] * SYZ; - phi_tmp[320 + i] += 12.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SYYZ; - phi_tmp[352 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[352 + i] += xc[i] * yc_pow[32 + i] * SYY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[352 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SYYZ; - phi_tmp[384 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[384 + i] += 2.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[384 + i] += 4.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SYYZ; - phi_tmp[416 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += 3.0 * xc[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[416 + i] += 2.0 * 3.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SYYZ; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SYY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SYYZ; - phi_tmp[480 + i] += 2.0 * 5.0 * yc_pow[64 + i] * SYZ; - phi_tmp[480 + i] += 20.0 * yc_pow[32 + i] * SZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SYYZ; - phi_tmp[512 + i] += 2.0 * 4.0 * yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[512 + i] += yc_pow[64 + i] * SYY; - phi_tmp[512 + i] += 12.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[512 + i] += 2.0 * 
4.0 * yc_pow[32 + i] * SY; - phi_tmp[512 + i] += 12.0 * yc_pow[i] * S0[i]; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SYYZ; - phi_tmp[544 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[544 + i] += 6.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 2.0 * 6.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[544 + i] += 12.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SYYZ; - phi_tmp[576 + i] += 2.0 * 2.0 * yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[576 + i] += 2.0 * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 2.0 * 6.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[576 + i] += 6.0 * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SYYZ; - phi_tmp[608 + i] += 2.0 * zc_pow[64 + i] * SYZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[608 + i] += 2.0 * 4.0 * zc_pow[32 + i] * SY; - - phi_tmp[640 + i] = zc_pow[96 + i] * SYYZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } - - // Combine YZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SYZZ = S3[i] * yc[i] * zc[i] * zc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[96 + i] * SYZZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SYZZ; - phi_tmp[32 + i] += xc_pow[64 + i] * SZZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SYZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[64 + i] * SYZ; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SYZZ; - phi_tmp[96 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SYZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[32 + i] * SZ; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SYZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * xc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[32 + i] * SY; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SYZZ; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SZZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SYZZ; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc_pow[i] * SYZ; - phi_tmp[224 + i] += 2.0 * xc_pow[i] * yc[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SYZZ; - phi_tmp[256 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * zc[i] * SYZ; - phi_tmp[256 + i] += xc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * yc[i] * SY; - phi_tmp[256 + i] += 2.0 * 2.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SYZZ; - phi_tmp[288 + i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[288 + i] += 6.0 * xc_pow[i] * zc[i] * SY; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * 
SYZZ; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SYZZ; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SYZ; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SYZZ; - phi_tmp[384 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SYZZ; - phi_tmp[416 + i] += 2.0 * 3.0 * xc[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[416 + i] += xc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * zc[i] * SY; - phi_tmp[416 + i] += 2.0 * 3.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SYZZ; - phi_tmp[448 + i] += 2.0 * 4.0 * xc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[448 + i] += 12.0 * xc[i] * zc_pow[i] * SY; - - phi_tmp[480 + i] = yc_pow[96 + i] * SYZZ; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * SZZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SYZZ; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * SYZ; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[512 + i] += 2.0 * 4.0 * yc_pow[32 + i] * SZ; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SYZZ; - phi_tmp[544 + i] += 2.0 * 2.0 * yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * SY; - phi_tmp[544 + i] += 2.0 * 6.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[544 + i] += 6.0 * yc_pow[i] * S0[i]; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SYZZ; - phi_tmp[576 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[576 + i] += 6.0 * yc_pow[i] * zc[i] * SY; - phi_tmp[576 + i] += 2.0 * 6.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[576 + i] += 12.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SYZZ; - phi_tmp[608 + i] += 2.0 * 4.0 * yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[608 + i] += zc_pow[64 + i] * SZZ; - phi_tmp[608 + i] += 12.0 * yc[i] * zc_pow[i] * SY; - phi_tmp[608 + i] += 2.0 * 4.0 * zc_pow[32 + i] * SZ; - phi_tmp[608 + i] += 12.0 * zc_pow[i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SYZZ; - phi_tmp[640 + i] += 2.0 * 5.0 * zc_pow[64 + i] * SYZ; - phi_tmp[640 + i] += 20.0 * zc_pow[32 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } - - // Combine ZZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SZZZ = S3[i] * zc[i] * zc[i] * zc[i] + 3 * zc[i] * S2[i]; - - phi_tmp[i] = xc_pow[96 + i] * SZZZ; - - phi_tmp[32 + i] = xc_pow[64 + i] * yc[i] * SZZZ; - - phi_tmp[64 + i] = xc_pow[64 + i] * zc[i] * SZZZ; - phi_tmp[64 + 
i] += 3.0 * xc_pow[64 + i] * SZZ; - - phi_tmp[96 + i] = xc_pow[32 + i] * yc_pow[i] * SZZZ; - - phi_tmp[128 + i] = xc_pow[32 + i] * yc[i] * zc[i] * SZZZ; - phi_tmp[128 + i] += 3.0 * xc_pow[32 + i] * yc[i] * SZZ; - - phi_tmp[160 + i] = xc_pow[32 + i] * zc_pow[i] * SZZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * xc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * xc_pow[32 + i] * SZ; - - phi_tmp[192 + i] = xc_pow[i] * yc_pow[32 + i] * SZZZ; - - phi_tmp[224 + i] = xc_pow[i] * yc_pow[i] * zc[i] * SZZZ; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SZZ; - - phi_tmp[256 + i] = xc_pow[i] * yc[i] * zc_pow[i] * SZZZ; - phi_tmp[256 + i] += 3.0 * 2.0 * xc_pow[i] * yc[i] * zc[i] * SZZ; - phi_tmp[256 + i] += 3.0 * 2.0 * xc_pow[i] * yc[i] * SZ; - - phi_tmp[288 + i] = xc_pow[i] * zc_pow[32 + i] * SZZZ; - phi_tmp[288 + i] += 3.0 * 3.0 * xc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[288 + i] += 3.0 * 6.0 * xc_pow[i] * zc[i] * SZ; - phi_tmp[288 + i] += 6.0 * xc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc[i] * yc_pow[64 + i] * SZZZ; - - phi_tmp[352 + i] = xc[i] * yc_pow[32 + i] * zc[i] * SZZZ; - phi_tmp[352 + i] += 3.0 * xc[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[384 + i] = xc[i] * yc_pow[i] * zc_pow[i] * SZZZ; - phi_tmp[384 + i] += 3.0 * 2.0 * xc[i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[384 + i] += 3.0 * 2.0 * xc[i] * yc_pow[i] * SZ; - - phi_tmp[416 + i] = xc[i] * yc[i] * zc_pow[32 + i] * SZZZ; - phi_tmp[416 + i] += 3.0 * 3.0 * xc[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[416 + i] += 3.0 * 6.0 * xc[i] * yc[i] * zc[i] * SZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * S0[i]; - - phi_tmp[448 + i] = xc[i] * zc_pow[64 + i] * SZZZ; - phi_tmp[448 + i] += 3.0 * 4.0 * xc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[448 + i] += 3.0 * 12.0 * xc[i] * zc_pow[i] * SZ; - phi_tmp[448 + i] += 24.0 * xc[i] * zc[i] * S0[i]; - - phi_tmp[480 + i] = yc_pow[96 + i] * SZZZ; - - phi_tmp[512 + i] = yc_pow[64 + i] * zc[i] * SZZZ; - phi_tmp[512 + i] += 3.0 * yc_pow[64 + i] * SZZ; - - phi_tmp[544 + i] = yc_pow[32 + i] * zc_pow[i] * SZZZ; - phi_tmp[544 + i] += 3.0 * 2.0 * yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[544 + i] += 3.0 * 2.0 * yc_pow[32 + i] * SZ; - - phi_tmp[576 + i] = yc_pow[i] * zc_pow[32 + i] * SZZZ; - phi_tmp[576 + i] += 3.0 * 3.0 * yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[576 + i] += 3.0 * 6.0 * yc_pow[i] * zc[i] * SZ; - phi_tmp[576 + i] += 6.0 * yc_pow[i] * S0[i]; - - phi_tmp[608 + i] = yc[i] * zc_pow[64 + i] * SZZZ; - phi_tmp[608 + i] += 3.0 * 4.0 * yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[608 + i] += 3.0 * 12.0 * yc[i] * zc_pow[i] * SZ; - phi_tmp[608 + i] += 24.0 * yc[i] * zc[i] * S0[i]; - - phi_tmp[640 + i] = zc_pow[96 + i] * SZZZ; - phi_tmp[640 + i] += 3.0 * 5.0 * zc_pow[64 + i] * SZZ; - phi_tmp[640 + i] += 3.0 * 20.0 * zc_pow[32 + i] * SZ; - phi_tmp[640 + i] += 60.0 * zc_pow[i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - ALIGNED_FREE(expn2); - - // Free Power temporaries - ALIGNED_FREE(xc_pow); - ALIGNED_FREE(yc_pow); - ALIGNED_FREE(zc_pow); - - // Free inner temporaries - 
ALIGNED_FREE(phi_tmp);
-
-}
-
-void gg_collocation_L6_deriv3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out) {
-
-    // Sizing
-    unsigned long nblocks = npoints / 32;
-    nblocks += (npoints % 32) ? 1 : 0;
-    const unsigned long ncart = 28;
-    const unsigned long nspherical = 13;
-    unsigned long nout;
-
-    if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) {
-        nout = nspherical;
-    } else {
-        nout = ncart;
-    }
-
-    // Allocate S temporaries, single block to stay on cache
-    double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 288 * sizeof(double));
-    double* PRAGMA_RESTRICT xc = cache_data + 0;
-    ASSUME_ALIGNED(xc, 64);
-    double* PRAGMA_RESTRICT yc = cache_data + 32;
-    ASSUME_ALIGNED(yc, 64);
-    double* PRAGMA_RESTRICT zc = cache_data + 64;
-    ASSUME_ALIGNED(zc, 64);
-    double* PRAGMA_RESTRICT R2 = cache_data + 96;
-    ASSUME_ALIGNED(R2, 64);
-    double* PRAGMA_RESTRICT S0 = cache_data + 128;
-    ASSUME_ALIGNED(S0, 64);
-    double* PRAGMA_RESTRICT tmp1 = cache_data + 160;
-    ASSUME_ALIGNED(tmp1, 64);
-    double* PRAGMA_RESTRICT S1 = cache_data + 192;
-    ASSUME_ALIGNED(S1, 64);
-    double* PRAGMA_RESTRICT S2 = cache_data + 224;
-    ASSUME_ALIGNED(S2, 64);
-    double* PRAGMA_RESTRICT S3 = cache_data + 256;
-    ASSUME_ALIGNED(S3, 64);
-
-    // Allocate exponential temporaries
-    double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double));
-    double* PRAGMA_RESTRICT expn2 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double));
-
-    // Allocate power temporaries
-    double* PRAGMA_RESTRICT xc_pow = (double*)ALIGNED_MALLOC(64, 160 * sizeof(double));
-    ASSUME_ALIGNED(xc_pow, 64);
-    double* PRAGMA_RESTRICT yc_pow = (double*)ALIGNED_MALLOC(64, 160 * sizeof(double));
-    ASSUME_ALIGNED(yc_pow, 64);
-    double* PRAGMA_RESTRICT zc_pow = (double*)ALIGNED_MALLOC(64, 160 * sizeof(double));
-    ASSUME_ALIGNED(zc_pow, 64);
-
-    // Allocate output temporaries
-    double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 896 * sizeof(double));
-    ASSUME_ALIGNED(phi_tmp, 64);
-
-    // Declare doubles
-    const double center_x = center[0];
-    const double center_y = center[1];
-    const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
-    double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
-
-    // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
-        expn1[i] = -1.0 * exponents[i];
-        expn2[i] = -2.0 * exponents[i];
-    }
-
-    // Start outer block loop
-    for (unsigned long block = 0; block < nblocks; block++) {
-
-
-        // Copy data into inner temps
-        const unsigned long start = block * 32;
-        const unsigned long remain = ((start + 32) > npoints) ? (npoints - start) : 32;
-
-        // Handle non-AM dependant temps
-        if (xyz_stride == 1) {
-            const double* PRAGMA_RESTRICT x = xyz + start;
-            const double* PRAGMA_RESTRICT y = xyz + npoints + start;
-            const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start;
-            PRAGMA_VECTORIZE
-            for (unsigned long i = 0; i < remain; i++) {
-                xc[i] = x[i] - center_x;
-                yc[i] = y[i] - center_y;
-                zc[i] = z[i] - center_z;
-
-                // Distance
-                R2[i] = xc[i] * xc[i];
-                R2[i] += yc[i] * yc[i];
-                R2[i] += zc[i] * zc[i];
-
-                // Zero out S tmps
-                S0[i] = 0.0;
-                S1[i] = 0.0;
-                S2[i] = 0.0;
-                S3[i] = 0.0;
-            }
-        } else {
-            unsigned int start_shift = start * xyz_stride;
-
-            PRAGMA_VECTORIZE
-            for (unsigned long i = 0; i < remain; i++) {
-                xc[i] = xyz[start_shift + i * xyz_stride] - center_x;
-                yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y;
-                zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z;
-
-                // Distance
-                R2[i] = xc[i] * xc[i];
-                R2[i] += yc[i] * yc[i];
-                R2[i] += zc[i] * zc[i];
-
-                // Zero out S tmps
-                S0[i] = 0.0;
-                S1[i] = 0.0;
-                S2[i] = 0.0;
-                S3[i] = 0.0;
-            }
-        }
-
-        // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
-            const double coef = coeffs[n];
-            const double alpha_n1 = expn1[n];
-            const double alpha_n2 = expn2[n];
-
-            PRAGMA_VECTORIZE
-            for (unsigned long i = 0; i < remain; i++) {
-                const double width = alpha_n1 * R2[i];
-                const double T1 = coef * exp(width);
-                S0[i] += T1;
-                const double T2 = alpha_n2 * T1;
-                S1[i] += T2;
-                const double T3 = alpha_n2 * T2;
-                S2[i] += T3;
-                const double T4 = alpha_n2 * T3;
-                S3[i] += T4;
-            }
-
-        }
-
-        // Build powers
-        PRAGMA_VECTORIZE
-        for (unsigned long i = 0; i < remain; i++) {
-
-            // Cartesian derivs
-            xc_pow[i] = xc[i] * xc[i];
-            yc_pow[i] = yc[i] * yc[i];
-            zc_pow[i] = zc[i] * zc[i];
-            xc_pow[32 + i] = xc_pow[i] * xc[i];
-            yc_pow[32 + i] = yc_pow[i] * yc[i];
-            zc_pow[32 + i] = zc_pow[i] * zc[i];
-            xc_pow[64 + i] = xc_pow[32 + i] * xc[i];
-            yc_pow[64 + i] = yc_pow[32 + i] * yc[i];
-            zc_pow[64 + i] = zc_pow[32 + i] * zc[i];
-            xc_pow[96 + i] = xc_pow[64 + i] * xc[i];
-            yc_pow[96 + i] = yc_pow[64 + i] * yc[i];
-            zc_pow[96 + i] = zc_pow[64 + i] * zc[i];
-            xc_pow[128 + i] = xc_pow[96 + i] * xc[i];
-            yc_pow[128 + i] = yc_pow[96 + i] * yc[i];
-            zc_pow[128 + i] = zc_pow[96 + i] * zc[i];
-        }
-        // Combine A blocks
-        PRAGMA_VECTORIZE
-        for (unsigned long i = 0; i < remain; i++) {
-
-            phi_tmp[i] = xc_pow[128 + i] * S0[i];
-            phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * S0[i];
-            phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * S0[i];
-            phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * S0[i];
-            phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * S0[i];
-            phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * S0[i];
-            phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * S0[i];
-            phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * S0[i];
-            phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * S0[i];
-            phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * S0[i];
-            phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * S0[i];
-            phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * S0[i];
-            phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * S0[i];
-            phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * S0[i];
-            phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * S0[i];
-            phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * S0[i];
-            phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * S0[i];
-            phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * S0[i];
-            phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * S0[i];
-            phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * S0[i];
-            phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * S0[i];
-            phi_tmp[672 + i] = yc_pow[128 + i] * S0[i];
-            phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * S0[i];
-            phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * S0[i];
-            phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * S0[i];
-            phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * S0[i];
-            phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * S0[i];
-            phi_tmp[864 + i] = zc_pow[128 + i] * S0[i];
-        }
-
-        if (order == GG_SPHERICAL_CCA) {
-            gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_out + start), npoints);
-        } else if (order == GG_SPHERICAL_GAUSSIAN) {
-            gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_out + start), npoints);
-        } else if (order == GG_CARTESIAN_CCA) {
-            gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_out + start), npoints);
-        } else if (order == GG_CARTESIAN_MOLDEN) {
-            gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_out + start), npoints);
-        }
-
-        // Combine X blocks
-        PRAGMA_VECTORIZE
-        for (unsigned long i = 0; i < remain; i++) {
-            const double SX = S1[i] * xc[i];
-
-            phi_tmp[i] = xc_pow[128 + i] * SX;
-            phi_tmp[i] += 6.0 * xc_pow[96 + i] * S0[i];
-
-            phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SX;
-            phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * yc[i] * S0[i];
-
-            phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SX;
-            phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * zc[i] * S0[i];
-
-            phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SX;
-            phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc_pow[i] * S0[i];
-
-            phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SX;
-            phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * S0[i];
-
-            phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SX;
-            phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc_pow[i] * S0[i];
-
-            phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SX;
-            phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * S0[i];
-
-            phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SX;
-            phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * S0[i];
-
-            phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SX;
-            phi_tmp[256 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * S0[i];
-
-            phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SX;
-            phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * S0[i];
-
-            phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SX;
-            phi_tmp[320 + i] += 2.0 * xc[i] * yc_pow[64 + i] * S0[i];
-
-            phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SX;
-            phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * S0[i];
-
-            phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SX;
-            phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * S0[i];
-
-            phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SX;
-            phi_tmp[416 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * S0[i];
-
-            phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SX;
-            phi_tmp[448 + i] += 2.0 * xc[i] * zc_pow[64 + i] * S0[i];
-
-            phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SX;
-            phi_tmp[480 + i] += yc_pow[96 + i] * S0[i];
-
-            phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SX;
-            phi_tmp[512 + i] += yc_pow[64 + i] * zc[i] * S0[i];
-
-            phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SX;
-            phi_tmp[544 + i] += yc_pow[32 + i] * zc_pow[i] * S0[i];
-
-            phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SX;
-            phi_tmp[576 + i] += yc_pow[i] * zc_pow[32 + i] * S0[i];
-
-            phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SX;
-            phi_tmp[608 + i] += yc[i] * zc_pow[64 + i] * S0[i];
-
-            phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SX;
-            phi_tmp[640 + i] += zc_pow[96 + i] * S0[i];
-
-
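/* Illustrative note, not from the removed gau2grid/GauXC source: S0..S3
 * accumulated in the primitive loop above are the contracted radial factor
 * and its scaled derivatives,
 *     Sk[i] = sum_n coeffs[n] * (-2 * exponents[n])^k * exp(-exponents[n] * R2[i]),
 * so that d(S0)/dx = xc * S1 = SX and d2(S0)/dx2 = S1 + xc * xc * S2.  Each
 * pair of statements in the "Combine X blocks" loop is then the product rule
 * applied to a Cartesian monomial times S0; e.g. the two phi_tmp[32 + i]
 * lines above evaluate d/dx of (xc^5 * yc * S0) as
 * xc^5 * yc * SX + 5 * xc^4 * yc * S0, with xc_pow[32 * k + i] holding
 * xc^(k + 2). */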
phi_tmp[672 + i] = yc_pow[128 + i] * SX; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_x_out + start), npoints); - } - - // Combine Y blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SY; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SY; - phi_tmp[32 + i] += xc_pow[96 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += xc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += xc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 4.0 * xc_pow[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += xc_pow[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SY; - phi_tmp[480 + i] += 5.0 * xc[i] * yc_pow[64 + i] * S0[i]; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SY; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SY; - phi_tmp[608 + i] += xc[i] * zc_pow[64 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SY; - phi_tmp[672 + i] += 6.0 * yc_pow[96 + i] * S0[i]; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SY; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SY; - phi_tmp[736 + i] += 4.0 * yc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SY; - phi_tmp[768 + i] += 3.0 * yc_pow[i] 
* zc_pow[32 + i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SY; - phi_tmp[800 + i] += 2.0 * yc[i] * zc_pow[64 + i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SY; - phi_tmp[832 + i] += zc_pow[96 + i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_y_out + start), npoints); - } - - // Combine Z blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SZ; - phi_tmp[64 + i] += xc_pow[96 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[64 + i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += xc_pow[32 + i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += xc_pow[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 4.0 * xc_pow[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[512 + i] += xc[i] * yc_pow[64 + i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SZ; - phi_tmp[640 + i] += 5.0 * xc[i] * zc_pow[64 + i] * S0[i]; - - phi_tmp[672 + i] = yc_pow[128 + i] * SZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SZ; - phi_tmp[704 + i] += yc_pow[96 + i] * S0[i]; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * zc[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SZ; - phi_tmp[768 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SZ; - phi_tmp[800 + i] += 4.0 * yc_pow[i] * zc_pow[32 + i] * S0[i]; 
- - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SZ; - phi_tmp[832 + i] += 5.0 * yc[i] * zc_pow[64 + i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SZ; - phi_tmp[864 + i] += 6.0 * zc_pow[96 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_z_out + start), npoints); - } - - // Combine XX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXX; - phi_tmp[i] += 12.0 * xc_pow[96 + i] * SX; - phi_tmp[i] += 30.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXX; - phi_tmp[32 + i] += 10.0 * xc_pow[64 + i] * yc[i] * SX; - phi_tmp[32 + i] += 20.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXX; - phi_tmp[64 + i] += 10.0 * xc_pow[64 + i] * zc[i] * SX; - phi_tmp[64 + i] += 20.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 8.0 * xc_pow[32 + i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 12.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 8.0 * xc_pow[32 + i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 12.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 8.0 * xc_pow[32 + i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 12.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXX; - phi_tmp[192 + i] += 6.0 * xc_pow[i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += 6.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[224 + i] += 6.0 * xc_pow[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 6.0 * xc_pow[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXX; - phi_tmp[288 + i] += 6.0 * xc_pow[i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += 6.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXX; - phi_tmp[320 + i] += 4.0 * xc[i] * yc_pow[64 + i] * SX; - phi_tmp[320 + i] += 2.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXX; - phi_tmp[352 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SX; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXX; - phi_tmp[384 + i] += 4.0 * xc[i] * yc_pow[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXX; - phi_tmp[416 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += 2.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXX; - phi_tmp[448 + i] += 4.0 * xc[i] * zc_pow[64 + i] * SX; - phi_tmp[448 + i] += 2.0 * zc_pow[64 + i] * S0[i]; - - 
phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXX; - phi_tmp[480 + i] += 2.0 * yc_pow[96 + i] * SX; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXX; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SX; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXX; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc_pow[i] * SX; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXX; - phi_tmp[576 + i] += 2.0 * yc_pow[i] * zc_pow[32 + i] * SX; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXX; - phi_tmp[608 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SX; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXX; - phi_tmp[640 + i] += 2.0 * zc_pow[96 + i] * SX; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXX; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xx_out + start), npoints); - } - - // Combine XY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXY; - phi_tmp[i] += 6.0 * xc_pow[96 + i] * SY; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[96 + i] * SX; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * yc[i] * SY; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXY; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SX; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 8.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc_pow[64 + i] * zc[i] * SX; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * SX; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 9.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SX; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += xc_pow[32 + i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXY; 
- phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXY; - phi_tmp[320 + i] += 4.0 * xc_pow[i] * yc_pow[32 + i] * SX; - phi_tmp[320 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 8.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 4.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += xc_pow[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += 2.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXY; - phi_tmp[448 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXY; - phi_tmp[480 + i] += 5.0 * xc[i] * yc_pow[64 + i] * SX; - phi_tmp[480 + i] += yc_pow[96 + i] * SY; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXY; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SX; - phi_tmp[512 + i] += yc_pow[64 + i] * zc[i] * SY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[544 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SX; - phi_tmp[544 + i] += yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[576 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SX; - phi_tmp[576 + i] += yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXY; - phi_tmp[608 + i] += xc[i] * zc_pow[64 + i] * SX; - phi_tmp[608 + i] += yc[i] * zc_pow[64 + i] * SY; - phi_tmp[608 + i] += zc_pow[64 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXY; - phi_tmp[640 + i] += zc_pow[96 + i] * SY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXY; - phi_tmp[672 + i] += 6.0 * yc_pow[96 + i] * SX; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXY; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * zc[i] * SX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXY; - phi_tmp[736 + i] += 4.0 * yc_pow[32 + i] * zc_pow[i] * SX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXY; - phi_tmp[768 + i] += 3.0 * yc_pow[i] * zc_pow[32 + i] * SX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXY; - phi_tmp[800 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXY; - phi_tmp[832 + i] += zc_pow[96 + i] * SX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - 
gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xy_out + start), npoints); - } - - // Combine XZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXZ; - phi_tmp[i] += 6.0 * xc_pow[96 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXZ; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[96 + i] * SX; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * zc[i] * SZ; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[64 + i] * yc[i] * SX; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SX; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 8.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXZ; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc_pow[32 + i] * yc_pow[i] * SX; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SX; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * SX; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 9.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXZ; - phi_tmp[320 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[352 + i] += xc_pow[i] * yc_pow[32 + i] * SX; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[416 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXZ; - phi_tmp[448 + i] += 4.0 * xc_pow[i] * zc_pow[32 + i] * SX; - phi_tmp[448 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 8.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXZ; - phi_tmp[480 + i] += yc_pow[96 + i] * SZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[512 + i] += xc[i] * yc_pow[64 + i] * SX; - phi_tmp[512 + i] += yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[512 + i] += yc_pow[64 + i] * S0[i]; - - 
phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SX; - phi_tmp[544 + i] += yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[576 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SX; - phi_tmp[576 + i] += yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[608 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SX; - phi_tmp[608 + i] += yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXZ; - phi_tmp[640 + i] += 5.0 * xc[i] * zc_pow[64 + i] * SX; - phi_tmp[640 + i] += zc_pow[96 + i] * SZ; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * S0[i]; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXZ; - phi_tmp[704 + i] += yc_pow[96 + i] * SX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXZ; - phi_tmp[768 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * SX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXZ; - phi_tmp[800 + i] += 4.0 * yc_pow[i] * zc_pow[32 + i] * SX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXZ; - phi_tmp[832 + i] += 5.0 * yc[i] * zc_pow[64 + i] * SX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXZ; - phi_tmp[864 + i] += 6.0 * zc_pow[96 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xz_out + start), npoints); - } - - // Combine YY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - - phi_tmp[i] = xc_pow[128 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * xc_pow[96 + i] * SY; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 4.0 * xc_pow[64 + i] * yc[i] * SY; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SY; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * xc_pow[32 + i] * yc_pow[i] * SY; - phi_tmp[192 + i] += 6.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * zc_pow[i] * SY; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SYY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SYY; - phi_tmp[320 + i] += 8.0 * xc_pow[i] * yc_pow[32 + i] * SY; - phi_tmp[320 + i] += 12.0 * 
xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[352 + i] += 6.0 * xc_pow[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 4.0 * xc_pow[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[416 + i] += 2.0 * xc_pow[i] * zc_pow[32 + i] * SY; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SYY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SYY; - phi_tmp[480 + i] += 10.0 * xc[i] * yc_pow[64 + i] * SY; - phi_tmp[480 + i] += 20.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SYY; - phi_tmp[512 + i] += 8.0 * xc[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[512 + i] += 12.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SYY; - phi_tmp[544 + i] += 6.0 * xc[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SYY; - phi_tmp[576 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SYY; - phi_tmp[608 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SY; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SYY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SYY; - phi_tmp[672 + i] += 12.0 * yc_pow[96 + i] * SY; - phi_tmp[672 + i] += 30.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SYY; - phi_tmp[704 + i] += 10.0 * yc_pow[64 + i] * zc[i] * SY; - phi_tmp[704 + i] += 20.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SYY; - phi_tmp[736 + i] += 8.0 * yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[736 + i] += 12.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SYY; - phi_tmp[768 + i] += 6.0 * yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[768 + i] += 6.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SYY; - phi_tmp[800 + i] += 4.0 * yc[i] * zc_pow[64 + i] * SY; - phi_tmp[800 + i] += 2.0 * zc_pow[64 + i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SYY; - phi_tmp[832 + i] += 2.0 * zc_pow[96 + i] * SY; - - phi_tmp[864 + i] = zc_pow[128 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_yy_out + start), npoints); - } - - // Combine YZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[96 + i] * SZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[96 + i] * SY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SZ; 
- - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[64 + i] * yc[i] * SY; - phi_tmp[128 + i] += xc_pow[64 + i] * zc[i] * SZ; - phi_tmp[128 + i] += xc_pow[64 + i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * SZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += xc_pow[32 + i] * yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += xc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * SY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SYZ; - phi_tmp[320 + i] += 4.0 * xc_pow[i] * yc_pow[32 + i] * SZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[352 + i] += xc_pow[i] * yc_pow[32 + i] * SY; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[416 + i] += xc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SYZ; - phi_tmp[448 + i] += 4.0 * xc_pow[i] * zc_pow[32 + i] * SY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SYZ; - phi_tmp[480 + i] += 5.0 * xc[i] * yc_pow[64 + i] * SZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[512 + i] += xc[i] * yc_pow[64 + i] * SY; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[544 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[576 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[576 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[608 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[608 + i] += xc[i] * zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SYZ; - phi_tmp[640 + i] += 5.0 * xc[i] * zc_pow[64 + i] * SY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SYZ; - phi_tmp[672 + i] += 6.0 * yc_pow[96 + i] * SZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SYZ; - phi_tmp[704 + i] += yc_pow[96 + i] * SY; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * S0[i]; - - 
phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SYZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SY; - phi_tmp[736 + i] += 4.0 * yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[736 + i] += 8.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SYZ; - phi_tmp[768 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * SY; - phi_tmp[768 + i] += 3.0 * yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[768 + i] += 9.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SYZ; - phi_tmp[800 + i] += 4.0 * yc_pow[i] * zc_pow[32 + i] * SY; - phi_tmp[800 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[800 + i] += 8.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SYZ; - phi_tmp[832 + i] += 5.0 * yc[i] * zc_pow[64 + i] * SY; - phi_tmp[832 + i] += zc_pow[96 + i] * SZ; - phi_tmp[832 + i] += 5.0 * zc_pow[64 + i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SYZ; - phi_tmp[864 + i] += 6.0 * zc_pow[96 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_yz_out + start), npoints); - } - - // Combine ZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - - phi_tmp[i] = xc_pow[128 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[96 + i] * SZ; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 4.0 * xc_pow[64 + i] * zc[i] * SZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc_pow[i] * SZ; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * xc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[288 + i] += 6.0 * xc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SZZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * xc_pow[i] * yc_pow[32 + i] * SZ; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 4.0 * xc_pow[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * xc_pow[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SZZ; - phi_tmp[448 + i] += 8.0 * xc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[448 + i] += 12.0 * xc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[480 + i] = xc[i] 
* yc_pow[96 + i] * SZZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[512 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SZ; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[544 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[576 + i] += 6.0 * xc[i] * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[576 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[608 + i] += 8.0 * xc[i] * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[608 + i] += 12.0 * xc[i] * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SZZ; - phi_tmp[640 + i] += 10.0 * xc[i] * zc_pow[64 + i] * SZ; - phi_tmp[640 + i] += 20.0 * xc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[672 + i] = yc_pow[128 + i] * SZZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SZZ; - phi_tmp[704 + i] += 2.0 * yc_pow[96 + i] * SZ; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SZZ; - phi_tmp[736 + i] += 4.0 * yc_pow[64 + i] * zc[i] * SZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SZZ; - phi_tmp[768 + i] += 6.0 * yc_pow[32 + i] * zc_pow[i] * SZ; - phi_tmp[768 + i] += 6.0 * yc_pow[32 + i] * zc[i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SZZ; - phi_tmp[800 + i] += 8.0 * yc_pow[i] * zc_pow[32 + i] * SZ; - phi_tmp[800 + i] += 12.0 * yc_pow[i] * zc_pow[i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SZZ; - phi_tmp[832 + i] += 10.0 * yc[i] * zc_pow[64 + i] * SZ; - phi_tmp[832 + i] += 20.0 * yc[i] * zc_pow[32 + i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SZZ; - phi_tmp[864 + i] += 12.0 * zc_pow[96 + i] * SZ; - phi_tmp[864 + i] += 30.0 * zc_pow[64 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_zz_out + start), npoints); - } - - // Combine XXX blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXX = S3[i] * xc[i] * xc[i] * xc[i] + 3 * xc[i] * S2[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXXX; - phi_tmp[i] += 3.0 * 6.0 * xc_pow[96 + i] * SXX; - phi_tmp[i] += 3.0 * 30.0 * xc_pow[64 + i] * SX; - phi_tmp[i] += 120.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXXX; - phi_tmp[32 + i] += 3.0 * 5.0 * xc_pow[64 + i] * yc[i] * SXX; - phi_tmp[32 + i] += 3.0 * 20.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[32 + i] += 60.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXXX; - phi_tmp[64 + i] += 3.0 * 5.0 * xc_pow[64 + i] * zc[i] * SXX; - phi_tmp[64 + i] += 3.0 * 20.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[64 + i] += 60.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXXX; - phi_tmp[96 + i] += 3.0 * 4.0 * xc_pow[32 + i] * yc_pow[i] * SXX; - phi_tmp[96 + i] += 3.0 * 12.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[96 + i] += 24.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[128 + 
i] = xc_pow[64 + i] * yc[i] * zc[i] * SXXX; - phi_tmp[128 + i] += 3.0 * 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SXX; - phi_tmp[128 + i] += 3.0 * 12.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[128 + i] += 24.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXXX; - phi_tmp[160 + i] += 3.0 * 4.0 * xc_pow[32 + i] * zc_pow[i] * SXX; - phi_tmp[160 + i] += 3.0 * 12.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[160 + i] += 24.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXXX; - phi_tmp[192 + i] += 3.0 * 3.0 * xc_pow[i] * yc_pow[32 + i] * SXX; - phi_tmp[192 + i] += 3.0 * 6.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[192 + i] += 6.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXXX; - phi_tmp[224 + i] += 3.0 * 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[224 + i] += 3.0 * 6.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[224 + i] += 6.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXXX; - phi_tmp[256 + i] += 3.0 * 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 3.0 * 6.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 6.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXXX; - phi_tmp[288 + i] += 3.0 * 3.0 * xc_pow[i] * zc_pow[32 + i] * SXX; - phi_tmp[288 + i] += 3.0 * 6.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[288 + i] += 6.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXXX; - phi_tmp[320 + i] += 3.0 * 2.0 * xc[i] * yc_pow[64 + i] * SXX; - phi_tmp[320 + i] += 3.0 * 2.0 * yc_pow[64 + i] * SX; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXXX; - phi_tmp[352 + i] += 3.0 * 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SXX; - phi_tmp[352 + i] += 3.0 * 2.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXXX; - phi_tmp[384 + i] += 3.0 * 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SXX; - phi_tmp[384 + i] += 3.0 * 2.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXXX; - phi_tmp[416 + i] += 3.0 * 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SXX; - phi_tmp[416 + i] += 3.0 * 2.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXXX; - phi_tmp[448 + i] += 3.0 * 2.0 * xc[i] * zc_pow[64 + i] * SXX; - phi_tmp[448 + i] += 3.0 * 2.0 * zc_pow[64 + i] * SX; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXXX; - phi_tmp[480 + i] += 3.0 * yc_pow[96 + i] * SXX; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXXX; - phi_tmp[512 + i] += 3.0 * yc_pow[64 + i] * zc[i] * SXX; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXXX; - phi_tmp[544 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * SXX; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXXX; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[32 + i] * SXX; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXXX; - phi_tmp[608 + i] += 3.0 * yc[i] * zc_pow[64 + i] * SXX; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXXX; - phi_tmp[640 + i] += 3.0 * zc_pow[96 + i] * SXX; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXXX; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXXX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXXX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXXX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXXX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXXX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXXX; - - } - - if (order 
== GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xxx_out + start), npoints); - } - - // Combine XXY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXY = S3[i] * xc[i] * xc[i] * yc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXXY; - phi_tmp[i] += 2.0 * 6.0 * xc_pow[96 + i] * SXY; - phi_tmp[i] += 30.0 * xc_pow[64 + i] * SY; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXXY; - phi_tmp[32 + i] += 2.0 * 5.0 * xc_pow[64 + i] * yc[i] * SXY; - phi_tmp[32 + i] += xc_pow[96 + i] * SXX; - phi_tmp[32 + i] += 20.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[32 + i] += 2.0 * 5.0 * xc_pow[64 + i] * SX; - phi_tmp[32 + i] += 20.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXXY; - phi_tmp[64 + i] += 2.0 * 5.0 * xc_pow[64 + i] * zc[i] * SXY; - phi_tmp[64 + i] += 20.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXXY; - phi_tmp[96 + i] += 2.0 * 4.0 * xc_pow[32 + i] * yc_pow[i] * SXY; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SXX; - phi_tmp[96 + i] += 12.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[96 + i] += 2.0 * 8.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[96 + i] += 24.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXXY; - phi_tmp[128 + i] += 2.0 * 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SXY; - phi_tmp[128 + i] += xc_pow[64 + i] * zc[i] * SXX; - phi_tmp[128 + i] += 12.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[128 + i] += 2.0 * 4.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[128 + i] += 12.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXXY; - phi_tmp[160 + i] += 2.0 * 4.0 * xc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[160 + i] += 12.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXXY; - phi_tmp[192 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[32 + i] * SXY; - phi_tmp[192 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * SXX; - phi_tmp[192 + i] += 6.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[192 + i] += 2.0 * 9.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[192 + i] += 18.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXXY; - phi_tmp[224 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SXX; - phi_tmp[224 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[224 + i] += 2.0 * 6.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[224 + i] += 12.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXXY; - phi_tmp[256 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += xc_pow[32 + i] * zc_pow[i] * SXX; - phi_tmp[256 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[256 + i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[256 + i] += 6.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXXY; - phi_tmp[288 + 
i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[288 + i] += 6.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXXY; - phi_tmp[320 + i] += 2.0 * 2.0 * xc[i] * yc_pow[64 + i] * SXY; - phi_tmp[320 + i] += 4.0 * xc_pow[i] * yc_pow[32 + i] * SXX; - phi_tmp[320 + i] += 2.0 * yc_pow[64 + i] * SY; - phi_tmp[320 + i] += 2.0 * 8.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[320 + i] += 8.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXXY; - phi_tmp[352 + i] += 2.0 * 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[352 + i] += 2.0 * 6.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[352 + i] += 6.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXXY; - phi_tmp[384 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += 4.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXXY; - phi_tmp[416 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += xc_pow[i] * zc_pow[32 + i] * SXX; - phi_tmp[416 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[416 + i] += 2.0 * 2.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[416 + i] += 2.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXXY; - phi_tmp[448 + i] += 2.0 * 2.0 * xc[i] * zc_pow[64 + i] * SXY; - phi_tmp[448 + i] += 2.0 * zc_pow[64 + i] * SY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXXY; - phi_tmp[480 + i] += 2.0 * yc_pow[96 + i] * SXY; - phi_tmp[480 + i] += 5.0 * xc[i] * yc_pow[64 + i] * SXX; - phi_tmp[480 + i] += 2.0 * 5.0 * yc_pow[64 + i] * SX; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXXY; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SXY; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SXX; - phi_tmp[512 + i] += 2.0 * 4.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXXY; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[544 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SXX; - phi_tmp[544 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXXY; - phi_tmp[576 + i] += 2.0 * yc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[576 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SXX; - phi_tmp[576 + i] += 2.0 * 2.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXXY; - phi_tmp[608 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SXY; - phi_tmp[608 + i] += xc[i] * zc_pow[64 + i] * SXX; - phi_tmp[608 + i] += 2.0 * zc_pow[64 + i] * SX; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXXY; - phi_tmp[640 + i] += 2.0 * zc_pow[96 + i] * SXY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXXY; - phi_tmp[672 + i] += 6.0 * yc_pow[96 + i] * SXX; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXXY; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * zc[i] * SXX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXXY; - phi_tmp[736 + i] += 4.0 * yc_pow[32 + i] * zc_pow[i] * SXX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXXY; - phi_tmp[768 + i] += 3.0 * yc_pow[i] * zc_pow[32 + i] * SXX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 
+ i] * SXXY; - phi_tmp[800 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SXX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXXY; - phi_tmp[832 + i] += zc_pow[96 + i] * SXX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xxy_out + start), npoints); - } - - // Combine XXZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SXX = S2[i] * xc[i] * xc[i] + S1[i]; - const double SXXZ = S3[i] * xc[i] * xc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXXZ; - phi_tmp[i] += 2.0 * 6.0 * xc_pow[96 + i] * SXZ; - phi_tmp[i] += 30.0 * xc_pow[64 + i] * SZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXXZ; - phi_tmp[32 + i] += 2.0 * 5.0 * xc_pow[64 + i] * yc[i] * SXZ; - phi_tmp[32 + i] += 20.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXXZ; - phi_tmp[64 + i] += 2.0 * 5.0 * xc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[64 + i] += xc_pow[96 + i] * SXX; - phi_tmp[64 + i] += 20.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[64 + i] += 2.0 * 5.0 * xc_pow[64 + i] * SX; - phi_tmp[64 + i] += 20.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXXZ; - phi_tmp[96 + i] += 2.0 * 4.0 * xc_pow[32 + i] * yc_pow[i] * SXZ; - phi_tmp[96 + i] += 12.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXXZ; - phi_tmp[128 + i] += 2.0 * 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[64 + i] * yc[i] * SXX; - phi_tmp[128 + i] += 12.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[128 + i] += 2.0 * 4.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[128 + i] += 12.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXXZ; - phi_tmp[160 + i] += 2.0 * 4.0 * xc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SXX; - phi_tmp[160 + i] += 12.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[160 + i] += 2.0 * 8.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[160 + i] += 24.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXXZ; - phi_tmp[192 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[32 + i] * SXZ; - phi_tmp[192 + i] += 6.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXXZ; - phi_tmp[224 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc_pow[32 + i] * yc_pow[i] * SXX; - phi_tmp[224 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[224 + i] += 6.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXXZ; - phi_tmp[256 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SXX; - phi_tmp[256 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 2.0 * 6.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[256 + i] += 12.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[288 + 
i] = xc_pow[32 + i] * zc_pow[32 + i] * SXXZ; - phi_tmp[288 + i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * SXX; - phi_tmp[288 + i] += 6.0 * xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[288 + i] += 2.0 * 9.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[288 + i] += 18.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXXZ; - phi_tmp[320 + i] += 2.0 * 2.0 * xc[i] * yc_pow[64 + i] * SXZ; - phi_tmp[320 + i] += 2.0 * yc_pow[64 + i] * SZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXXZ; - phi_tmp[352 + i] += 2.0 * 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[352 + i] += xc_pow[i] * yc_pow[32 + i] * SXX; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[352 + i] += 2.0 * 2.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[352 + i] += 2.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXXZ; - phi_tmp[384 + i] += 2.0 * 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SXX; - phi_tmp[384 + i] += 2.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * 4.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[384 + i] += 4.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXXZ; - phi_tmp[416 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SXX; - phi_tmp[416 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 2.0 * 6.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[416 + i] += 6.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXXZ; - phi_tmp[448 + i] += 2.0 * 2.0 * xc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[448 + i] += 4.0 * xc_pow[i] * zc_pow[32 + i] * SXX; - phi_tmp[448 + i] += 2.0 * zc_pow[64 + i] * SZ; - phi_tmp[448 + i] += 2.0 * 8.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[448 + i] += 8.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXXZ; - phi_tmp[480 + i] += 2.0 * yc_pow[96 + i] * SXZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXXZ; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[512 + i] += xc[i] * yc_pow[64 + i] * SXX; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * SX; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXXZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SXX; - phi_tmp[544 + i] += 2.0 * 2.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXXZ; - phi_tmp[576 + i] += 2.0 * yc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[576 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SXX; - phi_tmp[576 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXXZ; - phi_tmp[608 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[608 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SXX; - phi_tmp[608 + i] += 2.0 * 4.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXXZ; - phi_tmp[640 + i] += 2.0 * zc_pow[96 + i] * SXZ; - phi_tmp[640 + i] += 5.0 * xc[i] * zc_pow[64 + i] * SXX; - phi_tmp[640 + i] += 2.0 * 5.0 * zc_pow[64 + i] * SX; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXXZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXXZ; - phi_tmp[704 + i] += yc_pow[96 + i] * SXX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXXZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * zc[i] * 
SXX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXXZ; - phi_tmp[768 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * SXX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXXZ; - phi_tmp[800 + i] += 4.0 * yc_pow[i] * zc_pow[32 + i] * SXX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXXZ; - phi_tmp[832 + i] += 5.0 * yc[i] * zc_pow[64 + i] * SXX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXXZ; - phi_tmp[864 + i] += 6.0 * zc_pow[96 + i] * SXX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xxz_out + start), npoints); - } - - // Combine XYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SXYY = S3[i] * xc[i] * yc[i] * yc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXYY; - phi_tmp[i] += 6.0 * xc_pow[96 + i] * SYY; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXYY; - phi_tmp[32 + i] += 2.0 * xc_pow[96 + i] * SXY; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * yc[i] * SYY; - phi_tmp[32 + i] += 2.0 * 5.0 * xc_pow[64 + i] * SY; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXYY; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * zc[i] * SYY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXYY; - phi_tmp[96 + i] += 2.0 * 2.0 * xc_pow[64 + i] * yc[i] * SXY; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc_pow[i] * SYY; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * SX; - phi_tmp[96 + i] += 2.0 * 8.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[96 + i] += 8.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXYY; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SXY; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SYY; - phi_tmp[128 + i] += 2.0 * 4.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXYY; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc_pow[i] * SYY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXYY; - phi_tmp[192 + i] += 2.0 * 3.0 * xc_pow[32 + i] * yc_pow[i] * SXY; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * SYY; - phi_tmp[192 + i] += 6.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[192 + i] += 2.0 * 9.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[192 + i] += 18.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXYY; - phi_tmp[224 + i] += 2.0 * 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SXY; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[224 + i] += 2.0 * 6.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[224 + i] += 6.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXYY; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[256 + i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXYY; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * SYY; - - 
phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXYY; - phi_tmp[320 + i] += 2.0 * 4.0 * xc_pow[i] * yc_pow[32 + i] * SXY; - phi_tmp[320 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SYY; - phi_tmp[320 + i] += 12.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[320 + i] += 2.0 * 8.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[320 + i] += 24.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXYY; - phi_tmp[352 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[352 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[352 + i] += 2.0 * 6.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[352 + i] += 12.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXYY; - phi_tmp[384 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[384 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[384 + i] += 4.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXYY; - phi_tmp[416 + i] += 2.0 * xc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[416 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[416 + i] += 2.0 * 2.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXYY; - phi_tmp[448 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SYY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXYY; - phi_tmp[480 + i] += 2.0 * 5.0 * xc[i] * yc_pow[64 + i] * SXY; - phi_tmp[480 + i] += yc_pow[96 + i] * SYY; - phi_tmp[480 + i] += 20.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[480 + i] += 2.0 * 5.0 * yc_pow[64 + i] * SY; - phi_tmp[480 + i] += 20.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXYY; - phi_tmp[512 + i] += 2.0 * 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[512 + i] += yc_pow[64 + i] * zc[i] * SYY; - phi_tmp[512 + i] += 12.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[512 + i] += 2.0 * 4.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[512 + i] += 12.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXYY; - phi_tmp[544 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[544 + i] += yc_pow[32 + i] * zc_pow[i] * SYY; - phi_tmp[544 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[544 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 6.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXYY; - phi_tmp[576 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[576 + i] += yc_pow[i] * zc_pow[32 + i] * SYY; - phi_tmp[576 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[576 + i] += 2.0 * 2.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[576 + i] += 2.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXYY; - phi_tmp[608 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SXY; - phi_tmp[608 + i] += yc[i] * zc_pow[64 + i] * SYY; - phi_tmp[608 + i] += 2.0 * zc_pow[64 + i] * SY; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXYY; - phi_tmp[640 + i] += zc_pow[96 + i] * SYY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXYY; - phi_tmp[672 + i] += 2.0 * 6.0 * yc_pow[96 + i] * SXY; - phi_tmp[672 + i] += 30.0 * yc_pow[64 + i] * SX; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXYY; - phi_tmp[704 + i] += 2.0 * 5.0 * yc_pow[64 + i] * zc[i] * SXY; - phi_tmp[704 + i] += 20.0 * yc_pow[32 
+ i] * zc[i] * SX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXYY; - phi_tmp[736 + i] += 2.0 * 4.0 * yc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[736 + i] += 12.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXYY; - phi_tmp[768 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[768 + i] += 6.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXYY; - phi_tmp[800 + i] += 2.0 * 2.0 * yc[i] * zc_pow[64 + i] * SXY; - phi_tmp[800 + i] += 2.0 * zc_pow[64 + i] * SX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXYY; - phi_tmp[832 + i] += 2.0 * zc_pow[96 + i] * SXY; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xyy_out + start), npoints); - } - - // Combine XYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SXY = S2[i] * xc[i] * yc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SXYZ = S3[i] * xc[i] * yc[i] * zc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXYZ; - phi_tmp[i] += 6.0 * xc_pow[96 + i] * SYZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXYZ; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * yc[i] * SYZ; - phi_tmp[32 + i] += xc_pow[96 + i] * SXZ; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * SZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXYZ; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[64 + i] += xc_pow[96 + i] * SXY; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * SY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXYZ; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc_pow[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SXZ; - phi_tmp[96 + i] += 8.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXYZ; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[128 + i] += xc_pow[64 + i] * yc[i] * SXY; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[128 + i] += xc_pow[64 + i] * SX; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXYZ; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SXY; - phi_tmp[160 + i] += 8.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXYZ; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * SYZ; - phi_tmp[192 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * SXZ; - phi_tmp[192 + i] += 9.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXYZ; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SXZ; - phi_tmp[224 + i] += xc_pow[32 + i] * yc_pow[i] * SXY; - phi_tmp[224 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - 
phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[224 + i] += 12.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXYZ; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += xc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SXY; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[256 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXYZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[288 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[288 + i] += 9.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXYZ; - phi_tmp[320 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SYZ; - phi_tmp[320 + i] += 4.0 * xc_pow[i] * yc_pow[32 + i] * SXZ; - phi_tmp[320 + i] += 8.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXYZ; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[352 + i] += xc_pow[i] * yc_pow[32 + i] * SXY; - phi_tmp[352 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[352 + i] += 18.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXYZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SXY; - phi_tmp[384 + i] += 4.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[384 + i] += 4.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[384 + i] += 8.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXYZ; - phi_tmp[416 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += xc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SXY; - phi_tmp[416 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[416 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[416 + i] += 2.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SXYZ; - phi_tmp[448 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[448 + i] += 4.0 * xc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[448 + i] += 8.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXYZ; - phi_tmp[480 + i] += yc_pow[96 + i] * SYZ; - phi_tmp[480 + i] += 5.0 * xc[i] * yc_pow[64 + i] * SXZ; - phi_tmp[480 + i] += 5.0 * yc_pow[64 + i] * SZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXYZ; - phi_tmp[512 + i] += yc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[512 + i] += xc[i] * yc_pow[64 + i] * SXY; - phi_tmp[512 + i] += 4.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[512 + i] += yc_pow[64 + i] * SY; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[512 + i] += 16.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXYZ; - phi_tmp[544 + i] 
+= yc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[544 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SXY; - phi_tmp[544 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[544 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[544 + i] += 9.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXYZ; - phi_tmp[576 + i] += yc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[576 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[576 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SXY; - phi_tmp[576 + i] += 2.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 3.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[576 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[576 + i] += 4.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXYZ; - phi_tmp[608 + i] += yc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[608 + i] += xc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[608 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SXY; - phi_tmp[608 + i] += zc_pow[64 + i] * SZ; - phi_tmp[608 + i] += 4.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[608 + i] += 4.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[608 + i] += zc_pow[32 + i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXYZ; - phi_tmp[640 + i] += zc_pow[96 + i] * SYZ; - phi_tmp[640 + i] += 5.0 * xc[i] * zc_pow[64 + i] * SXY; - phi_tmp[640 + i] += 5.0 * zc_pow[64 + i] * SY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXYZ; - phi_tmp[672 + i] += 6.0 * yc_pow[96 + i] * SXZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXYZ; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[704 + i] += yc_pow[96 + i] * SXY; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * SX; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXYZ; - phi_tmp[736 + i] += 4.0 * yc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SXY; - phi_tmp[736 + i] += 8.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXYZ; - phi_tmp[768 + i] += 3.0 * yc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[768 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * SXY; - phi_tmp[768 + i] += 9.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXYZ; - phi_tmp[800 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[800 + i] += 4.0 * yc_pow[i] * zc_pow[32 + i] * SXY; - phi_tmp[800 + i] += 8.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXYZ; - phi_tmp[832 + i] += zc_pow[96 + i] * SXZ; - phi_tmp[832 + i] += 5.0 * yc[i] * zc_pow[64 + i] * SXY; - phi_tmp[832 + i] += 5.0 * zc_pow[64 + i] * SX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXYZ; - phi_tmp[864 + i] += 6.0 * zc_pow[96 + i] * SXY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_xyz_out + start), npoints); - } - - // Combine XZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SX = S1[i] * xc[i]; - const double SZ = S1[i] * zc[i]; - const double SXZ = S2[i] * xc[i] * zc[i]; - 
const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SXZZ = S3[i] * xc[i] * zc[i] * zc[i] + S2[i] * xc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SXZZ; - phi_tmp[i] += 6.0 * xc_pow[96 + i] * SZZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SXZZ; - phi_tmp[32 + i] += 5.0 * xc_pow[64 + i] * yc[i] * SZZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SXZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[96 + i] * SXZ; - phi_tmp[64 + i] += 5.0 * xc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[64 + i] += 2.0 * 5.0 * xc_pow[64 + i] * SZ; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SXZZ; - phi_tmp[96 + i] += 4.0 * xc_pow[32 + i] * yc_pow[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SXZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SXZ; - phi_tmp[128 + i] += 4.0 * xc_pow[32 + i] * yc[i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * 4.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SXZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * xc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[160 + i] += 4.0 * xc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * SX; - phi_tmp[160 + i] += 2.0 * 8.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[160 + i] += 8.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SXZZ; - phi_tmp[192 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SXZZ; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc_pow[i] * SXZ; - phi_tmp[224 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SXZZ; - phi_tmp[256 + i] += 2.0 * 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SXZ; - phi_tmp[256 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SX; - phi_tmp[256 + i] += 2.0 * 6.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[256 + i] += 6.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SXZZ; - phi_tmp[288 + i] += 2.0 * 3.0 * xc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[288 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[288 + i] += 6.0 * xc_pow[32 + i] * zc[i] * SX; - phi_tmp[288 + i] += 2.0 * 9.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[288 + i] += 18.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SXZZ; - phi_tmp[320 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SZZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SXZZ; - phi_tmp[352 + i] += 2.0 * xc_pow[i] * yc_pow[32 + i] * SXZ; - phi_tmp[352 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * 2.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SXZZ; - phi_tmp[384 + i] += 2.0 * 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SXZ; - phi_tmp[384 + i] += 2.0 * xc[i] * yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * SX; - phi_tmp[384 + i] += 2.0 * 4.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SXZZ; - phi_tmp[416 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SXZ; - phi_tmp[416 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * SX; - phi_tmp[416 + i] += 2.0 * 6.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 12.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * 
zc_pow[64 + i] * SXZZ; - phi_tmp[448 + i] += 2.0 * 4.0 * xc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[448 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[448 + i] += 12.0 * xc_pow[i] * zc_pow[i] * SX; - phi_tmp[448 + i] += 2.0 * 8.0 * xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[448 + i] += 24.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SXZZ; - phi_tmp[480 + i] += yc_pow[96 + i] * SZZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SXZZ; - phi_tmp[512 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SXZ; - phi_tmp[512 + i] += yc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[512 + i] += 2.0 * yc_pow[64 + i] * SZ; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SXZZ; - phi_tmp[544 + i] += 2.0 * 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SXZ; - phi_tmp[544 + i] += yc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SX; - phi_tmp[544 + i] += 2.0 * 2.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[544 + i] += 2.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SXZZ; - phi_tmp[576 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SXZ; - phi_tmp[576 + i] += yc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[576 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * SX; - phi_tmp[576 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[576 + i] += 6.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SXZZ; - phi_tmp[608 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SXZ; - phi_tmp[608 + i] += yc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[608 + i] += 12.0 * xc[i] * yc[i] * zc_pow[i] * SX; - phi_tmp[608 + i] += 2.0 * 4.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[608 + i] += 12.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SXZZ; - phi_tmp[640 + i] += 2.0 * 5.0 * xc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[640 + i] += zc_pow[96 + i] * SZZ; - phi_tmp[640 + i] += 20.0 * xc[i] * zc_pow[32 + i] * SX; - phi_tmp[640 + i] += 2.0 * 5.0 * zc_pow[64 + i] * SZ; - phi_tmp[640 + i] += 20.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[672 + i] = yc_pow[128 + i] * SXZZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SXZZ; - phi_tmp[704 + i] += 2.0 * yc_pow[96 + i] * SXZ; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SXZZ; - phi_tmp[736 + i] += 2.0 * 2.0 * yc_pow[64 + i] * zc[i] * SXZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * SX; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SXZZ; - phi_tmp[768 + i] += 2.0 * 3.0 * yc_pow[32 + i] * zc_pow[i] * SXZ; - phi_tmp[768 + i] += 6.0 * yc_pow[32 + i] * zc[i] * SX; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SXZZ; - phi_tmp[800 + i] += 2.0 * 4.0 * yc_pow[i] * zc_pow[32 + i] * SXZ; - phi_tmp[800 + i] += 12.0 * yc_pow[i] * zc_pow[i] * SX; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SXZZ; - phi_tmp[832 + i] += 2.0 * 5.0 * yc[i] * zc_pow[64 + i] * SXZ; - phi_tmp[832 + i] += 20.0 * yc[i] * zc_pow[32 + i] * SX; - - phi_tmp[864 + i] = zc_pow[128 + i] * SXZZ; - phi_tmp[864 + i] += 2.0 * 6.0 * zc_pow[96 + i] * SXZ; - phi_tmp[864 + i] += 30.0 * zc_pow[64 + i] * SX; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_xzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, 
phi_tmp, 32, (phi_xzz_out + start), npoints); - } - - // Combine YYY blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SYYY = S3[i] * yc[i] * yc[i] * yc[i] + 3 * yc[i] * S2[i]; - - phi_tmp[i] = xc_pow[128 + i] * SYYY; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SYYY; - phi_tmp[32 + i] += 3.0 * xc_pow[96 + i] * SYY; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SYYY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SYYY; - phi_tmp[96 + i] += 3.0 * 2.0 * xc_pow[64 + i] * yc[i] * SYY; - phi_tmp[96 + i] += 3.0 * 2.0 * xc_pow[64 + i] * SY; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SYYY; - phi_tmp[128 + i] += 3.0 * xc_pow[64 + i] * zc[i] * SYY; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SYYY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SYYY; - phi_tmp[192 + i] += 3.0 * 3.0 * xc_pow[32 + i] * yc_pow[i] * SYY; - phi_tmp[192 + i] += 3.0 * 6.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[192 + i] += 6.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SYYY; - phi_tmp[224 + i] += 3.0 * 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SYY; - phi_tmp[224 + i] += 3.0 * 2.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SYYY; - phi_tmp[256 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * SYY; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SYYY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SYYY; - phi_tmp[320 + i] += 3.0 * 4.0 * xc_pow[i] * yc_pow[32 + i] * SYY; - phi_tmp[320 + i] += 3.0 * 12.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[320 + i] += 24.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SYYY; - phi_tmp[352 + i] += 3.0 * 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[352 + i] += 3.0 * 6.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[352 + i] += 6.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SYYY; - phi_tmp[384 + i] += 3.0 * 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[384 + i] += 3.0 * 2.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SYYY; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * zc_pow[32 + i] * SYY; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SYYY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SYYY; - phi_tmp[480 + i] += 3.0 * 5.0 * xc[i] * yc_pow[64 + i] * SYY; - phi_tmp[480 + i] += 3.0 * 20.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[480 + i] += 60.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SYYY; - phi_tmp[512 + i] += 3.0 * 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[512 + i] += 3.0 * 12.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[512 + i] += 24.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SYYY; - phi_tmp[544 + i] += 3.0 * 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[544 + i] += 3.0 * 6.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[544 + i] += 6.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SYYY; - phi_tmp[576 + i] += 3.0 * 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[576 + i] += 3.0 * 2.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SYYY; - phi_tmp[608 + i] += 3.0 * xc[i] * zc_pow[64 + i] * SYY; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SYYY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SYYY; 
- phi_tmp[672 + i] += 3.0 * 6.0 * yc_pow[96 + i] * SYY; - phi_tmp[672 + i] += 3.0 * 30.0 * yc_pow[64 + i] * SY; - phi_tmp[672 + i] += 120.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SYYY; - phi_tmp[704 + i] += 3.0 * 5.0 * yc_pow[64 + i] * zc[i] * SYY; - phi_tmp[704 + i] += 3.0 * 20.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[704 + i] += 60.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SYYY; - phi_tmp[736 + i] += 3.0 * 4.0 * yc_pow[32 + i] * zc_pow[i] * SYY; - phi_tmp[736 + i] += 3.0 * 12.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[736 + i] += 24.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SYYY; - phi_tmp[768 + i] += 3.0 * 3.0 * yc_pow[i] * zc_pow[32 + i] * SYY; - phi_tmp[768 + i] += 3.0 * 6.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[768 + i] += 6.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SYYY; - phi_tmp[800 + i] += 3.0 * 2.0 * yc[i] * zc_pow[64 + i] * SYY; - phi_tmp[800 + i] += 3.0 * 2.0 * zc_pow[64 + i] * SY; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SYYY; - phi_tmp[832 + i] += 3.0 * zc_pow[96 + i] * SYY; - - phi_tmp[864 + i] = zc_pow[128 + i] * SYYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_yyy_out + start), npoints); - } - - // Combine YYZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SYY = S2[i] * yc[i] * yc[i] + S1[i]; - const double SYYZ = S3[i] * yc[i] * yc[i] * zc[i] + S2[i] * zc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SYYZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SYYZ; - phi_tmp[32 + i] += 2.0 * xc_pow[96 + i] * SYZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SYYZ; - phi_tmp[64 + i] += xc_pow[96 + i] * SYY; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SYYZ; - phi_tmp[96 + i] += 2.0 * 2.0 * xc_pow[64 + i] * yc[i] * SYZ; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * SZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SYYZ; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[64 + i] * yc[i] * SYY; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * SY; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SYYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * zc[i] * SYY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SYYZ; - phi_tmp[192 + i] += 2.0 * 3.0 * xc_pow[32 + i] * yc_pow[i] * SYZ; - phi_tmp[192 + i] += 6.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SYYZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SYZ; - phi_tmp[224 + i] += xc_pow[32 + i] * yc_pow[i] * SYY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SYYZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SYY; - 
phi_tmp[256 + i] += 2.0 * 2.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SYYZ; - phi_tmp[288 + i] += 3.0 * xc_pow[32 + i] * zc_pow[i] * SYY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SYYZ; - phi_tmp[320 + i] += 2.0 * 4.0 * xc_pow[i] * yc_pow[32 + i] * SYZ; - phi_tmp[320 + i] += 12.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SYYZ; - phi_tmp[352 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[352 + i] += xc_pow[i] * yc_pow[32 + i] * SYY; - phi_tmp[352 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[352 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[352 + i] += 6.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SYYZ; - phi_tmp[384 + i] += 2.0 * 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SYY; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[384 + i] += 2.0 * 4.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[384 + i] += 4.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SYYZ; - phi_tmp[416 + i] += 2.0 * xc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[416 + i] += 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SYY; - phi_tmp[416 + i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SYYZ; - phi_tmp[448 + i] += 4.0 * xc_pow[i] * zc_pow[32 + i] * SYY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SYYZ; - phi_tmp[480 + i] += 2.0 * 5.0 * xc[i] * yc_pow[64 + i] * SYZ; - phi_tmp[480 + i] += 20.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SYYZ; - phi_tmp[512 + i] += 2.0 * 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[512 + i] += xc[i] * yc_pow[64 + i] * SYY; - phi_tmp[512 + i] += 12.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[512 + i] += 2.0 * 4.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[512 + i] += 12.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SYYZ; - phi_tmp[544 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SYY; - phi_tmp[544 + i] += 6.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[544 + i] += 2.0 * 6.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[544 + i] += 12.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SYYZ; - phi_tmp[576 + i] += 2.0 * 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[576 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SYY; - phi_tmp[576 + i] += 2.0 * xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[576 + i] += 2.0 * 6.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[576 + i] += 6.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SYYZ; - phi_tmp[608 + i] += 2.0 * xc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[608 + i] += 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SYY; - phi_tmp[608 + i] += 2.0 * 4.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SYYZ; - phi_tmp[640 + i] += 5.0 * xc[i] * zc_pow[64 + i] * SYY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SYYZ; - phi_tmp[672 + i] += 2.0 * 6.0 * yc_pow[96 + i] * SYZ; - phi_tmp[672 + i] += 30.0 * yc_pow[64 + i] * SZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SYYZ; - phi_tmp[704 + i] += 2.0 * 5.0 * yc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[704 + i] += yc_pow[96 + i] * SYY; - phi_tmp[704 + i] += 20.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[704 + 
i] += 2.0 * 5.0 * yc_pow[64 + i] * SY; - phi_tmp[704 + i] += 20.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SYYZ; - phi_tmp[736 + i] += 2.0 * 4.0 * yc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * zc[i] * SYY; - phi_tmp[736 + i] += 12.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[736 + i] += 2.0 * 8.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[736 + i] += 24.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SYYZ; - phi_tmp[768 + i] += 2.0 * 3.0 * yc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[768 + i] += 3.0 * yc_pow[32 + i] * zc_pow[i] * SYY; - phi_tmp[768 + i] += 6.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[768 + i] += 2.0 * 9.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[768 + i] += 18.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SYYZ; - phi_tmp[800 + i] += 2.0 * 2.0 * yc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[800 + i] += 4.0 * yc_pow[i] * zc_pow[32 + i] * SYY; - phi_tmp[800 + i] += 2.0 * zc_pow[64 + i] * SZ; - phi_tmp[800 + i] += 2.0 * 8.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[800 + i] += 8.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SYYZ; - phi_tmp[832 + i] += 2.0 * zc_pow[96 + i] * SYZ; - phi_tmp[832 + i] += 5.0 * yc[i] * zc_pow[64 + i] * SYY; - phi_tmp[832 + i] += 2.0 * 5.0 * zc_pow[64 + i] * SY; - - phi_tmp[864 + i] = zc_pow[128 + i] * SYYZ; - phi_tmp[864 + i] += 6.0 * zc_pow[96 + i] * SYY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_yyz_out + start), npoints); - } - - // Combine YZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SY = S1[i] * yc[i]; - const double SZ = S1[i] * zc[i]; - const double SYZ = S2[i] * yc[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SYZZ = S3[i] * yc[i] * zc[i] * zc[i] + S2[i] * yc[i]; - - phi_tmp[i] = xc_pow[128 + i] * SYZZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SYZZ; - phi_tmp[32 + i] += xc_pow[96 + i] * SZZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SYZZ; - phi_tmp[64 + i] += 2.0 * xc_pow[96 + i] * SYZ; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SYZZ; - phi_tmp[96 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SZZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SYZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * yc[i] * SYZ; - phi_tmp[128 + i] += xc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[128 + i] += 2.0 * xc_pow[64 + i] * SZ; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SYZZ; - phi_tmp[160 + i] += 2.0 * 2.0 * xc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[160 + i] += 2.0 * xc_pow[64 + i] * SY; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SYZZ; - phi_tmp[192 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * SZZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SYZZ; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc_pow[i] * SYZ; - phi_tmp[224 + i] += 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SZZ; - phi_tmp[224 + i] += 2.0 * 2.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SYZZ; - phi_tmp[256 + i] += 2.0 * 2.0 * xc_pow[32 
+ i] * yc[i] * zc[i] * SYZ; - phi_tmp[256 + i] += xc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * yc[i] * SY; - phi_tmp[256 + i] += 2.0 * 2.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[256 + i] += 2.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SYZZ; - phi_tmp[288 + i] += 2.0 * 3.0 * xc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[288 + i] += 6.0 * xc_pow[32 + i] * zc[i] * SY; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SYZZ; - phi_tmp[320 + i] += 4.0 * xc_pow[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SYZZ; - phi_tmp[352 + i] += 2.0 * xc_pow[i] * yc_pow[32 + i] * SYZ; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[352 + i] += 2.0 * 3.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SYZZ; - phi_tmp[384 + i] += 2.0 * 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SYZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[384 + i] += 2.0 * xc_pow[i] * yc_pow[i] * SY; - phi_tmp[384 + i] += 2.0 * 4.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[384 + i] += 4.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SYZZ; - phi_tmp[416 + i] += 2.0 * 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SYZ; - phi_tmp[416 + i] += xc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[416 + i] += 6.0 * xc_pow[i] * yc[i] * zc[i] * SY; - phi_tmp[416 + i] += 2.0 * 3.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[416 + i] += 6.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SYZZ; - phi_tmp[448 + i] += 2.0 * 4.0 * xc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[448 + i] += 12.0 * xc_pow[i] * zc_pow[i] * SY; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SYZZ; - phi_tmp[480 + i] += 5.0 * xc[i] * yc_pow[64 + i] * SZZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SYZZ; - phi_tmp[512 + i] += 2.0 * xc[i] * yc_pow[64 + i] * SYZ; - phi_tmp[512 + i] += 4.0 * xc[i] * yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[512 + i] += 2.0 * 4.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SYZZ; - phi_tmp[544 + i] += 2.0 * 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SYZ; - phi_tmp[544 + i] += 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[544 + i] += 2.0 * xc[i] * yc_pow[32 + i] * SY; - phi_tmp[544 + i] += 2.0 * 6.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[544 + i] += 6.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SYZZ; - phi_tmp[576 + i] += 2.0 * 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SYZ; - phi_tmp[576 + i] += 2.0 * xc[i] * yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[576 + i] += 6.0 * xc[i] * yc_pow[i] * zc[i] * SY; - phi_tmp[576 + i] += 2.0 * 6.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[576 + i] += 12.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SYZZ; - phi_tmp[608 + i] += 2.0 * 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SYZ; - phi_tmp[608 + i] += xc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[608 + i] += 12.0 * xc[i] * yc[i] * zc_pow[i] * SY; - phi_tmp[608 + i] += 2.0 * 4.0 * xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[608 + i] += 12.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SYZZ; - phi_tmp[640 + i] += 2.0 * 5.0 * xc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[640 + i] += 20.0 * xc[i] * zc_pow[32 + i] * SY; - - phi_tmp[672 + i] = yc_pow[128 + i] * SYZZ; - phi_tmp[672 + i] += 6.0 * yc_pow[96 + i] * SZZ; - - phi_tmp[704 + i] = yc_pow[96 + 
i] * zc[i] * SYZZ; - phi_tmp[704 + i] += 2.0 * yc_pow[96 + i] * SYZ; - phi_tmp[704 + i] += 5.0 * yc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[704 + i] += 2.0 * 5.0 * yc_pow[64 + i] * SZ; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SYZZ; - phi_tmp[736 + i] += 2.0 * 2.0 * yc_pow[64 + i] * zc[i] * SYZ; - phi_tmp[736 + i] += 4.0 * yc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[736 + i] += 2.0 * yc_pow[64 + i] * SY; - phi_tmp[736 + i] += 2.0 * 8.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[736 + i] += 8.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SYZZ; - phi_tmp[768 + i] += 2.0 * 3.0 * yc_pow[32 + i] * zc_pow[i] * SYZ; - phi_tmp[768 + i] += 3.0 * yc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[768 + i] += 6.0 * yc_pow[32 + i] * zc[i] * SY; - phi_tmp[768 + i] += 2.0 * 9.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[768 + i] += 18.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SYZZ; - phi_tmp[800 + i] += 2.0 * 4.0 * yc_pow[i] * zc_pow[32 + i] * SYZ; - phi_tmp[800 + i] += 2.0 * yc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[800 + i] += 12.0 * yc_pow[i] * zc_pow[i] * SY; - phi_tmp[800 + i] += 2.0 * 8.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[800 + i] += 24.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SYZZ; - phi_tmp[832 + i] += 2.0 * 5.0 * yc[i] * zc_pow[64 + i] * SYZ; - phi_tmp[832 + i] += zc_pow[96 + i] * SZZ; - phi_tmp[832 + i] += 20.0 * yc[i] * zc_pow[32 + i] * SY; - phi_tmp[832 + i] += 2.0 * 5.0 * zc_pow[64 + i] * SZ; - phi_tmp[832 + i] += 20.0 * zc_pow[32 + i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SYZZ; - phi_tmp[864 + i] += 2.0 * 6.0 * zc_pow[96 + i] * SYZ; - phi_tmp[864 + i] += 30.0 * zc_pow[64 + i] * SY; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_yzz_out + start), npoints); - } - - // Combine ZZZ blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double SZ = S1[i] * zc[i]; - const double SZZ = S2[i] * zc[i] * zc[i] + S1[i]; - const double SZZZ = S3[i] * zc[i] * zc[i] * zc[i] + 3 * zc[i] * S2[i]; - - phi_tmp[i] = xc_pow[128 + i] * SZZZ; - - phi_tmp[32 + i] = xc_pow[96 + i] * yc[i] * SZZZ; - - phi_tmp[64 + i] = xc_pow[96 + i] * zc[i] * SZZZ; - phi_tmp[64 + i] += 3.0 * xc_pow[96 + i] * SZZ; - - phi_tmp[96 + i] = xc_pow[64 + i] * yc_pow[i] * SZZZ; - - phi_tmp[128 + i] = xc_pow[64 + i] * yc[i] * zc[i] * SZZZ; - phi_tmp[128 + i] += 3.0 * xc_pow[64 + i] * yc[i] * SZZ; - - phi_tmp[160 + i] = xc_pow[64 + i] * zc_pow[i] * SZZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * xc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[160 + i] += 3.0 * 2.0 * xc_pow[64 + i] * SZ; - - phi_tmp[192 + i] = xc_pow[32 + i] * yc_pow[32 + i] * SZZZ; - - phi_tmp[224 + i] = xc_pow[32 + i] * yc_pow[i] * zc[i] * SZZZ; - phi_tmp[224 + i] += 3.0 * xc_pow[32 + i] * yc_pow[i] * SZZ; - - phi_tmp[256 + i] = xc_pow[32 + i] * yc[i] * zc_pow[i] * SZZZ; - phi_tmp[256 + i] += 3.0 * 2.0 * xc_pow[32 + i] * yc[i] * zc[i] * SZZ; - phi_tmp[256 + i] += 3.0 * 2.0 * xc_pow[32 + i] * yc[i] * SZ; - - phi_tmp[288 + i] = xc_pow[32 + i] * zc_pow[32 + i] * SZZZ; - phi_tmp[288 + i] += 3.0 * 3.0 * xc_pow[32 + i] 
* zc_pow[i] * SZZ; - phi_tmp[288 + i] += 3.0 * 6.0 * xc_pow[32 + i] * zc[i] * SZ; - phi_tmp[288 + i] += 6.0 * xc_pow[32 + i] * S0[i]; - - phi_tmp[320 + i] = xc_pow[i] * yc_pow[64 + i] * SZZZ; - - phi_tmp[352 + i] = xc_pow[i] * yc_pow[32 + i] * zc[i] * SZZZ; - phi_tmp[352 + i] += 3.0 * xc_pow[i] * yc_pow[32 + i] * SZZ; - - phi_tmp[384 + i] = xc_pow[i] * yc_pow[i] * zc_pow[i] * SZZZ; - phi_tmp[384 + i] += 3.0 * 2.0 * xc_pow[i] * yc_pow[i] * zc[i] * SZZ; - phi_tmp[384 + i] += 3.0 * 2.0 * xc_pow[i] * yc_pow[i] * SZ; - - phi_tmp[416 + i] = xc_pow[i] * yc[i] * zc_pow[32 + i] * SZZZ; - phi_tmp[416 + i] += 3.0 * 3.0 * xc_pow[i] * yc[i] * zc_pow[i] * SZZ; - phi_tmp[416 + i] += 3.0 * 6.0 * xc_pow[i] * yc[i] * zc[i] * SZ; - phi_tmp[416 + i] += 6.0 * xc_pow[i] * yc[i] * S0[i]; - - phi_tmp[448 + i] = xc_pow[i] * zc_pow[64 + i] * SZZZ; - phi_tmp[448 + i] += 3.0 * 4.0 * xc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[448 + i] += 3.0 * 12.0 * xc_pow[i] * zc_pow[i] * SZ; - phi_tmp[448 + i] += 24.0 * xc_pow[i] * zc[i] * S0[i]; - - phi_tmp[480 + i] = xc[i] * yc_pow[96 + i] * SZZZ; - - phi_tmp[512 + i] = xc[i] * yc_pow[64 + i] * zc[i] * SZZZ; - phi_tmp[512 + i] += 3.0 * xc[i] * yc_pow[64 + i] * SZZ; - - phi_tmp[544 + i] = xc[i] * yc_pow[32 + i] * zc_pow[i] * SZZZ; - phi_tmp[544 + i] += 3.0 * 2.0 * xc[i] * yc_pow[32 + i] * zc[i] * SZZ; - phi_tmp[544 + i] += 3.0 * 2.0 * xc[i] * yc_pow[32 + i] * SZ; - - phi_tmp[576 + i] = xc[i] * yc_pow[i] * zc_pow[32 + i] * SZZZ; - phi_tmp[576 + i] += 3.0 * 3.0 * xc[i] * yc_pow[i] * zc_pow[i] * SZZ; - phi_tmp[576 + i] += 3.0 * 6.0 * xc[i] * yc_pow[i] * zc[i] * SZ; - phi_tmp[576 + i] += 6.0 * xc[i] * yc_pow[i] * S0[i]; - - phi_tmp[608 + i] = xc[i] * yc[i] * zc_pow[64 + i] * SZZZ; - phi_tmp[608 + i] += 3.0 * 4.0 * xc[i] * yc[i] * zc_pow[32 + i] * SZZ; - phi_tmp[608 + i] += 3.0 * 12.0 * xc[i] * yc[i] * zc_pow[i] * SZ; - phi_tmp[608 + i] += 24.0 * xc[i] * yc[i] * zc[i] * S0[i]; - - phi_tmp[640 + i] = xc[i] * zc_pow[96 + i] * SZZZ; - phi_tmp[640 + i] += 3.0 * 5.0 * xc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[640 + i] += 3.0 * 20.0 * xc[i] * zc_pow[32 + i] * SZ; - phi_tmp[640 + i] += 60.0 * xc[i] * zc_pow[i] * S0[i]; - - phi_tmp[672 + i] = yc_pow[128 + i] * SZZZ; - - phi_tmp[704 + i] = yc_pow[96 + i] * zc[i] * SZZZ; - phi_tmp[704 + i] += 3.0 * yc_pow[96 + i] * SZZ; - - phi_tmp[736 + i] = yc_pow[64 + i] * zc_pow[i] * SZZZ; - phi_tmp[736 + i] += 3.0 * 2.0 * yc_pow[64 + i] * zc[i] * SZZ; - phi_tmp[736 + i] += 3.0 * 2.0 * yc_pow[64 + i] * SZ; - - phi_tmp[768 + i] = yc_pow[32 + i] * zc_pow[32 + i] * SZZZ; - phi_tmp[768 + i] += 3.0 * 3.0 * yc_pow[32 + i] * zc_pow[i] * SZZ; - phi_tmp[768 + i] += 3.0 * 6.0 * yc_pow[32 + i] * zc[i] * SZ; - phi_tmp[768 + i] += 6.0 * yc_pow[32 + i] * S0[i]; - - phi_tmp[800 + i] = yc_pow[i] * zc_pow[64 + i] * SZZZ; - phi_tmp[800 + i] += 3.0 * 4.0 * yc_pow[i] * zc_pow[32 + i] * SZZ; - phi_tmp[800 + i] += 3.0 * 12.0 * yc_pow[i] * zc_pow[i] * SZ; - phi_tmp[800 + i] += 24.0 * yc_pow[i] * zc[i] * S0[i]; - - phi_tmp[832 + i] = yc[i] * zc_pow[96 + i] * SZZZ; - phi_tmp[832 + i] += 3.0 * 5.0 * yc[i] * zc_pow[64 + i] * SZZ; - phi_tmp[832 + i] += 3.0 * 20.0 * yc[i] * zc_pow[32 + i] * SZ; - phi_tmp[832 + i] += 60.0 * yc[i] * zc_pow[i] * S0[i]; - - phi_tmp[864 + i] = zc_pow[128 + i] * SZZZ; - phi_tmp[864 + i] += 3.0 * 6.0 * zc_pow[96 + i] * SZZ; - phi_tmp[864 + i] += 3.0 * 30.0 * zc_pow[64 + i] * SZ; - phi_tmp[864 + i] += 120.0 * zc_pow[32 + i] * S0[i]; - - } - - if (order == GG_SPHERICAL_CCA) { - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_zzz_out + start), npoints); - } 
else if (order == GG_SPHERICAL_GAUSSIAN) {
-            gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_zzz_out + start), npoints);
-        } else if (order == GG_CARTESIAN_CCA) {
-            gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_zzz_out + start), npoints);
-        } else if (order == GG_CARTESIAN_MOLDEN) {
-            gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_zzz_out + start), npoints);
-        }
-
-    }
-
-    // Free S temporaries
-    ALIGNED_FREE(cache_data);
-    ALIGNED_FREE(expn1);
-    ALIGNED_FREE(expn2);
-
-    // Free Power temporaries
-    ALIGNED_FREE(xc_pow);
-    ALIGNED_FREE(yc_pow);
-    ALIGNED_FREE(zc_pow);
-
-    // Free inner temporaries
-    ALIGNED_FREE(phi_tmp);
-
-}
diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_helper.c b/third_party/gauxc/external/gau2grid/generated_source/gau2grid_helper.c
deleted file mode 100644
index e5868df..0000000
--- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_helper.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * This is a Gau2Grid automatically generated C file.
- *
- * More details can found at the following repo:
- * https://github.com/dgasmith/gau2grid
- */
-
-#include
-#if defined(__clang__) && defined(_MSC_VER)
-#include
-#elif defined __clang__
-#include
-#elif defined _MSC_VER
-#include
-#else
-#include
-#endif
-
-#include "gau2grid/gau2grid.h"
-#include "gau2grid/gau2grid_utility.h"
-#include "gau2grid/gau2grid_pragma.h"
-
-// Information helpers
-int gg_max_L() { return 6; }
-
-int gg_ncomponents(const int L, const int spherical) {
-    if (spherical) {
-        return 2 * L + 1;
-    } else {
-        return (L + 2) * (L + 1) / 2;
-    }
-}
-
-// Collocation selector functions
-void gg_orbitals(int L, const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out) {
-    // Chooses the correct function for a given L
-    if (L == 0) {
-        gg_orbitals_L0(C, norbitals, npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, orbital_out);
-    } else if (L == 1) {
-        gg_orbitals_L1(C, norbitals, npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, orbital_out);
-    } else if (L == 2) {
-        gg_orbitals_L2(C, norbitals, npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, orbital_out);
-    } else if (L == 3) {
-        gg_orbitals_L3(C, norbitals, npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, orbital_out);
-    } else if (L == 4) {
-        gg_orbitals_L4(C, norbitals, npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, orbital_out);
-    } else if (L == 5) {
-        gg_orbitals_L5(C, norbitals, npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, orbital_out);
-    } else if (L == 6) {
-        gg_orbitals_L6(C, norbitals, npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, orbital_out);
-    } else {
-        exit(0);
-    }
-}
-void gg_collocation(int L, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out) {
-    // Chooses the correct function for a given L
-    if (L == 0) {
-        gg_collocation_L0(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out);
-    } else if (L == 1) {
-        gg_collocation_L1(npoints, xyz,
xyz_stride, nprim, coeffs, exponents, center, order, phi_out); - } else if (L == 2) { - gg_collocation_L2(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out); - } else if (L == 3) { - gg_collocation_L3(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out); - } else if (L == 4) { - gg_collocation_L4(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out); - } else if (L == 5) { - gg_collocation_L5(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out); - } else if (L == 6) { - gg_collocation_L6(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out); - } else { - exit(0); - } -} -void gg_collocation_deriv1(int L, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out) { - // Chooses the correct function for a given L - if (L == 0) { - gg_collocation_L0_deriv1(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out); - } else if (L == 1) { - gg_collocation_L1_deriv1(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out); - } else if (L == 2) { - gg_collocation_L2_deriv1(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out); - } else if (L == 3) { - gg_collocation_L3_deriv1(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out); - } else if (L == 4) { - gg_collocation_L4_deriv1(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out); - } else if (L == 5) { - gg_collocation_L5_deriv1(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out); - } else if (L == 6) { - gg_collocation_L6_deriv1(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out); - } else { - exit(0); - } -} -void gg_collocation_deriv2(int L, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out) { - // Chooses the correct function for a given L - if (L == 0) { - gg_collocation_L0_deriv2(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out); - } else if (L == 1) { - gg_collocation_L1_deriv2(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out); - } else if (L == 2) { - gg_collocation_L2_deriv2(npoints, xyz, xyz_stride, nprim, coeffs, exponents, 
center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out); - } else if (L == 3) { - gg_collocation_L3_deriv2(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out); - } else if (L == 4) { - gg_collocation_L4_deriv2(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out); - } else if (L == 5) { - gg_collocation_L5_deriv2(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out); - } else if (L == 6) { - gg_collocation_L6_deriv2(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out); - } else { - exit(0); - } -} -void gg_collocation_deriv3(int L, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out, double* PRAGMA_RESTRICT phi_x_out, double* PRAGMA_RESTRICT phi_y_out, double* PRAGMA_RESTRICT phi_z_out, double* PRAGMA_RESTRICT phi_xx_out, double* PRAGMA_RESTRICT phi_xy_out, double* PRAGMA_RESTRICT phi_xz_out, double* PRAGMA_RESTRICT phi_yy_out, double* PRAGMA_RESTRICT phi_yz_out, double* PRAGMA_RESTRICT phi_zz_out, double* PRAGMA_RESTRICT phi_xxx_out, double* PRAGMA_RESTRICT phi_xxy_out, double* PRAGMA_RESTRICT phi_xxz_out, double* PRAGMA_RESTRICT phi_xyy_out, double* PRAGMA_RESTRICT phi_xyz_out, double* PRAGMA_RESTRICT phi_xzz_out, double* PRAGMA_RESTRICT phi_yyy_out, double* PRAGMA_RESTRICT phi_yyz_out, double* PRAGMA_RESTRICT phi_yzz_out, double* PRAGMA_RESTRICT phi_zzz_out) { - // Chooses the correct function for a given L - if (L == 0) { - gg_collocation_L0_deriv3(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out, phi_xxx_out, phi_xxy_out, phi_xxz_out, phi_xyy_out, phi_xyz_out, phi_xzz_out, phi_yyy_out, phi_yyz_out, phi_yzz_out, phi_zzz_out); - } else if (L == 1) { - gg_collocation_L1_deriv3(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out, phi_xxx_out, phi_xxy_out, phi_xxz_out, phi_xyy_out, phi_xyz_out, phi_xzz_out, phi_yyy_out, phi_yyz_out, phi_yzz_out, phi_zzz_out); - } else if (L == 2) { - gg_collocation_L2_deriv3(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out, phi_xxx_out, phi_xxy_out, phi_xxz_out, phi_xyy_out, phi_xyz_out, phi_xzz_out, phi_yyy_out, phi_yyz_out, phi_yzz_out, phi_zzz_out); - } else if (L == 3) { - gg_collocation_L3_deriv3(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out, phi_xxx_out, phi_xxy_out, phi_xxz_out, phi_xyy_out, phi_xyz_out, phi_xzz_out, phi_yyy_out, 
phi_yyz_out, phi_yzz_out, phi_zzz_out);
-    } else if (L == 4) {
-        gg_collocation_L4_deriv3(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out, phi_xxx_out, phi_xxy_out, phi_xxz_out, phi_xyy_out, phi_xyz_out, phi_xzz_out, phi_yyy_out, phi_yyz_out, phi_yzz_out, phi_zzz_out);
-    } else if (L == 5) {
-        gg_collocation_L5_deriv3(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out, phi_xxx_out, phi_xxy_out, phi_xxz_out, phi_xyy_out, phi_xyz_out, phi_xzz_out, phi_yyy_out, phi_yyz_out, phi_yzz_out, phi_zzz_out);
-    } else if (L == 6) {
-        gg_collocation_L6_deriv3(npoints, xyz, xyz_stride, nprim, coeffs, exponents, center, order, phi_out, phi_x_out, phi_y_out, phi_z_out, phi_xx_out, phi_xy_out, phi_xz_out, phi_yy_out, phi_yz_out, phi_zz_out, phi_xxx_out, phi_xxy_out, phi_xxz_out, phi_xyy_out, phi_xyz_out, phi_xzz_out, phi_yyy_out, phi_yyz_out, phi_yzz_out, phi_zzz_out);
-    } else {
-        exit(0);
-    }
-}
\ No newline at end of file
diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_orbital.c b/third_party/gauxc/external/gau2grid/generated_source/gau2grid_orbital.c
deleted file mode 100644
index f9cc618..0000000
--- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_orbital.c
+++ /dev/null
@@ -1,1439 +0,0 @@
-/*
- * This is a Gau2Grid automatically generated C file.
- *
- * More details can found at the following repo:
- * https://github.com/dgasmith/gau2grid
- */
-
-#include
-#if defined(__clang__) && defined(_MSC_VER)
-#include
-#elif defined __clang__
-#include
-#elif defined _MSC_VER
-#include
-#else
-#include
-#endif
-
-#include "gau2grid/gau2grid.h"
-#include "gau2grid/gau2grid_utility.h"
-#include "gau2grid/gau2grid_pragma.h"
-
-
-
-void gg_orbitals_L0(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out) {
-
-    // Sizing
-    unsigned long nblocks = npoints / 32;
-    nblocks += (npoints % 32) ?
1 : 0; - const unsigned long ncart = 1; - const unsigned long nspherical = 1; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? (npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Density AM=0 Component=0 - phi_tmp[i] = S0[i]; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_to_spherical_sum_L0(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_gaussian_cart_to_spherical_sum_L0(remain, (C + i * nout), phi_tmp, 32, 
(orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_sum_L0(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_molden_cart_sum_L0(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_orbitals_L1(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 3; - const unsigned long nspherical = 3; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Density AM=1 Component=X - phi_tmp[i] = S0[i] * xc[i]; - - // Density AM=1 Component=Y - phi_tmp[32 + i] = S0[i] * yc[i]; - - // Density AM=1 Component=Z - phi_tmp[64 + i] = S0[i] * zc[i]; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_to_spherical_sum_L1(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_gaussian_cart_to_spherical_sum_L1(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_sum_L1(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_molden_cart_sum_L1(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_orbitals_L2(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 6; - const unsigned long nspherical = 5; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? (npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - - // Density AM=2 Component=XX - phi_tmp[i] = S0[i] * xc_pow2; - - // Density AM=2 Component=XY - A = xc[i] * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=2 Component=XZ - A = xc[i] * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=2 Component=YY - phi_tmp[96 + i] = S0[i] * yc_pow2; - - // Density AM=2 Component=YZ - A = yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=2 
Component=ZZ - phi_tmp[160 + i] = S0[i] * zc_pow2; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_to_spherical_sum_L2(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_gaussian_cart_to_spherical_sum_L2(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_sum_L2(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_molden_cart_sum_L2(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_orbitals_L3(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 10; - const unsigned long nspherical = 7; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 320 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - - // Density AM=3 Component=XXX - phi_tmp[i] = S0[i] * xc_pow3; - - // Density AM=3 Component=XXY - A = xc_pow2 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=3 Component=XXZ - A = xc_pow2 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=3 Component=XYY - A = xc[i] * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Density AM=3 Component=XYZ - A = xc[i] * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=3 Component=XZZ - A = xc[i] * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Density AM=3 Component=YYY - phi_tmp[192 + i] = S0[i] * yc_pow3; - - // Density AM=3 Component=YYZ - A = yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Density AM=3 Component=YZZ - A = yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Density AM=3 Component=ZZZ - phi_tmp[288 + i] = S0[i] * zc_pow3; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_to_spherical_sum_L3(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_gaussian_cart_to_spherical_sum_L3(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_sum_L3(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_molden_cart_sum_L3(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } - 
} - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_orbitals_L4(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 15; - const unsigned long nspherical = 9; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 480 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - const double xc_pow4 = xc_pow3 * xc[i]; - const double yc_pow4 = yc_pow3 * yc[i]; - const double zc_pow4 = zc_pow3 * zc[i]; - - - // Density AM=4 Component=XXXX - phi_tmp[i] = S0[i] * xc_pow4; - - // Density AM=4 Component=XXXY - A = xc_pow3 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=4 Component=XXXZ - A = xc_pow3 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=4 Component=XXYY - A = xc_pow2 * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Density AM=4 Component=XXYZ - A = xc_pow2 * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=4 Component=XXZZ - A = xc_pow2 * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Density AM=4 Component=XYYY - A = xc[i] * yc_pow3; - phi_tmp[192 + i] = S0[i] * A; - - // Density AM=4 Component=XYYZ - A = xc[i] * yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Density AM=4 Component=XYZZ - A = xc[i] * yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Density AM=4 Component=XZZZ - A = xc[i] * zc_pow3; - phi_tmp[288 + i] = S0[i] * A; - - // Density AM=4 Component=YYYY - phi_tmp[320 + i] = S0[i] * yc_pow4; - - // Density AM=4 Component=YYYZ - A = yc_pow3 * zc[i]; - phi_tmp[352 + i] = S0[i] * A; - - // Density AM=4 Component=YYZZ - A = yc_pow2 * zc_pow2; - phi_tmp[384 + i] = S0[i] * A; - - // Density AM=4 Component=YZZZ - A = yc[i] * zc_pow3; - phi_tmp[416 + i] = S0[i] * A; - - // Density AM=4 Component=ZZZZ - phi_tmp[448 + i] = S0[i] * zc_pow4; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_to_spherical_sum_L4(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - 
gg_gaussian_cart_to_spherical_sum_L4(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_sum_L4(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_molden_cart_sum_L4(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_orbitals_L5(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 21; - const unsigned long nspherical = 11; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 672 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - const double xc_pow4 = xc_pow3 * xc[i]; - const double yc_pow4 = yc_pow3 * yc[i]; - const double zc_pow4 = zc_pow3 * zc[i]; - - const double xc_pow5 = xc_pow4 * xc[i]; - const double yc_pow5 = yc_pow4 * yc[i]; - const double zc_pow5 = zc_pow4 * zc[i]; - - - // Density AM=5 Component=XXXXX - phi_tmp[i] = S0[i] * xc_pow5; - - // Density AM=5 Component=XXXXY - A = xc_pow4 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=5 Component=XXXXZ - A = xc_pow4 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=5 Component=XXXYY - A = xc_pow3 * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Density AM=5 Component=XXXYZ - A = xc_pow3 * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=5 Component=XXXZZ - A = xc_pow3 * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Density AM=5 Component=XXYYY - A = xc_pow2 * yc_pow3; - phi_tmp[192 + i] = S0[i] * A; - - // Density AM=5 Component=XXYYZ - A = xc_pow2 * yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Density AM=5 Component=XXYZZ - A = xc_pow2 * yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Density AM=5 Component=XXZZZ - A = xc_pow2 * zc_pow3; - phi_tmp[288 + i] = S0[i] * A; - - // Density AM=5 Component=XYYYY - A = xc[i] * yc_pow4; - phi_tmp[320 + i] = S0[i] * A; - - // Density AM=5 Component=XYYYZ - A = xc[i] * yc_pow3 * zc[i]; - phi_tmp[352 + i] = S0[i] * A; - - // Density AM=5 Component=XYYZZ - A = xc[i] * yc_pow2 * zc_pow2; - phi_tmp[384 + i] = S0[i] * A; - - // Density AM=5 Component=XYZZZ - A = xc[i] * yc[i] * zc_pow3; - phi_tmp[416 + i] = S0[i] * A; - - // Density AM=5 Component=XZZZZ - A = xc[i] * zc_pow4; - phi_tmp[448 + i] = S0[i] * A; - - // Density AM=5 Component=YYYYY - phi_tmp[480 + i] = S0[i] * yc_pow5; - - // Density AM=5 Component=YYYYZ - A = yc_pow4 * zc[i]; - phi_tmp[512 + i] = S0[i] * A; - - // Density AM=5 Component=YYYZZ - A = yc_pow3 * zc_pow2; - 
phi_tmp[544 + i] = S0[i] * A; - - // Density AM=5 Component=YYZZZ - A = yc_pow2 * zc_pow3; - phi_tmp[576 + i] = S0[i] * A; - - // Density AM=5 Component=YZZZZ - A = yc[i] * zc_pow4; - phi_tmp[608 + i] = S0[i] * A; - - // Density AM=5 Component=ZZZZZ - phi_tmp[640 + i] = S0[i] * zc_pow5; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_to_spherical_sum_L5(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_gaussian_cart_to_spherical_sum_L5(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_sum_L5(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_molden_cart_sum_L5(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_orbitals_L6(const double* PRAGMA_RESTRICT C, const unsigned long norbitals, const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT orbital_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 28; - const unsigned long nspherical = 13; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 896 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - const double xc_pow4 = xc_pow3 * xc[i]; - const double yc_pow4 = yc_pow3 * yc[i]; - const double zc_pow4 = zc_pow3 * zc[i]; - - const double xc_pow5 = xc_pow4 * xc[i]; - const double yc_pow5 = yc_pow4 * yc[i]; - const double zc_pow5 = zc_pow4 * zc[i]; - - const double xc_pow6 = xc_pow5 * xc[i]; - const double yc_pow6 = yc_pow5 * yc[i]; - const double zc_pow6 = zc_pow5 * zc[i]; - - - // Density AM=6 Component=XXXXXX - phi_tmp[i] = S0[i] * xc_pow6; - - // Density AM=6 Component=XXXXXY - A = xc_pow5 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=6 Component=XXXXXZ - A = xc_pow5 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=6 Component=XXXXYY - A = xc_pow4 * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Density AM=6 Component=XXXXYZ - A = xc_pow4 * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=6 Component=XXXXZZ - A = xc_pow4 * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Density AM=6 Component=XXXYYY - A = xc_pow3 * yc_pow3; - phi_tmp[192 + i] = S0[i] * A; - - // Density AM=6 Component=XXXYYZ - A = xc_pow3 * yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Density AM=6 Component=XXXYZZ - A = xc_pow3 * yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Density AM=6 Component=XXXZZZ - A = xc_pow3 * zc_pow3; - phi_tmp[288 + i] = S0[i] * A; - - // Density AM=6 Component=XXYYYY - A = xc_pow2 * yc_pow4; - phi_tmp[320 + i] = S0[i] * A; - - // Density AM=6 Component=XXYYYZ - A = xc_pow2 * yc_pow3 * zc[i]; - phi_tmp[352 + i] = S0[i] * A; - - // Density AM=6 Component=XXYYZZ - A = xc_pow2 * yc_pow2 * zc_pow2; - phi_tmp[384 + i] = S0[i] * A; - - // Density AM=6 Component=XXYZZZ - A = xc_pow2 * yc[i] * zc_pow3; - phi_tmp[416 + i] = S0[i] * A; - - // Density AM=6 Component=XXZZZZ - A = xc_pow2 * zc_pow4; - phi_tmp[448 + i] = S0[i] * A; - - // Density AM=6 Component=XYYYYY - A = xc[i] * yc_pow5; - phi_tmp[480 + 
i] = S0[i] * A; - - // Density AM=6 Component=XYYYYZ - A = xc[i] * yc_pow4 * zc[i]; - phi_tmp[512 + i] = S0[i] * A; - - // Density AM=6 Component=XYYYZZ - A = xc[i] * yc_pow3 * zc_pow2; - phi_tmp[544 + i] = S0[i] * A; - - // Density AM=6 Component=XYYZZZ - A = xc[i] * yc_pow2 * zc_pow3; - phi_tmp[576 + i] = S0[i] * A; - - // Density AM=6 Component=XYZZZZ - A = xc[i] * yc[i] * zc_pow4; - phi_tmp[608 + i] = S0[i] * A; - - // Density AM=6 Component=XZZZZZ - A = xc[i] * zc_pow5; - phi_tmp[640 + i] = S0[i] * A; - - // Density AM=6 Component=YYYYYY - phi_tmp[672 + i] = S0[i] * yc_pow6; - - // Density AM=6 Component=YYYYYZ - A = yc_pow5 * zc[i]; - phi_tmp[704 + i] = S0[i] * A; - - // Density AM=6 Component=YYYYZZ - A = yc_pow4 * zc_pow2; - phi_tmp[736 + i] = S0[i] * A; - - // Density AM=6 Component=YYYZZZ - A = yc_pow3 * zc_pow3; - phi_tmp[768 + i] = S0[i] * A; - - // Density AM=6 Component=YYZZZZ - A = yc_pow2 * zc_pow4; - phi_tmp[800 + i] = S0[i] * A; - - // Density AM=6 Component=YZZZZZ - A = yc[i] * zc_pow5; - phi_tmp[832 + i] = S0[i] * A; - - // Density AM=6 Component=ZZZZZZ - phi_tmp[864 + i] = S0[i] * zc_pow6; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_to_spherical_sum_L6(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_gaussian_cart_to_spherical_sum_L6(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_cca_cart_sum_L6(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - for (unsigned long i = 0; i < norbitals; i++) { - gg_molden_cart_sum_L6(remain, (C + i * nout), phi_tmp, 32, (orbital_out + npoints * i + start), npoints); - } - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_phi.c b/third_party/gauxc/external/gau2grid/generated_source/gau2grid_phi.c deleted file mode 100644 index 77ece95..0000000 --- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_phi.c +++ /dev/null @@ -1,1371 +0,0 @@ -/* - * This is a Gau2Grid automatically generated C file. - * - * More details can found at the following repo: - * https://github.com/dgasmith/gau2grid - */ - -#include -#if defined(__clang__) && defined(_MSC_VER) -#include -#elif defined __clang__ -#include -#elif defined _MSC_VER -#include -#else -#include -#endif - -#include "gau2grid/gau2grid.h" -#include "gau2grid/gau2grid_utility.h" -#include "gau2grid/gau2grid_pragma.h" - - - -void gg_collocation_L0(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 1; - const unsigned long nspherical = 1; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 32 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? (npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - phi_out[start + i] = S0[i]; - } - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L1(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 3; - const unsigned long nspherical = 3; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 96 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? (npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Density AM=1 Component=X - phi_tmp[i] = S0[i] * xc[i]; - - // Density AM=1 Component=Y - phi_tmp[32 + i] = S0[i] * yc[i]; - - // Density AM=1 Component=Z - phi_tmp[64 + i] = S0[i] * zc[i]; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L1(remain, phi_tmp, 32, (phi_out + start), 
npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L1(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L2(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 6; - const unsigned long nspherical = 5; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - - // Density AM=2 Component=XX - phi_tmp[i] = S0[i] * xc_pow2; - - // Density AM=2 Component=XY - A = xc[i] * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=2 Component=XZ - A = xc[i] * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=2 Component=YY - phi_tmp[96 + i] = S0[i] * yc_pow2; - - // Density AM=2 Component=YZ - A = yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=2 Component=ZZ - phi_tmp[160 + i] = S0[i] * zc_pow2; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L2(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L3(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 
1 : 0; - const unsigned long ncart = 10; - const unsigned long nspherical = 7; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 320 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? (npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - - // Density AM=3 Component=XXX - phi_tmp[i] = S0[i] * xc_pow3; - - // Density AM=3 Component=XXY - A = xc_pow2 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=3 Component=XXZ - A = xc_pow2 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=3 Component=XYY - A = xc[i] * 
yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Density AM=3 Component=XYZ - A = xc[i] * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=3 Component=XZZ - A = xc[i] * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Density AM=3 Component=YYY - phi_tmp[192 + i] = S0[i] * yc_pow3; - - // Density AM=3 Component=YYZ - A = yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Density AM=3 Component=YZZ - A = yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Density AM=3 Component=ZZZ - phi_tmp[288 + i] = S0[i] * zc_pow3; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L3(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L4(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 15; - const unsigned long nspherical = 9; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 480 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - const double xc_pow4 = xc_pow3 * xc[i]; - const double yc_pow4 = yc_pow3 * yc[i]; - const double zc_pow4 = zc_pow3 * zc[i]; - - - // Density AM=4 Component=XXXX - phi_tmp[i] = S0[i] * xc_pow4; - - // Density AM=4 Component=XXXY - A = xc_pow3 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=4 Component=XXXZ - A = xc_pow3 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=4 Component=XXYY - A = xc_pow2 * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Density AM=4 Component=XXYZ - A = xc_pow2 * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=4 Component=XXZZ - A = xc_pow2 * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Density AM=4 Component=XYYY - A = xc[i] * yc_pow3; - phi_tmp[192 + i] = S0[i] * A; - - // Density AM=4 Component=XYYZ - A = xc[i] * yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Density AM=4 Component=XYZZ - A = xc[i] * yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Density AM=4 Component=XZZZ - A = xc[i] * zc_pow3; - phi_tmp[288 + i] = S0[i] * A; - - // Density AM=4 Component=YYYY - phi_tmp[320 + i] = S0[i] * yc_pow4; - - // Density AM=4 Component=YYYZ - A = yc_pow3 * zc[i]; - phi_tmp[352 + i] = S0[i] * A; - - // Density AM=4 Component=YYZZ - A = yc_pow2 * zc_pow2; - phi_tmp[384 + i] = S0[i] * A; - - // Density AM=4 Component=YZZZ - A = yc[i] * zc_pow3; - phi_tmp[416 + i] = S0[i] * A; - - // Density AM=4 Component=ZZZZ - phi_tmp[448 + i] = S0[i] * zc_pow4; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data 
to outer temps - gg_cca_cart_copy_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L4(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L5(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 21; - const unsigned long nspherical = 11; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 672 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - const double xc_pow4 = xc_pow3 * xc[i]; - const double yc_pow4 = yc_pow3 * yc[i]; - const double zc_pow4 = zc_pow3 * zc[i]; - - const double xc_pow5 = xc_pow4 * xc[i]; - const double yc_pow5 = yc_pow4 * yc[i]; - const double zc_pow5 = zc_pow4 * zc[i]; - - - // Density AM=5 Component=XXXXX - phi_tmp[i] = S0[i] * xc_pow5; - - // Density AM=5 Component=XXXXY - A = xc_pow4 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=5 Component=XXXXZ - A = xc_pow4 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=5 Component=XXXYY - A = xc_pow3 * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Density AM=5 Component=XXXYZ - A = xc_pow3 * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=5 Component=XXXZZ - A = xc_pow3 * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Density AM=5 Component=XXYYY - A = xc_pow2 * yc_pow3; - phi_tmp[192 + i] = S0[i] * A; - - // Density AM=5 Component=XXYYZ - A = xc_pow2 * yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Density AM=5 Component=XXYZZ - A = xc_pow2 * yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Density AM=5 Component=XXZZZ - A = xc_pow2 * zc_pow3; - phi_tmp[288 + i] = S0[i] * A; - - // Density AM=5 Component=XYYYY - A = xc[i] * yc_pow4; - phi_tmp[320 + i] = S0[i] * A; - - // Density AM=5 Component=XYYYZ - A = xc[i] * yc_pow3 * zc[i]; - phi_tmp[352 + i] = S0[i] * A; - - // Density AM=5 Component=XYYZZ - A = xc[i] * yc_pow2 * zc_pow2; - phi_tmp[384 + i] = S0[i] * A; - - // Density AM=5 Component=XYZZZ - A = xc[i] * yc[i] * zc_pow3; - phi_tmp[416 + i] = S0[i] * A; - - // Density AM=5 Component=XZZZZ - A = xc[i] * zc_pow4; - phi_tmp[448 + i] = S0[i] * A; - - // Density AM=5 Component=YYYYY - phi_tmp[480 + i] = S0[i] * yc_pow5; - - // Density AM=5 Component=YYYYZ - A = yc_pow4 * zc[i]; - phi_tmp[512 + i] = S0[i] * A; - - // Density AM=5 Component=YYYZZ - A = yc_pow3 * zc_pow2; - 
phi_tmp[544 + i] = S0[i] * A; - - // Density AM=5 Component=YYZZZ - A = yc_pow2 * zc_pow3; - phi_tmp[576 + i] = S0[i] * A; - - // Density AM=5 Component=YZZZZ - A = yc[i] * zc_pow4; - phi_tmp[608 + i] = S0[i] * A; - - // Density AM=5 Component=ZZZZZ - phi_tmp[640 + i] = S0[i] * zc_pow5; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L5(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} - -void gg_collocation_L6(const unsigned long npoints, const double* PRAGMA_RESTRICT xyz, const unsigned long xyz_stride, const int nprim, const double* PRAGMA_RESTRICT coeffs, const double* PRAGMA_RESTRICT exponents, const double* PRAGMA_RESTRICT center, const int order, double* PRAGMA_RESTRICT phi_out) { - - // Sizing - unsigned long nblocks = npoints / 32; - nblocks += (npoints % 32) ? 1 : 0; - const unsigned long ncart = 28; - const unsigned long nspherical = 13; - unsigned long nout; - - if ((order == GG_SPHERICAL_CCA) || (order == GG_SPHERICAL_GAUSSIAN)) { - nout = nspherical; - } else { - nout = ncart; - } - - // Allocate S temporaries, single block to stay on cache - double* PRAGMA_RESTRICT cache_data = (double*)ALIGNED_MALLOC(64, 192 * sizeof(double)); - double* PRAGMA_RESTRICT xc = cache_data + 0; - ASSUME_ALIGNED(xc, 64); - double* PRAGMA_RESTRICT yc = cache_data + 32; - ASSUME_ALIGNED(yc, 64); - double* PRAGMA_RESTRICT zc = cache_data + 64; - ASSUME_ALIGNED(zc, 64); - double* PRAGMA_RESTRICT R2 = cache_data + 96; - ASSUME_ALIGNED(R2, 64); - double* PRAGMA_RESTRICT S0 = cache_data + 128; - ASSUME_ALIGNED(S0, 64); - double* PRAGMA_RESTRICT tmp1 = cache_data + 160; - ASSUME_ALIGNED(tmp1, 64); - - // Allocate exponential temporaries - double* PRAGMA_RESTRICT expn1 = (double*)ALIGNED_MALLOC(64, nprim * sizeof(double)); - - // Allocate output temporaries - double* PRAGMA_RESTRICT phi_tmp = (double*)ALIGNED_MALLOC(64, 896 * sizeof(double)); - ASSUME_ALIGNED(phi_tmp, 64); - - // Declare doubles - const double center_x = center[0]; - const double center_y = center[1]; - const double center_z = center[2]; - double A; - - // Build negative exponents - for (unsigned long i = 0; i < nprim; i++) { - expn1[i] = -1.0 * exponents[i]; - } - - // Start outer block loop - for (unsigned long block = 0; block < nblocks; block++) { - - - // Copy data into inner temps - const unsigned long start = block * 32; - const unsigned long remain = ((start + 32) > npoints) ? 
(npoints - start) : 32; - - // Handle non-AM dependant temps - if (xyz_stride == 1) { - const double* PRAGMA_RESTRICT x = xyz + start; - const double* PRAGMA_RESTRICT y = xyz + npoints + start; - const double* PRAGMA_RESTRICT z = xyz + 2 * npoints + start; - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = x[i] - center_x; - yc[i] = y[i] - center_y; - zc[i] = z[i] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } else { - unsigned int start_shift = start * xyz_stride; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - xc[i] = xyz[start_shift + i * xyz_stride] - center_x; - yc[i] = xyz[start_shift + i * xyz_stride + 1] - center_y; - zc[i] = xyz[start_shift + i * xyz_stride + 2] - center_z; - - // Distance - R2[i] = xc[i] * xc[i]; - R2[i] += yc[i] * yc[i]; - R2[i] += zc[i] * zc[i]; - - // Zero out S tmps - S0[i] = 0.0; - } - } - - // Start exponential block loop - for (unsigned long n = 0; n < nprim; n++) { - const double coef = coeffs[n]; - const double alpha_n1 = expn1[n]; - - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - const double width = alpha_n1 * R2[i]; - const double T1 = coef * exp(width); - S0[i] += T1; - } - - } - - // Combine blocks - PRAGMA_VECTORIZE - for (unsigned long i = 0; i < remain; i++) { - - // Cartesian derivs - const double xc_pow2 = xc[i] * xc[i]; - const double yc_pow2 = yc[i] * yc[i]; - const double zc_pow2 = zc[i] * zc[i]; - - const double xc_pow3 = xc_pow2 * xc[i]; - const double yc_pow3 = yc_pow2 * yc[i]; - const double zc_pow3 = zc_pow2 * zc[i]; - - const double xc_pow4 = xc_pow3 * xc[i]; - const double yc_pow4 = yc_pow3 * yc[i]; - const double zc_pow4 = zc_pow3 * zc[i]; - - const double xc_pow5 = xc_pow4 * xc[i]; - const double yc_pow5 = yc_pow4 * yc[i]; - const double zc_pow5 = zc_pow4 * zc[i]; - - const double xc_pow6 = xc_pow5 * xc[i]; - const double yc_pow6 = yc_pow5 * yc[i]; - const double zc_pow6 = zc_pow5 * zc[i]; - - - // Density AM=6 Component=XXXXXX - phi_tmp[i] = S0[i] * xc_pow6; - - // Density AM=6 Component=XXXXXY - A = xc_pow5 * yc[i]; - phi_tmp[32 + i] = S0[i] * A; - - // Density AM=6 Component=XXXXXZ - A = xc_pow5 * zc[i]; - phi_tmp[64 + i] = S0[i] * A; - - // Density AM=6 Component=XXXXYY - A = xc_pow4 * yc_pow2; - phi_tmp[96 + i] = S0[i] * A; - - // Density AM=6 Component=XXXXYZ - A = xc_pow4 * yc[i] * zc[i]; - phi_tmp[128 + i] = S0[i] * A; - - // Density AM=6 Component=XXXXZZ - A = xc_pow4 * zc_pow2; - phi_tmp[160 + i] = S0[i] * A; - - // Density AM=6 Component=XXXYYY - A = xc_pow3 * yc_pow3; - phi_tmp[192 + i] = S0[i] * A; - - // Density AM=6 Component=XXXYYZ - A = xc_pow3 * yc_pow2 * zc[i]; - phi_tmp[224 + i] = S0[i] * A; - - // Density AM=6 Component=XXXYZZ - A = xc_pow3 * yc[i] * zc_pow2; - phi_tmp[256 + i] = S0[i] * A; - - // Density AM=6 Component=XXXZZZ - A = xc_pow3 * zc_pow3; - phi_tmp[288 + i] = S0[i] * A; - - // Density AM=6 Component=XXYYYY - A = xc_pow2 * yc_pow4; - phi_tmp[320 + i] = S0[i] * A; - - // Density AM=6 Component=XXYYYZ - A = xc_pow2 * yc_pow3 * zc[i]; - phi_tmp[352 + i] = S0[i] * A; - - // Density AM=6 Component=XXYYZZ - A = xc_pow2 * yc_pow2 * zc_pow2; - phi_tmp[384 + i] = S0[i] * A; - - // Density AM=6 Component=XXYZZZ - A = xc_pow2 * yc[i] * zc_pow3; - phi_tmp[416 + i] = S0[i] * A; - - // Density AM=6 Component=XXZZZZ - A = xc_pow2 * zc_pow4; - phi_tmp[448 + i] = S0[i] * A; - - // Density AM=6 Component=XYYYYY - A = xc[i] * yc_pow5; - phi_tmp[480 + 
i] = S0[i] * A; - - // Density AM=6 Component=XYYYYZ - A = xc[i] * yc_pow4 * zc[i]; - phi_tmp[512 + i] = S0[i] * A; - - // Density AM=6 Component=XYYYZZ - A = xc[i] * yc_pow3 * zc_pow2; - phi_tmp[544 + i] = S0[i] * A; - - // Density AM=6 Component=XYYZZZ - A = xc[i] * yc_pow2 * zc_pow3; - phi_tmp[576 + i] = S0[i] * A; - - // Density AM=6 Component=XYZZZZ - A = xc[i] * yc[i] * zc_pow4; - phi_tmp[608 + i] = S0[i] * A; - - // Density AM=6 Component=XZZZZZ - A = xc[i] * zc_pow5; - phi_tmp[640 + i] = S0[i] * A; - - // Density AM=6 Component=YYYYYY - phi_tmp[672 + i] = S0[i] * yc_pow6; - - // Density AM=6 Component=YYYYYZ - A = yc_pow5 * zc[i]; - phi_tmp[704 + i] = S0[i] * A; - - // Density AM=6 Component=YYYYZZ - A = yc_pow4 * zc_pow2; - phi_tmp[736 + i] = S0[i] * A; - - // Density AM=6 Component=YYYZZZ - A = yc_pow3 * zc_pow3; - phi_tmp[768 + i] = S0[i] * A; - - // Density AM=6 Component=YYZZZZ - A = yc_pow2 * zc_pow4; - phi_tmp[800 + i] = S0[i] * A; - - // Density AM=6 Component=YZZZZZ - A = yc[i] * zc_pow5; - phi_tmp[832 + i] = S0[i] * A; - - // Density AM=6 Component=ZZZZZZ - phi_tmp[864 + i] = S0[i] * zc_pow6; - - } - - // Copy data back into outer temps - if (order == GG_SPHERICAL_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_SPHERICAL_GAUSSIAN) { - // Phi, transform data to outer temps - gg_gaussian_cart_to_spherical_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_CCA) { - // Phi, transform data to outer temps - gg_cca_cart_copy_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - } else if (order == GG_CARTESIAN_MOLDEN) { - // Phi, transform data to outer temps - gg_molden_cart_copy_L6(remain, phi_tmp, 32, (phi_out + start), npoints); - } - - } - - // Free S temporaries - ALIGNED_FREE(cache_data); - ALIGNED_FREE(expn1); - - // Free inner temporaries - ALIGNED_FREE(phi_tmp); - -} diff --git a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_transform.c b/third_party/gauxc/external/gau2grid/generated_source/gau2grid_transform.c deleted file mode 100644 index 2e1c6f5..0000000 --- a/third_party/gauxc/external/gau2grid/generated_source/gau2grid_transform.c +++ /dev/null @@ -1,3716 +0,0 @@ -/* - * This is a Gau2Grid automatically generated C file. 
- * - * More details can found at the following repo: - * https://github.com/dgasmith/gau2grid - */ - -#include -#if defined(__clang__) && defined(_MSC_VER) -#include -#elif defined __clang__ -#include -#elif defined _MSC_VER -#include -#else -#include -#endif - -#include "gau2grid/gau2grid.h" -#include "gau2grid/gau2grid_utility.h" -#include "gau2grid/gau2grid_pragma.h" - -void gg_cca_cart_to_spherical_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_00 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = cart[i]; - - } - -} -void gg_cca_cart_to_spherical_sum_L0(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_00 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = cart[i]; - output[i] += tmp * vector[0]; - - } - -} -void gg_cca_cart_to_spherical_L1(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_10 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = cart[ncart + i]; - - } - - // R_11c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = cart[2 * ncart + i]; - - } - // R_11s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = cart[i]; - - } - -} -void gg_cca_cart_to_spherical_sum_L1(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_10 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = cart[ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_11c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = cart[2 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_11s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = cart[i]; - output[i] += tmp * vector[2]; - - } - -} -void gg_cca_cart_to_spherical_L2(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_20 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = 1.7320508075688772 * cart[ncart + i]; - - } - - // R_21c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = 1.7320508075688772 * cart[4 * ncart + i]; - - } - // R_21s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = -0.5000000000000000 * cart[i]; - spherical[2 * nspherical + i] += -0.5000000000000000 * cart[3 * ncart + i]; - spherical[2 * nspherical + i] += cart[5 * ncart + i]; - - } - - // R_22c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * nspherical + i] = 1.7320508075688772 * cart[2 * ncart + i]; - - } - // R_22s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = 0.8660254037844386 * cart[i]; - spherical[4 * nspherical + i] += -0.8660254037844386 * cart[3 * ncart + i]; - - } - -} -void gg_cca_cart_to_spherical_sum_L2(const unsigned long size, const double* vector, const double* 
PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_20 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.7320508075688772 * cart[ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_21c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.7320508075688772 * cart[4 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_21s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.5000000000000000 * cart[i]; - tmp += -0.5000000000000000 * cart[3 * ncart + i]; - tmp += cart[5 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_22c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.7320508075688772 * cart[2 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_22s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.8660254037844386 * cart[i]; - tmp += -0.8660254037844386 * cart[3 * ncart + i]; - output[i] += tmp * vector[4]; - - } - -} -void gg_cca_cart_to_spherical_L3(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_30 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = 2.3717082451262845 * cart[ncart + i]; - spherical[i] += -0.7905694150420949 * cart[6 * ncart + i]; - - } - - // R_31c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = 3.8729833462074170 * cart[4 * ncart + i]; - - } - // R_31s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = -0.6123724356957945 * cart[ncart + i]; - spherical[2 * nspherical + i] += -0.6123724356957945 * cart[6 * ncart + i]; - spherical[2 * nspherical + i] += 2.4494897427831779 * cart[8 * ncart + i]; - - } - - // R_32c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * nspherical + i] = -1.5000000000000000 * cart[2 * ncart + i]; - spherical[3 * nspherical + i] += -1.5000000000000000 * cart[7 * ncart + i]; - spherical[3 * nspherical + i] += cart[9 * ncart + i]; - - } - // R_32s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = -0.6123724356957945 * cart[i]; - spherical[4 * nspherical + i] += -0.6123724356957945 * cart[3 * ncart + i]; - spherical[4 * nspherical + i] += 2.4494897427831779 * cart[5 * ncart + i]; - - } - - // R_33c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[5 * nspherical + i] = 1.9364916731037085 * cart[2 * ncart + i]; - spherical[5 * nspherical + i] += -1.9364916731037085 * cart[7 * ncart + i]; - - } - // R_33s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[6 * nspherical + i] = 0.7905694150420949 * cart[i]; - spherical[6 * nspherical + i] += -2.3717082451262845 * cart[3 * ncart + i]; - - } - -} -void gg_cca_cart_to_spherical_sum_L3(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_30 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.3717082451262845 * cart[ncart + i]; - tmp += -0.7905694150420949 * cart[6 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_31c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 3.8729833462074170 * cart[4 * ncart + i]; - output[i] += tmp * vector[1]; - - } 
- // R_31s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.6123724356957945 * cart[ncart + i]; - tmp += -0.6123724356957945 * cart[6 * ncart + i]; - tmp += 2.4494897427831779 * cart[8 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_32c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -1.5000000000000000 * cart[2 * ncart + i]; - tmp += -1.5000000000000000 * cart[7 * ncart + i]; - tmp += cart[9 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_32s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.6123724356957945 * cart[i]; - tmp += -0.6123724356957945 * cart[3 * ncart + i]; - tmp += 2.4494897427831779 * cart[5 * ncart + i]; - output[i] += tmp * vector[4]; - - } - - // R_33c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.9364916731037085 * cart[2 * ncart + i]; - tmp += -1.9364916731037085 * cart[7 * ncart + i]; - output[i] += tmp * vector[5]; - - } - // R_33s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.7905694150420949 * cart[i]; - tmp += -2.3717082451262845 * cart[3 * ncart + i]; - output[i] += tmp * vector[6]; - - } - -} -void gg_cca_cart_to_spherical_L4(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_40 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = 2.9580398915498081 * cart[ncart + i]; - spherical[i] += -2.9580398915498081 * cart[6 * ncart + i]; - - } - - // R_41c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = 6.2749501990055663 * cart[4 * ncart + i]; - spherical[nspherical + i] += -2.0916500663351889 * cart[11 * ncart + i]; - - } - // R_41s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = -1.1180339887498949 * cart[ncart + i]; - spherical[2 * nspherical + i] += -1.1180339887498949 * cart[6 * ncart + i]; - spherical[2 * nspherical + i] += 6.7082039324993694 * cart[8 * ncart + i]; - - } - - // R_42c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * nspherical + i] = -2.3717082451262845 * cart[4 * ncart + i]; - spherical[3 * nspherical + i] += -2.3717082451262845 * cart[11 * ncart + i]; - spherical[3 * nspherical + i] += 3.1622776601683795 * cart[13 * ncart + i]; - - } - // R_42s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = 0.3750000000000000 * cart[i]; - spherical[4 * nspherical + i] += 0.7500000000000000 * cart[3 * ncart + i]; - spherical[4 * nspherical + i] += 0.3750000000000000 * cart[10 * ncart + i]; - spherical[4 * nspherical + i] += -3.0000000000000000 * cart[5 * ncart + i]; - spherical[4 * nspherical + i] += -3.0000000000000000 * cart[12 * ncart + i]; - spherical[4 * nspherical + i] += cart[14 * ncart + i]; - - } - - // R_43c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[5 * nspherical + i] = -2.3717082451262845 * cart[2 * ncart + i]; - spherical[5 * nspherical + i] += -2.3717082451262845 * cart[7 * ncart + i]; - spherical[5 * nspherical + i] += 3.1622776601683795 * cart[9 * ncart + i]; - - } - // R_43s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[6 * nspherical + i] = -0.5590169943749475 * cart[i]; - spherical[6 * nspherical + i] += 0.5590169943749475 * cart[10 * ncart + i]; - spherical[6 * nspherical + i] += 3.3541019662496847 * cart[5 * ncart + i]; - spherical[6 * nspherical + i] += -3.3541019662496847 * cart[12 * ncart + 
i]; - - } - - // R_44c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[7 * nspherical + i] = 2.0916500663351889 * cart[2 * ncart + i]; - spherical[7 * nspherical + i] += -6.2749501990055663 * cart[7 * ncart + i]; - - } - // R_44s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[8 * nspherical + i] = 0.7395099728874520 * cart[i]; - spherical[8 * nspherical + i] += -4.4370598373247123 * cart[3 * ncart + i]; - spherical[8 * nspherical + i] += 0.7395099728874520 * cart[10 * ncart + i]; - - } - -} -void gg_cca_cart_to_spherical_sum_L4(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_40 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.9580398915498081 * cart[ncart + i]; - tmp += -2.9580398915498081 * cart[6 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_41c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 6.2749501990055663 * cart[4 * ncart + i]; - tmp += -2.0916500663351889 * cart[11 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_41s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -1.1180339887498949 * cart[ncart + i]; - tmp += -1.1180339887498949 * cart[6 * ncart + i]; - tmp += 6.7082039324993694 * cart[8 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_42c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -2.3717082451262845 * cart[4 * ncart + i]; - tmp += -2.3717082451262845 * cart[11 * ncart + i]; - tmp += 3.1622776601683795 * cart[13 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_42s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.3750000000000000 * cart[i]; - tmp += 0.7500000000000000 * cart[3 * ncart + i]; - tmp += 0.3750000000000000 * cart[10 * ncart + i]; - tmp += -3.0000000000000000 * cart[5 * ncart + i]; - tmp += -3.0000000000000000 * cart[12 * ncart + i]; - tmp += cart[14 * ncart + i]; - output[i] += tmp * vector[4]; - - } - - // R_43c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -2.3717082451262845 * cart[2 * ncart + i]; - tmp += -2.3717082451262845 * cart[7 * ncart + i]; - tmp += 3.1622776601683795 * cart[9 * ncart + i]; - output[i] += tmp * vector[5]; - - } - // R_43s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.5590169943749475 * cart[i]; - tmp += 0.5590169943749475 * cart[10 * ncart + i]; - tmp += 3.3541019662496847 * cart[5 * ncart + i]; - tmp += -3.3541019662496847 * cart[12 * ncart + i]; - output[i] += tmp * vector[6]; - - } - - // R_44c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.0916500663351889 * cart[2 * ncart + i]; - tmp += -6.2749501990055663 * cart[7 * ncart + i]; - output[i] += tmp * vector[7]; - - } - // R_44s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.7395099728874520 * cart[i]; - tmp += -4.4370598373247123 * cart[3 * ncart + i]; - tmp += 0.7395099728874520 * cart[10 * ncart + i]; - output[i] += tmp * vector[8]; - - } - -} -void gg_cca_cart_to_spherical_L5(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_50 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = 3.5078038001005702 * cart[ncart + i]; - spherical[i] += -7.0156076002011405 * cart[6 * ncart + i]; - spherical[i] += 
0.7015607600201140 * cart[15 * ncart + i]; - - } - - // R_51c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = 8.8741196746494246 * cart[4 * ncart + i]; - spherical[nspherical + i] += -8.8741196746494246 * cart[11 * ncart + i]; - - } - // R_51s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = -1.5687375497513916 * cart[ncart + i]; - spherical[2 * nspherical + i] += -1.0458250331675945 * cart[6 * ncart + i]; - spherical[2 * nspherical + i] += 0.5229125165837972 * cart[15 * ncart + i]; - spherical[2 * nspherical + i] += 12.5499003980111326 * cart[8 * ncart + i]; - spherical[2 * nspherical + i] += -4.1833001326703778 * cart[17 * ncart + i]; - - } - - // R_52c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * nspherical + i] = -5.1234753829797990 * cart[4 * ncart + i]; - spherical[3 * nspherical + i] += -5.1234753829797990 * cart[11 * ncart + i]; - spherical[3 * nspherical + i] += 10.2469507659595980 * cart[13 * ncart + i]; - - } - // R_52s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = 0.4841229182759271 * cart[ncart + i]; - spherical[4 * nspherical + i] += 0.9682458365518543 * cart[6 * ncart + i]; - spherical[4 * nspherical + i] += 0.4841229182759271 * cart[15 * ncart + i]; - spherical[4 * nspherical + i] += -5.8094750193111251 * cart[8 * ncart + i]; - spherical[4 * nspherical + i] += -5.8094750193111251 * cart[17 * ncart + i]; - spherical[4 * nspherical + i] += 3.8729833462074170 * cart[19 * ncart + i]; - - } - - // R_53c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[5 * nspherical + i] = 1.8750000000000000 * cart[2 * ncart + i]; - spherical[5 * nspherical + i] += 3.7500000000000000 * cart[7 * ncart + i]; - spherical[5 * nspherical + i] += 1.8750000000000000 * cart[16 * ncart + i]; - spherical[5 * nspherical + i] += -5.0000000000000000 * cart[9 * ncart + i]; - spherical[5 * nspherical + i] += -5.0000000000000000 * cart[18 * ncart + i]; - spherical[5 * nspherical + i] += cart[20 * ncart + i]; - - } - // R_53s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[6 * nspherical + i] = 0.4841229182759271 * cart[i]; - spherical[6 * nspherical + i] += 0.9682458365518543 * cart[3 * ncart + i]; - spherical[6 * nspherical + i] += 0.4841229182759271 * cart[10 * ncart + i]; - spherical[6 * nspherical + i] += -5.8094750193111251 * cart[5 * ncart + i]; - spherical[6 * nspherical + i] += -5.8094750193111251 * cart[12 * ncart + i]; - spherical[6 * nspherical + i] += 3.8729833462074170 * cart[14 * ncart + i]; - - } - - // R_54c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[7 * nspherical + i] = -2.5617376914898995 * cart[2 * ncart + i]; - spherical[7 * nspherical + i] += 2.5617376914898995 * cart[16 * ncart + i]; - spherical[7 * nspherical + i] += 5.1234753829797990 * cart[9 * ncart + i]; - spherical[7 * nspherical + i] += -5.1234753829797990 * cart[18 * ncart + i]; - - } - // R_54s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[8 * nspherical + i] = -0.5229125165837972 * cart[i]; - spherical[8 * nspherical + i] += 1.0458250331675945 * cart[3 * ncart + i]; - spherical[8 * nspherical + i] += 1.5687375497513916 * cart[10 * ncart + i]; - spherical[8 * nspherical + i] += 4.1833001326703778 * cart[5 * ncart + i]; - spherical[8 * nspherical + i] += -12.5499003980111326 * cart[12 * ncart + i]; - - } - - // R_55c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[9 * nspherical + i] = 
2.2185299186623562 * cart[2 * ncart + i]; - spherical[9 * nspherical + i] += -13.3111795119741370 * cart[7 * ncart + i]; - spherical[9 * nspherical + i] += 2.2185299186623562 * cart[16 * ncart + i]; - - } - // R_55s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[10 * nspherical + i] = 0.7015607600201140 * cart[i]; - spherical[10 * nspherical + i] += -7.0156076002011405 * cart[3 * ncart + i]; - spherical[10 * nspherical + i] += 3.5078038001005702 * cart[10 * ncart + i]; - - } - -} -void gg_cca_cart_to_spherical_sum_L5(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_50 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 3.5078038001005702 * cart[ncart + i]; - tmp += -7.0156076002011405 * cart[6 * ncart + i]; - tmp += 0.7015607600201140 * cart[15 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_51c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 8.8741196746494246 * cart[4 * ncart + i]; - tmp += -8.8741196746494246 * cart[11 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_51s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -1.5687375497513916 * cart[ncart + i]; - tmp += -1.0458250331675945 * cart[6 * ncart + i]; - tmp += 0.5229125165837972 * cart[15 * ncart + i]; - tmp += 12.5499003980111326 * cart[8 * ncart + i]; - tmp += -4.1833001326703778 * cart[17 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_52c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -5.1234753829797990 * cart[4 * ncart + i]; - tmp += -5.1234753829797990 * cart[11 * ncart + i]; - tmp += 10.2469507659595980 * cart[13 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_52s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.4841229182759271 * cart[ncart + i]; - tmp += 0.9682458365518543 * cart[6 * ncart + i]; - tmp += 0.4841229182759271 * cart[15 * ncart + i]; - tmp += -5.8094750193111251 * cart[8 * ncart + i]; - tmp += -5.8094750193111251 * cart[17 * ncart + i]; - tmp += 3.8729833462074170 * cart[19 * ncart + i]; - output[i] += tmp * vector[4]; - - } - - // R_53c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.8750000000000000 * cart[2 * ncart + i]; - tmp += 3.7500000000000000 * cart[7 * ncart + i]; - tmp += 1.8750000000000000 * cart[16 * ncart + i]; - tmp += -5.0000000000000000 * cart[9 * ncart + i]; - tmp += -5.0000000000000000 * cart[18 * ncart + i]; - tmp += cart[20 * ncart + i]; - output[i] += tmp * vector[5]; - - } - // R_53s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.4841229182759271 * cart[i]; - tmp += 0.9682458365518543 * cart[3 * ncart + i]; - tmp += 0.4841229182759271 * cart[10 * ncart + i]; - tmp += -5.8094750193111251 * cart[5 * ncart + i]; - tmp += -5.8094750193111251 * cart[12 * ncart + i]; - tmp += 3.8729833462074170 * cart[14 * ncart + i]; - output[i] += tmp * vector[6]; - - } - - // R_54c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -2.5617376914898995 * cart[2 * ncart + i]; - tmp += 2.5617376914898995 * cart[16 * ncart + i]; - tmp += 5.1234753829797990 * cart[9 * ncart + i]; - tmp += -5.1234753829797990 * cart[18 * ncart + i]; - output[i] += tmp * vector[7]; - - } - // R_54s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.5229125165837972 * cart[i]; - tmp += 1.0458250331675945 * cart[3 * ncart + i]; - tmp += 
1.5687375497513916 * cart[10 * ncart + i]; - tmp += 4.1833001326703778 * cart[5 * ncart + i]; - tmp += -12.5499003980111326 * cart[12 * ncart + i]; - output[i] += tmp * vector[8]; - - } - - // R_55c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.2185299186623562 * cart[2 * ncart + i]; - tmp += -13.3111795119741370 * cart[7 * ncart + i]; - tmp += 2.2185299186623562 * cart[16 * ncart + i]; - output[i] += tmp * vector[9]; - - } - // R_55s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.7015607600201140 * cart[i]; - tmp += -7.0156076002011405 * cart[3 * ncart + i]; - tmp += 3.5078038001005702 * cart[10 * ncart + i]; - output[i] += tmp * vector[10]; - - } - -} -void gg_cca_cart_to_spherical_L6(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_60 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = 4.0301597362883772 * cart[ncart + i]; - spherical[i] += -13.4338657876279228 * cart[6 * ncart + i]; - spherical[i] += 4.0301597362883772 * cart[15 * ncart + i]; - - } - - // R_61c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = 11.6340690431164280 * cart[4 * ncart + i]; - spherical[nspherical + i] += -23.2681380862328560 * cart[11 * ncart + i]; - spherical[nspherical + i] += 2.3268138086232857 * cart[22 * ncart + i]; - - } - // R_61s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = -1.9843134832984430 * cart[ncart + i]; - spherical[2 * nspherical + i] += 1.9843134832984430 * cart[15 * ncart + i]; - spherical[2 * nspherical + i] += 19.8431348329844290 * cart[8 * ncart + i]; - spherical[2 * nspherical + i] += -19.8431348329844290 * cart[17 * ncart + i]; - - } - - // R_62c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * nspherical + i] = -8.1513994197315593 * cart[4 * ncart + i]; - spherical[3 * nspherical + i] += -5.4342662798210393 * cart[11 * ncart + i]; - spherical[3 * nspherical + i] += 2.7171331399105196 * cart[22 * ncart + i]; - spherical[3 * nspherical + i] += 21.7370651192841571 * cart[13 * ncart + i]; - spherical[3 * nspherical + i] += -7.2456883730947190 * cart[24 * ncart + i]; - - } - // R_62s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = 0.9057110466368399 * cart[ncart + i]; - spherical[4 * nspherical + i] += 1.8114220932736798 * cart[6 * ncart + i]; - spherical[4 * nspherical + i] += 0.9057110466368399 * cart[15 * ncart + i]; - spherical[4 * nspherical + i] += -14.4913767461894381 * cart[8 * ncart + i]; - spherical[4 * nspherical + i] += -14.4913767461894381 * cart[17 * ncart + i]; - spherical[4 * nspherical + i] += 14.4913767461894381 * cart[19 * ncart + i]; - - } - - // R_63c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[5 * nspherical + i] = 2.8641098093473998 * cart[4 * ncart + i]; - spherical[5 * nspherical + i] += 5.7282196186947996 * cart[11 * ncart + i]; - spherical[5 * nspherical + i] += 2.8641098093473998 * cart[22 * ncart + i]; - spherical[5 * nspherical + i] += -11.4564392373895991 * cart[13 * ncart + i]; - spherical[5 * nspherical + i] += -11.4564392373895991 * cart[24 * ncart + i]; - spherical[5 * nspherical + i] += 4.5825756949558398 * cart[26 * ncart + i]; - - } - // R_63s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[6 * nspherical + i] = -0.3125000000000000 * cart[i]; - spherical[6 * nspherical + 
i] += -0.9375000000000000 * cart[3 * ncart + i]; - spherical[6 * nspherical + i] += -0.9375000000000000 * cart[10 * ncart + i]; - spherical[6 * nspherical + i] += -0.3125000000000000 * cart[21 * ncart + i]; - spherical[6 * nspherical + i] += 5.6250000000000000 * cart[5 * ncart + i]; - spherical[6 * nspherical + i] += 11.2500000000000000 * cart[12 * ncart + i]; - spherical[6 * nspherical + i] += 5.6250000000000000 * cart[23 * ncart + i]; - spherical[6 * nspherical + i] += -7.5000000000000000 * cart[14 * ncart + i]; - spherical[6 * nspherical + i] += -7.5000000000000000 * cart[25 * ncart + i]; - spherical[6 * nspherical + i] += cart[27 * ncart + i]; - - } - - // R_64c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[7 * nspherical + i] = 2.8641098093473998 * cart[2 * ncart + i]; - spherical[7 * nspherical + i] += 5.7282196186947996 * cart[7 * ncart + i]; - spherical[7 * nspherical + i] += 2.8641098093473998 * cart[16 * ncart + i]; - spherical[7 * nspherical + i] += -11.4564392373895991 * cart[9 * ncart + i]; - spherical[7 * nspherical + i] += -11.4564392373895991 * cart[18 * ncart + i]; - spherical[7 * nspherical + i] += 4.5825756949558398 * cart[20 * ncart + i]; - - } - // R_64s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[8 * nspherical + i] = 0.4528555233184199 * cart[i]; - spherical[8 * nspherical + i] += 0.4528555233184199 * cart[3 * ncart + i]; - spherical[8 * nspherical + i] += -0.4528555233184199 * cart[10 * ncart + i]; - spherical[8 * nspherical + i] += -0.4528555233184199 * cart[21 * ncart + i]; - spherical[8 * nspherical + i] += -7.2456883730947190 * cart[5 * ncart + i]; - spherical[8 * nspherical + i] += 7.2456883730947190 * cart[23 * ncart + i]; - spherical[8 * nspherical + i] += 7.2456883730947190 * cart[14 * ncart + i]; - spherical[8 * nspherical + i] += -7.2456883730947190 * cart[25 * ncart + i]; - - } - - // R_65c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[9 * nspherical + i] = -2.7171331399105196 * cart[2 * ncart + i]; - spherical[9 * nspherical + i] += 5.4342662798210393 * cart[7 * ncart + i]; - spherical[9 * nspherical + i] += 8.1513994197315593 * cart[16 * ncart + i]; - spherical[9 * nspherical + i] += 7.2456883730947190 * cart[9 * ncart + i]; - spherical[9 * nspherical + i] += -21.7370651192841571 * cart[18 * ncart + i]; - - } - // R_65s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[10 * nspherical + i] = -0.4960783708246108 * cart[i]; - spherical[10 * nspherical + i] += 2.4803918541230536 * cart[3 * ncart + i]; - spherical[10 * nspherical + i] += 2.4803918541230536 * cart[10 * ncart + i]; - spherical[10 * nspherical + i] += -0.4960783708246108 * cart[21 * ncart + i]; - spherical[10 * nspherical + i] += 4.9607837082461073 * cart[5 * ncart + i]; - spherical[10 * nspherical + i] += -29.7647022494766453 * cart[12 * ncart + i]; - spherical[10 * nspherical + i] += 4.9607837082461073 * cart[23 * ncart + i]; - - } - - // R_66c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[11 * nspherical + i] = 2.3268138086232857 * cart[2 * ncart + i]; - spherical[11 * nspherical + i] += -23.2681380862328560 * cart[7 * ncart + i]; - spherical[11 * nspherical + i] += 11.6340690431164280 * cart[16 * ncart + i]; - - } - // R_66s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[12 * nspherical + i] = 0.6716932893813962 * cart[i]; - spherical[12 * nspherical + i] += -10.0753993407209421 * cart[3 * ncart + i]; - spherical[12 * nspherical + i] += 10.0753993407209421 * cart[10 * 
ncart + i]; - spherical[12 * nspherical + i] += -0.6716932893813962 * cart[21 * ncart + i]; - - } - -} -void gg_cca_cart_to_spherical_sum_L6(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_60 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 4.0301597362883772 * cart[ncart + i]; - tmp += -13.4338657876279228 * cart[6 * ncart + i]; - tmp += 4.0301597362883772 * cart[15 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_61c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 11.6340690431164280 * cart[4 * ncart + i]; - tmp += -23.2681380862328560 * cart[11 * ncart + i]; - tmp += 2.3268138086232857 * cart[22 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_61s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -1.9843134832984430 * cart[ncart + i]; - tmp += 1.9843134832984430 * cart[15 * ncart + i]; - tmp += 19.8431348329844290 * cart[8 * ncart + i]; - tmp += -19.8431348329844290 * cart[17 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_62c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -8.1513994197315593 * cart[4 * ncart + i]; - tmp += -5.4342662798210393 * cart[11 * ncart + i]; - tmp += 2.7171331399105196 * cart[22 * ncart + i]; - tmp += 21.7370651192841571 * cart[13 * ncart + i]; - tmp += -7.2456883730947190 * cart[24 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_62s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.9057110466368399 * cart[ncart + i]; - tmp += 1.8114220932736798 * cart[6 * ncart + i]; - tmp += 0.9057110466368399 * cart[15 * ncart + i]; - tmp += -14.4913767461894381 * cart[8 * ncart + i]; - tmp += -14.4913767461894381 * cart[17 * ncart + i]; - tmp += 14.4913767461894381 * cart[19 * ncart + i]; - output[i] += tmp * vector[4]; - - } - - // R_63c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.8641098093473998 * cart[4 * ncart + i]; - tmp += 5.7282196186947996 * cart[11 * ncart + i]; - tmp += 2.8641098093473998 * cart[22 * ncart + i]; - tmp += -11.4564392373895991 * cart[13 * ncart + i]; - tmp += -11.4564392373895991 * cart[24 * ncart + i]; - tmp += 4.5825756949558398 * cart[26 * ncart + i]; - output[i] += tmp * vector[5]; - - } - // R_63s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.3125000000000000 * cart[i]; - tmp += -0.9375000000000000 * cart[3 * ncart + i]; - tmp += -0.9375000000000000 * cart[10 * ncart + i]; - tmp += -0.3125000000000000 * cart[21 * ncart + i]; - tmp += 5.6250000000000000 * cart[5 * ncart + i]; - tmp += 11.2500000000000000 * cart[12 * ncart + i]; - tmp += 5.6250000000000000 * cart[23 * ncart + i]; - tmp += -7.5000000000000000 * cart[14 * ncart + i]; - tmp += -7.5000000000000000 * cart[25 * ncart + i]; - tmp += cart[27 * ncart + i]; - output[i] += tmp * vector[6]; - - } - - // R_64c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.8641098093473998 * cart[2 * ncart + i]; - tmp += 5.7282196186947996 * cart[7 * ncart + i]; - tmp += 2.8641098093473998 * cart[16 * ncart + i]; - tmp += -11.4564392373895991 * cart[9 * ncart + i]; - tmp += -11.4564392373895991 * cart[18 * ncart + i]; - tmp += 4.5825756949558398 * cart[20 * ncart + i]; - output[i] += tmp * vector[7]; - - } - // R_64s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.4528555233184199 * cart[i]; - tmp += 0.4528555233184199 * cart[3 * 
ncart + i]; - tmp += -0.4528555233184199 * cart[10 * ncart + i]; - tmp += -0.4528555233184199 * cart[21 * ncart + i]; - tmp += -7.2456883730947190 * cart[5 * ncart + i]; - tmp += 7.2456883730947190 * cart[23 * ncart + i]; - tmp += 7.2456883730947190 * cart[14 * ncart + i]; - tmp += -7.2456883730947190 * cart[25 * ncart + i]; - output[i] += tmp * vector[8]; - - } - - // R_65c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -2.7171331399105196 * cart[2 * ncart + i]; - tmp += 5.4342662798210393 * cart[7 * ncart + i]; - tmp += 8.1513994197315593 * cart[16 * ncart + i]; - tmp += 7.2456883730947190 * cart[9 * ncart + i]; - tmp += -21.7370651192841571 * cart[18 * ncart + i]; - output[i] += tmp * vector[9]; - - } - // R_65s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.4960783708246108 * cart[i]; - tmp += 2.4803918541230536 * cart[3 * ncart + i]; - tmp += 2.4803918541230536 * cart[10 * ncart + i]; - tmp += -0.4960783708246108 * cart[21 * ncart + i]; - tmp += 4.9607837082461073 * cart[5 * ncart + i]; - tmp += -29.7647022494766453 * cart[12 * ncart + i]; - tmp += 4.9607837082461073 * cart[23 * ncart + i]; - output[i] += tmp * vector[10]; - - } - - // R_66c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.3268138086232857 * cart[2 * ncart + i]; - tmp += -23.2681380862328560 * cart[7 * ncart + i]; - tmp += 11.6340690431164280 * cart[16 * ncart + i]; - output[i] += tmp * vector[11]; - - } - // R_66s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.6716932893813962 * cart[i]; - tmp += -10.0753993407209421 * cart[3 * ncart + i]; - tmp += 10.0753993407209421 * cart[10 * ncart + i]; - tmp += -0.6716932893813962 * cart[21 * ncart + i]; - output[i] += tmp * vector[12]; - - } - -} -void gg_gaussian_cart_to_spherical_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_00 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = cart[i]; - - } - -} -void gg_gaussian_cart_to_spherical_sum_L0(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_00 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = cart[i]; - output[i] += tmp * vector[0]; - - } - -} -void gg_gaussian_cart_to_spherical_L1(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_10 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = cart[2 * ncart + i]; - - } - - // R_11c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = cart[i]; - - } - // R_11s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = cart[ncart + i]; - - } - -} -void gg_gaussian_cart_to_spherical_sum_L1(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_10 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = cart[2 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_11c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = cart[i]; - 
output[i] += tmp * vector[1]; - - } - // R_11s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = cart[ncart + i]; - output[i] += tmp * vector[2]; - - } - -} -void gg_gaussian_cart_to_spherical_L2(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_20 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = -0.5000000000000000 * cart[i]; - spherical[i] += -0.5000000000000000 * cart[3 * ncart + i]; - spherical[i] += cart[5 * ncart + i]; - - } - - // R_21c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = 1.7320508075688772 * cart[2 * ncart + i]; - - } - // R_21s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = 1.7320508075688772 * cart[4 * ncart + i]; - - } - - // R_22c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * nspherical + i] = 0.8660254037844386 * cart[i]; - spherical[3 * nspherical + i] += -0.8660254037844386 * cart[3 * ncart + i]; - - } - // R_22s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = 1.7320508075688772 * cart[ncart + i]; - - } - -} -void gg_gaussian_cart_to_spherical_sum_L2(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_20 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.5000000000000000 * cart[i]; - tmp += -0.5000000000000000 * cart[3 * ncart + i]; - tmp += cart[5 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_21c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.7320508075688772 * cart[2 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_21s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.7320508075688772 * cart[4 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_22c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.8660254037844386 * cart[i]; - tmp += -0.8660254037844386 * cart[3 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_22s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.7320508075688772 * cart[ncart + i]; - output[i] += tmp * vector[4]; - - } - -} -void gg_gaussian_cart_to_spherical_L3(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_30 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = -1.5000000000000000 * cart[2 * ncart + i]; - spherical[i] += -1.5000000000000000 * cart[7 * ncart + i]; - spherical[i] += cart[9 * ncart + i]; - - } - - // R_31c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = -0.6123724356957945 * cart[i]; - spherical[nspherical + i] += -0.6123724356957945 * cart[3 * ncart + i]; - spherical[nspherical + i] += 2.4494897427831779 * cart[5 * ncart + i]; - - } - // R_31s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = -0.6123724356957945 * cart[ncart + i]; - spherical[2 * nspherical + i] += -0.6123724356957945 * cart[6 * ncart + i]; - spherical[2 * nspherical + i] += 2.4494897427831779 * cart[8 * ncart + i]; - - } - - // R_32c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * 
nspherical + i] = 1.9364916731037085 * cart[2 * ncart + i]; - spherical[3 * nspherical + i] += -1.9364916731037085 * cart[7 * ncart + i]; - - } - // R_32s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = 3.8729833462074170 * cart[4 * ncart + i]; - - } - - // R_33c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[5 * nspherical + i] = 0.7905694150420949 * cart[i]; - spherical[5 * nspherical + i] += -2.3717082451262845 * cart[3 * ncart + i]; - - } - // R_33s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[6 * nspherical + i] = 2.3717082451262845 * cart[ncart + i]; - spherical[6 * nspherical + i] += -0.7905694150420949 * cart[6 * ncart + i]; - - } - -} -void gg_gaussian_cart_to_spherical_sum_L3(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_30 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -1.5000000000000000 * cart[2 * ncart + i]; - tmp += -1.5000000000000000 * cart[7 * ncart + i]; - tmp += cart[9 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_31c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.6123724356957945 * cart[i]; - tmp += -0.6123724356957945 * cart[3 * ncart + i]; - tmp += 2.4494897427831779 * cart[5 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_31s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.6123724356957945 * cart[ncart + i]; - tmp += -0.6123724356957945 * cart[6 * ncart + i]; - tmp += 2.4494897427831779 * cart[8 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_32c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.9364916731037085 * cart[2 * ncart + i]; - tmp += -1.9364916731037085 * cart[7 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_32s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 3.8729833462074170 * cart[4 * ncart + i]; - output[i] += tmp * vector[4]; - - } - - // R_33c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.7905694150420949 * cart[i]; - tmp += -2.3717082451262845 * cart[3 * ncart + i]; - output[i] += tmp * vector[5]; - - } - // R_33s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.3717082451262845 * cart[ncart + i]; - tmp += -0.7905694150420949 * cart[6 * ncart + i]; - output[i] += tmp * vector[6]; - - } - -} -void gg_gaussian_cart_to_spherical_L4(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_40 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = 0.3750000000000000 * cart[i]; - spherical[i] += 0.7500000000000000 * cart[3 * ncart + i]; - spherical[i] += 0.3750000000000000 * cart[10 * ncart + i]; - spherical[i] += -3.0000000000000000 * cart[5 * ncart + i]; - spherical[i] += -3.0000000000000000 * cart[12 * ncart + i]; - spherical[i] += cart[14 * ncart + i]; - - } - - // R_41c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = -2.3717082451262845 * cart[2 * ncart + i]; - spherical[nspherical + i] += -2.3717082451262845 * cart[7 * ncart + i]; - spherical[nspherical + i] += 3.1622776601683795 * cart[9 * ncart + i]; - - } - // R_41s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = -2.3717082451262845 * 
cart[4 * ncart + i]; - spherical[2 * nspherical + i] += -2.3717082451262845 * cart[11 * ncart + i]; - spherical[2 * nspherical + i] += 3.1622776601683795 * cart[13 * ncart + i]; - - } - - // R_42c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * nspherical + i] = -0.5590169943749475 * cart[i]; - spherical[3 * nspherical + i] += 0.5590169943749475 * cart[10 * ncart + i]; - spherical[3 * nspherical + i] += 3.3541019662496847 * cart[5 * ncart + i]; - spherical[3 * nspherical + i] += -3.3541019662496847 * cart[12 * ncart + i]; - - } - // R_42s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = -1.1180339887498949 * cart[ncart + i]; - spherical[4 * nspherical + i] += -1.1180339887498949 * cart[6 * ncart + i]; - spherical[4 * nspherical + i] += 6.7082039324993694 * cart[8 * ncart + i]; - - } - - // R_43c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[5 * nspherical + i] = 2.0916500663351889 * cart[2 * ncart + i]; - spherical[5 * nspherical + i] += -6.2749501990055663 * cart[7 * ncart + i]; - - } - // R_43s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[6 * nspherical + i] = 6.2749501990055663 * cart[4 * ncart + i]; - spherical[6 * nspherical + i] += -2.0916500663351889 * cart[11 * ncart + i]; - - } - - // R_44c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[7 * nspherical + i] = 0.7395099728874520 * cart[i]; - spherical[7 * nspherical + i] += -4.4370598373247123 * cart[3 * ncart + i]; - spherical[7 * nspherical + i] += 0.7395099728874520 * cart[10 * ncart + i]; - - } - // R_44s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[8 * nspherical + i] = 2.9580398915498081 * cart[ncart + i]; - spherical[8 * nspherical + i] += -2.9580398915498081 * cart[6 * ncart + i]; - - } - -} -void gg_gaussian_cart_to_spherical_sum_L4(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_40 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.3750000000000000 * cart[i]; - tmp += 0.7500000000000000 * cart[3 * ncart + i]; - tmp += 0.3750000000000000 * cart[10 * ncart + i]; - tmp += -3.0000000000000000 * cart[5 * ncart + i]; - tmp += -3.0000000000000000 * cart[12 * ncart + i]; - tmp += cart[14 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_41c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -2.3717082451262845 * cart[2 * ncart + i]; - tmp += -2.3717082451262845 * cart[7 * ncart + i]; - tmp += 3.1622776601683795 * cart[9 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_41s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -2.3717082451262845 * cart[4 * ncart + i]; - tmp += -2.3717082451262845 * cart[11 * ncart + i]; - tmp += 3.1622776601683795 * cart[13 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_42c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.5590169943749475 * cart[i]; - tmp += 0.5590169943749475 * cart[10 * ncart + i]; - tmp += 3.3541019662496847 * cart[5 * ncart + i]; - tmp += -3.3541019662496847 * cart[12 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_42s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -1.1180339887498949 * cart[ncart + i]; - tmp += -1.1180339887498949 * cart[6 * ncart + i]; - tmp += 6.7082039324993694 * cart[8 * ncart + i]; - output[i] += tmp * vector[4]; 
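
The fixed constants that recur throughout these generated cartesian-to-spherical kernels appear to be the expansion coefficients of the real solid harmonics over the CCA-ordered cartesian monomials; this can be checked directly against the L = 2 block above. A minimal sketch for L = 2 (the symbols S_{2,m} are illustrative names, not identifiers from the code), assuming the usual Racah-normalized real solid harmonic convention:

$$
\begin{aligned}
S_{2,0}  &= z^{2} - \tfrac{1}{2}\bigl(x^{2} + y^{2}\bigr) \\
S_{2,1c} &= \sqrt{3}\,xz \approx 1.7320508075688772\,xz \\
S_{2,1s} &= \sqrt{3}\,yz \\
S_{2,2c} &= \tfrac{\sqrt{3}}{2}\bigl(x^{2} - y^{2}\bigr) \approx 0.8660254037844386\,\bigl(x^{2} - y^{2}\bigr) \\
S_{2,2s} &= \sqrt{3}\,xy
\end{aligned}
$$

The higher-L blocks follow the same scheme; for example, 2.4494897427831779 ≈ √6 and 3.8729833462074170 ≈ √15 in the L = 3 kernels.
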
- - } - - // R_43c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.0916500663351889 * cart[2 * ncart + i]; - tmp += -6.2749501990055663 * cart[7 * ncart + i]; - output[i] += tmp * vector[5]; - - } - // R_43s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 6.2749501990055663 * cart[4 * ncart + i]; - tmp += -2.0916500663351889 * cart[11 * ncart + i]; - output[i] += tmp * vector[6]; - - } - - // R_44c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.7395099728874520 * cart[i]; - tmp += -4.4370598373247123 * cart[3 * ncart + i]; - tmp += 0.7395099728874520 * cart[10 * ncart + i]; - output[i] += tmp * vector[7]; - - } - // R_44s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.9580398915498081 * cart[ncart + i]; - tmp += -2.9580398915498081 * cart[6 * ncart + i]; - output[i] += tmp * vector[8]; - - } - -} -void gg_gaussian_cart_to_spherical_L5(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_50 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = 1.8750000000000000 * cart[2 * ncart + i]; - spherical[i] += 3.7500000000000000 * cart[7 * ncart + i]; - spherical[i] += 1.8750000000000000 * cart[16 * ncart + i]; - spherical[i] += -5.0000000000000000 * cart[9 * ncart + i]; - spherical[i] += -5.0000000000000000 * cart[18 * ncart + i]; - spherical[i] += cart[20 * ncart + i]; - - } - - // R_51c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = 0.4841229182759271 * cart[i]; - spherical[nspherical + i] += 0.9682458365518543 * cart[3 * ncart + i]; - spherical[nspherical + i] += 0.4841229182759271 * cart[10 * ncart + i]; - spherical[nspherical + i] += -5.8094750193111251 * cart[5 * ncart + i]; - spherical[nspherical + i] += -5.8094750193111251 * cart[12 * ncart + i]; - spherical[nspherical + i] += 3.8729833462074170 * cart[14 * ncart + i]; - - } - // R_51s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = 0.4841229182759271 * cart[ncart + i]; - spherical[2 * nspherical + i] += 0.9682458365518543 * cart[6 * ncart + i]; - spherical[2 * nspherical + i] += 0.4841229182759271 * cart[15 * ncart + i]; - spherical[2 * nspherical + i] += -5.8094750193111251 * cart[8 * ncart + i]; - spherical[2 * nspherical + i] += -5.8094750193111251 * cart[17 * ncart + i]; - spherical[2 * nspherical + i] += 3.8729833462074170 * cart[19 * ncart + i]; - - } - - // R_52c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * nspherical + i] = -2.5617376914898995 * cart[2 * ncart + i]; - spherical[3 * nspherical + i] += 2.5617376914898995 * cart[16 * ncart + i]; - spherical[3 * nspherical + i] += 5.1234753829797990 * cart[9 * ncart + i]; - spherical[3 * nspherical + i] += -5.1234753829797990 * cart[18 * ncart + i]; - - } - // R_52s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = -5.1234753829797990 * cart[4 * ncart + i]; - spherical[4 * nspherical + i] += -5.1234753829797990 * cart[11 * ncart + i]; - spherical[4 * nspherical + i] += 10.2469507659595980 * cart[13 * ncart + i]; - - } - - // R_53c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[5 * nspherical + i] = -0.5229125165837972 * cart[i]; - spherical[5 * nspherical + i] += 1.0458250331675945 * cart[3 * ncart + i]; - spherical[5 * nspherical + i] += 1.5687375497513916 * cart[10 * ncart + i]; - spherical[5 * 
nspherical + i] += 4.1833001326703778 * cart[5 * ncart + i]; - spherical[5 * nspherical + i] += -12.5499003980111326 * cart[12 * ncart + i]; - - } - // R_53s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[6 * nspherical + i] = -1.5687375497513916 * cart[ncart + i]; - spherical[6 * nspherical + i] += -1.0458250331675945 * cart[6 * ncart + i]; - spherical[6 * nspherical + i] += 0.5229125165837972 * cart[15 * ncart + i]; - spherical[6 * nspherical + i] += 12.5499003980111326 * cart[8 * ncart + i]; - spherical[6 * nspherical + i] += -4.1833001326703778 * cart[17 * ncart + i]; - - } - - // R_54c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[7 * nspherical + i] = 2.2185299186623562 * cart[2 * ncart + i]; - spherical[7 * nspherical + i] += -13.3111795119741370 * cart[7 * ncart + i]; - spherical[7 * nspherical + i] += 2.2185299186623562 * cart[16 * ncart + i]; - - } - // R_54s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[8 * nspherical + i] = 8.8741196746494246 * cart[4 * ncart + i]; - spherical[8 * nspherical + i] += -8.8741196746494246 * cart[11 * ncart + i]; - - } - - // R_55c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[9 * nspherical + i] = 0.7015607600201140 * cart[i]; - spherical[9 * nspherical + i] += -7.0156076002011405 * cart[3 * ncart + i]; - spherical[9 * nspherical + i] += 3.5078038001005702 * cart[10 * ncart + i]; - - } - // R_55s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[10 * nspherical + i] = 3.5078038001005702 * cart[ncart + i]; - spherical[10 * nspherical + i] += -7.0156076002011405 * cart[6 * ncart + i]; - spherical[10 * nspherical + i] += 0.7015607600201140 * cart[15 * ncart + i]; - - } - -} -void gg_gaussian_cart_to_spherical_sum_L5(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_50 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 1.8750000000000000 * cart[2 * ncart + i]; - tmp += 3.7500000000000000 * cart[7 * ncart + i]; - tmp += 1.8750000000000000 * cart[16 * ncart + i]; - tmp += -5.0000000000000000 * cart[9 * ncart + i]; - tmp += -5.0000000000000000 * cart[18 * ncart + i]; - tmp += cart[20 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_51c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.4841229182759271 * cart[i]; - tmp += 0.9682458365518543 * cart[3 * ncart + i]; - tmp += 0.4841229182759271 * cart[10 * ncart + i]; - tmp += -5.8094750193111251 * cart[5 * ncart + i]; - tmp += -5.8094750193111251 * cart[12 * ncart + i]; - tmp += 3.8729833462074170 * cart[14 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_51s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.4841229182759271 * cart[ncart + i]; - tmp += 0.9682458365518543 * cart[6 * ncart + i]; - tmp += 0.4841229182759271 * cart[15 * ncart + i]; - tmp += -5.8094750193111251 * cart[8 * ncart + i]; - tmp += -5.8094750193111251 * cart[17 * ncart + i]; - tmp += 3.8729833462074170 * cart[19 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_52c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -2.5617376914898995 * cart[2 * ncart + i]; - tmp += 2.5617376914898995 * cart[16 * ncart + i]; - tmp += 5.1234753829797990 * cart[9 * ncart + i]; - tmp += -5.1234753829797990 * cart[18 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_52s Transform 
- for (unsigned long i = 0; i < size; i++) { - tmp = -5.1234753829797990 * cart[4 * ncart + i]; - tmp += -5.1234753829797990 * cart[11 * ncart + i]; - tmp += 10.2469507659595980 * cart[13 * ncart + i]; - output[i] += tmp * vector[4]; - - } - - // R_53c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.5229125165837972 * cart[i]; - tmp += 1.0458250331675945 * cart[3 * ncart + i]; - tmp += 1.5687375497513916 * cart[10 * ncart + i]; - tmp += 4.1833001326703778 * cart[5 * ncart + i]; - tmp += -12.5499003980111326 * cart[12 * ncart + i]; - output[i] += tmp * vector[5]; - - } - // R_53s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -1.5687375497513916 * cart[ncart + i]; - tmp += -1.0458250331675945 * cart[6 * ncart + i]; - tmp += 0.5229125165837972 * cart[15 * ncart + i]; - tmp += 12.5499003980111326 * cart[8 * ncart + i]; - tmp += -4.1833001326703778 * cart[17 * ncart + i]; - output[i] += tmp * vector[6]; - - } - - // R_54c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.2185299186623562 * cart[2 * ncart + i]; - tmp += -13.3111795119741370 * cart[7 * ncart + i]; - tmp += 2.2185299186623562 * cart[16 * ncart + i]; - output[i] += tmp * vector[7]; - - } - // R_54s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 8.8741196746494246 * cart[4 * ncart + i]; - tmp += -8.8741196746494246 * cart[11 * ncart + i]; - output[i] += tmp * vector[8]; - - } - - // R_55c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.7015607600201140 * cart[i]; - tmp += -7.0156076002011405 * cart[3 * ncart + i]; - tmp += 3.5078038001005702 * cart[10 * ncart + i]; - output[i] += tmp * vector[9]; - - } - // R_55s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 3.5078038001005702 * cart[ncart + i]; - tmp += -7.0156076002011405 * cart[6 * ncart + i]; - tmp += 0.7015607600201140 * cart[15 * ncart + i]; - output[i] += tmp * vector[10]; - - } - -} -void gg_gaussian_cart_to_spherical_L6(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // R_60 Transform - for (unsigned long i = 0; i < size; i++) { - spherical[i] = -0.3125000000000000 * cart[i]; - spherical[i] += -0.9375000000000000 * cart[3 * ncart + i]; - spherical[i] += -0.9375000000000000 * cart[10 * ncart + i]; - spherical[i] += -0.3125000000000000 * cart[21 * ncart + i]; - spherical[i] += 5.6250000000000000 * cart[5 * ncart + i]; - spherical[i] += 11.2500000000000000 * cart[12 * ncart + i]; - spherical[i] += 5.6250000000000000 * cart[23 * ncart + i]; - spherical[i] += -7.5000000000000000 * cart[14 * ncart + i]; - spherical[i] += -7.5000000000000000 * cart[25 * ncart + i]; - spherical[i] += cart[27 * ncart + i]; - - } - - // R_61c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[nspherical + i] = 2.8641098093473998 * cart[2 * ncart + i]; - spherical[nspherical + i] += 5.7282196186947996 * cart[7 * ncart + i]; - spherical[nspherical + i] += 2.8641098093473998 * cart[16 * ncart + i]; - spherical[nspherical + i] += -11.4564392373895991 * cart[9 * ncart + i]; - spherical[nspherical + i] += -11.4564392373895991 * cart[18 * ncart + i]; - spherical[nspherical + i] += 4.5825756949558398 * cart[20 * ncart + i]; - - } - // R_61s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[2 * nspherical + i] = 2.8641098093473998 * cart[4 * ncart + i]; - spherical[2 * nspherical + i] += 5.7282196186947996 * cart[11 * ncart + i]; - 
spherical[2 * nspherical + i] += 2.8641098093473998 * cart[22 * ncart + i]; - spherical[2 * nspherical + i] += -11.4564392373895991 * cart[13 * ncart + i]; - spherical[2 * nspherical + i] += -11.4564392373895991 * cart[24 * ncart + i]; - spherical[2 * nspherical + i] += 4.5825756949558398 * cart[26 * ncart + i]; - - } - - // R_62c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[3 * nspherical + i] = 0.4528555233184199 * cart[i]; - spherical[3 * nspherical + i] += 0.4528555233184199 * cart[3 * ncart + i]; - spherical[3 * nspherical + i] += -0.4528555233184199 * cart[10 * ncart + i]; - spherical[3 * nspherical + i] += -0.4528555233184199 * cart[21 * ncart + i]; - spherical[3 * nspherical + i] += -7.2456883730947190 * cart[5 * ncart + i]; - spherical[3 * nspherical + i] += 7.2456883730947190 * cart[23 * ncart + i]; - spherical[3 * nspherical + i] += 7.2456883730947190 * cart[14 * ncart + i]; - spherical[3 * nspherical + i] += -7.2456883730947190 * cart[25 * ncart + i]; - - } - // R_62s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[4 * nspherical + i] = 0.9057110466368399 * cart[ncart + i]; - spherical[4 * nspherical + i] += 1.8114220932736798 * cart[6 * ncart + i]; - spherical[4 * nspherical + i] += 0.9057110466368399 * cart[15 * ncart + i]; - spherical[4 * nspherical + i] += -14.4913767461894381 * cart[8 * ncart + i]; - spherical[4 * nspherical + i] += -14.4913767461894381 * cart[17 * ncart + i]; - spherical[4 * nspherical + i] += 14.4913767461894381 * cart[19 * ncart + i]; - - } - - // R_63c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[5 * nspherical + i] = -2.7171331399105196 * cart[2 * ncart + i]; - spherical[5 * nspherical + i] += 5.4342662798210393 * cart[7 * ncart + i]; - spherical[5 * nspherical + i] += 8.1513994197315593 * cart[16 * ncart + i]; - spherical[5 * nspherical + i] += 7.2456883730947190 * cart[9 * ncart + i]; - spherical[5 * nspherical + i] += -21.7370651192841571 * cart[18 * ncart + i]; - - } - // R_63s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[6 * nspherical + i] = -8.1513994197315593 * cart[4 * ncart + i]; - spherical[6 * nspherical + i] += -5.4342662798210393 * cart[11 * ncart + i]; - spherical[6 * nspherical + i] += 2.7171331399105196 * cart[22 * ncart + i]; - spherical[6 * nspherical + i] += 21.7370651192841571 * cart[13 * ncart + i]; - spherical[6 * nspherical + i] += -7.2456883730947190 * cart[24 * ncart + i]; - - } - - // R_64c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[7 * nspherical + i] = -0.4960783708246108 * cart[i]; - spherical[7 * nspherical + i] += 2.4803918541230536 * cart[3 * ncart + i]; - spherical[7 * nspherical + i] += 2.4803918541230536 * cart[10 * ncart + i]; - spherical[7 * nspherical + i] += -0.4960783708246108 * cart[21 * ncart + i]; - spherical[7 * nspherical + i] += 4.9607837082461073 * cart[5 * ncart + i]; - spherical[7 * nspherical + i] += -29.7647022494766453 * cart[12 * ncart + i]; - spherical[7 * nspherical + i] += 4.9607837082461073 * cart[23 * ncart + i]; - - } - // R_64s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[8 * nspherical + i] = -1.9843134832984430 * cart[ncart + i]; - spherical[8 * nspherical + i] += 1.9843134832984430 * cart[15 * ncart + i]; - spherical[8 * nspherical + i] += 19.8431348329844290 * cart[8 * ncart + i]; - spherical[8 * nspherical + i] += -19.8431348329844290 * cart[17 * ncart + i]; - - } - - // R_65c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[9 * nspherical 
+ i] = 2.3268138086232857 * cart[2 * ncart + i]; - spherical[9 * nspherical + i] += -23.2681380862328560 * cart[7 * ncart + i]; - spherical[9 * nspherical + i] += 11.6340690431164280 * cart[16 * ncart + i]; - - } - // R_65s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[10 * nspherical + i] = 11.6340690431164280 * cart[4 * ncart + i]; - spherical[10 * nspherical + i] += -23.2681380862328560 * cart[11 * ncart + i]; - spherical[10 * nspherical + i] += 2.3268138086232857 * cart[22 * ncart + i]; - - } - - // R_66c Transform - for (unsigned long i = 0; i < size; i++) { - spherical[11 * nspherical + i] = 0.6716932893813962 * cart[i]; - spherical[11 * nspherical + i] += -10.0753993407209421 * cart[3 * ncart + i]; - spherical[11 * nspherical + i] += 10.0753993407209421 * cart[10 * ncart + i]; - spherical[11 * nspherical + i] += -0.6716932893813962 * cart[21 * ncart + i]; - - } - // R_66s Transform - for (unsigned long i = 0; i < size; i++) { - spherical[12 * nspherical + i] = 4.0301597362883772 * cart[ncart + i]; - spherical[12 * nspherical + i] += -13.4338657876279228 * cart[6 * ncart + i]; - spherical[12 * nspherical + i] += 4.0301597362883772 * cart[15 * ncart + i]; - - } - -} -void gg_gaussian_cart_to_spherical_sum_L6(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) { - ASSUME_ALIGNED(cart, 64); - // temps - double tmp; - // R_60 Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.3125000000000000 * cart[i]; - tmp += -0.9375000000000000 * cart[3 * ncart + i]; - tmp += -0.9375000000000000 * cart[10 * ncart + i]; - tmp += -0.3125000000000000 * cart[21 * ncart + i]; - tmp += 5.6250000000000000 * cart[5 * ncart + i]; - tmp += 11.2500000000000000 * cart[12 * ncart + i]; - tmp += 5.6250000000000000 * cart[23 * ncart + i]; - tmp += -7.5000000000000000 * cart[14 * ncart + i]; - tmp += -7.5000000000000000 * cart[25 * ncart + i]; - tmp += cart[27 * ncart + i]; - output[i] += tmp * vector[0]; - - } - - // R_61c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.8641098093473998 * cart[2 * ncart + i]; - tmp += 5.7282196186947996 * cart[7 * ncart + i]; - tmp += 2.8641098093473998 * cart[16 * ncart + i]; - tmp += -11.4564392373895991 * cart[9 * ncart + i]; - tmp += -11.4564392373895991 * cart[18 * ncart + i]; - tmp += 4.5825756949558398 * cart[20 * ncart + i]; - output[i] += tmp * vector[1]; - - } - // R_61s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.8641098093473998 * cart[4 * ncart + i]; - tmp += 5.7282196186947996 * cart[11 * ncart + i]; - tmp += 2.8641098093473998 * cart[22 * ncart + i]; - tmp += -11.4564392373895991 * cart[13 * ncart + i]; - tmp += -11.4564392373895991 * cart[24 * ncart + i]; - tmp += 4.5825756949558398 * cart[26 * ncart + i]; - output[i] += tmp * vector[2]; - - } - - // R_62c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.4528555233184199 * cart[i]; - tmp += 0.4528555233184199 * cart[3 * ncart + i]; - tmp += -0.4528555233184199 * cart[10 * ncart + i]; - tmp += -0.4528555233184199 * cart[21 * ncart + i]; - tmp += -7.2456883730947190 * cart[5 * ncart + i]; - tmp += 7.2456883730947190 * cart[23 * ncart + i]; - tmp += 7.2456883730947190 * cart[14 * ncart + i]; - tmp += -7.2456883730947190 * cart[25 * ncart + i]; - output[i] += tmp * vector[3]; - - } - // R_62s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.9057110466368399 * cart[ncart + i]; - tmp += 
1.8114220932736798 * cart[6 * ncart + i]; - tmp += 0.9057110466368399 * cart[15 * ncart + i]; - tmp += -14.4913767461894381 * cart[8 * ncart + i]; - tmp += -14.4913767461894381 * cart[17 * ncart + i]; - tmp += 14.4913767461894381 * cart[19 * ncart + i]; - output[i] += tmp * vector[4]; - - } - - // R_63c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -2.7171331399105196 * cart[2 * ncart + i]; - tmp += 5.4342662798210393 * cart[7 * ncart + i]; - tmp += 8.1513994197315593 * cart[16 * ncart + i]; - tmp += 7.2456883730947190 * cart[9 * ncart + i]; - tmp += -21.7370651192841571 * cart[18 * ncart + i]; - output[i] += tmp * vector[5]; - - } - // R_63s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -8.1513994197315593 * cart[4 * ncart + i]; - tmp += -5.4342662798210393 * cart[11 * ncart + i]; - tmp += 2.7171331399105196 * cart[22 * ncart + i]; - tmp += 21.7370651192841571 * cart[13 * ncart + i]; - tmp += -7.2456883730947190 * cart[24 * ncart + i]; - output[i] += tmp * vector[6]; - - } - - // R_64c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -0.4960783708246108 * cart[i]; - tmp += 2.4803918541230536 * cart[3 * ncart + i]; - tmp += 2.4803918541230536 * cart[10 * ncart + i]; - tmp += -0.4960783708246108 * cart[21 * ncart + i]; - tmp += 4.9607837082461073 * cart[5 * ncart + i]; - tmp += -29.7647022494766453 * cart[12 * ncart + i]; - tmp += 4.9607837082461073 * cart[23 * ncart + i]; - output[i] += tmp * vector[7]; - - } - // R_64s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = -1.9843134832984430 * cart[ncart + i]; - tmp += 1.9843134832984430 * cart[15 * ncart + i]; - tmp += 19.8431348329844290 * cart[8 * ncart + i]; - tmp += -19.8431348329844290 * cart[17 * ncart + i]; - output[i] += tmp * vector[8]; - - } - - // R_65c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 2.3268138086232857 * cart[2 * ncart + i]; - tmp += -23.2681380862328560 * cart[7 * ncart + i]; - tmp += 11.6340690431164280 * cart[16 * ncart + i]; - output[i] += tmp * vector[9]; - - } - // R_65s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 11.6340690431164280 * cart[4 * ncart + i]; - tmp += -23.2681380862328560 * cart[11 * ncart + i]; - tmp += 2.3268138086232857 * cart[22 * ncart + i]; - output[i] += tmp * vector[10]; - - } - - // R_66c Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 0.6716932893813962 * cart[i]; - tmp += -10.0753993407209421 * cart[3 * ncart + i]; - tmp += 10.0753993407209421 * cart[10 * ncart + i]; - tmp += -0.6716932893813962 * cart[21 * ncart + i]; - output[i] += tmp * vector[11]; - - } - // R_66s Transform - for (unsigned long i = 0; i < size; i++) { - tmp = 4.0301597362883772 * cart[ncart + i]; - tmp += -13.4338657876279228 * cart[6 * ncart + i]; - tmp += 4.0301597362883772 * cart[15 * ncart + i]; - output[i] += tmp * vector[12]; - - } - -} -void gg_cca_cart_copy_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (0, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_cca_cart_sum_L0(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, 
const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (0, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_cca_cart_copy_L1(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (1, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 1) - inp_shift = 2 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_cca_cart_sum_L1(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (1, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_cca_cart_copy_L2(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (2, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 0, 1) - inp_shift = 2 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 2, 0) - inp_shift = 3 * ncart_input; - out_shift = 3 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 1) - inp_shift = 4 * ncart_input; - out_shift = 4 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 2) - inp_shift = 5 * ncart_input; - out_shift = 5 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_cca_cart_sum_L2(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const 
unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (2, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 2, 0) - in_shift = 3 * ncart_input; - coef = vector[3]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 1) - in_shift = 4 * ncart_input; - coef = vector[4]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 2) - in_shift = 5 * ncart_input; - coef = vector[5]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_cca_cart_copy_L3(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (3, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 0, 1) - inp_shift = 2 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 2, 0) - inp_shift = 3 * ncart_input; - out_shift = 3 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 1, 1) - inp_shift = 4 * ncart_input; - out_shift = 4 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 0, 2) - inp_shift = 5 * ncart_input; - out_shift = 5 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 3, 0) - inp_shift = 6 * ncart_input; - out_shift = 6 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 2, 1) - inp_shift = 7 * ncart_input; - out_shift = 7 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 2) - inp_shift = 8 * ncart_input; - out_shift = 8 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 3) - inp_shift = 9 * ncart_input; - out_shift = 9 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_cca_cart_sum_L3(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* 
PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (3, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 2, 0) - in_shift = 3 * ncart_input; - coef = vector[3]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 1, 1) - in_shift = 4 * ncart_input; - coef = vector[4]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 0, 2) - in_shift = 5 * ncart_input; - coef = vector[5]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 3, 0) - in_shift = 6 * ncart_input; - coef = vector[6]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 2, 1) - in_shift = 7 * ncart_input; - coef = vector[7]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 2) - in_shift = 8 * ncart_input; - coef = vector[8]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 3) - in_shift = 9 * ncart_input; - coef = vector[9]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_cca_cart_copy_L4(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (4, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 0, 1) - inp_shift = 2 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 2, 0) - inp_shift = 3 * ncart_input; - out_shift = 3 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 1, 1) - inp_shift = 4 * ncart_input; - out_shift = 4 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 0, 2) - inp_shift = 5 * ncart_input; - out_shift = 5 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 3, 0) - inp_shift = 6 * ncart_input; - out_shift = 6 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 2, 1) - inp_shift = 7 * ncart_input; - out_shift = 7 * ncart_out; - for 
(unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 1, 2) - inp_shift = 8 * ncart_input; - out_shift = 8 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 0, 3) - inp_shift = 9 * ncart_input; - out_shift = 9 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 4, 0) - inp_shift = 10 * ncart_input; - out_shift = 10 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 3, 1) - inp_shift = 11 * ncart_input; - out_shift = 11 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 2, 2) - inp_shift = 12 * ncart_input; - out_shift = 12 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 3) - inp_shift = 13 * ncart_input; - out_shift = 13 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 4) - inp_shift = 14 * ncart_input; - out_shift = 14 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_cca_cart_sum_L4(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (4, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 2, 0) - in_shift = 3 * ncart_input; - coef = vector[3]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 1, 1) - in_shift = 4 * ncart_input; - coef = vector[4]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 0, 2) - in_shift = 5 * ncart_input; - coef = vector[5]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 3, 0) - in_shift = 6 * ncart_input; - coef = vector[6]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 2, 1) - in_shift = 7 * ncart_input; - coef = vector[7]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 1, 2) - in_shift = 8 * ncart_input; - coef = vector[8]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 0, 3) - in_shift = 9 * ncart_input; - coef = vector[9]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 4, 0) - in_shift = 10 * ncart_input; - coef = vector[10]; - for (unsigned long i 
= 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 3, 1) - in_shift = 11 * ncart_input; - coef = vector[11]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 2, 2) - in_shift = 12 * ncart_input; - coef = vector[12]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 3) - in_shift = 13 * ncart_input; - coef = vector[13]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 4) - in_shift = 14 * ncart_input; - coef = vector[14]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_cca_cart_copy_L5(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (5, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (4, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (4, 0, 1) - inp_shift = 2 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 2, 0) - inp_shift = 3 * ncart_input; - out_shift = 3 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 1, 1) - inp_shift = 4 * ncart_input; - out_shift = 4 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 0, 2) - inp_shift = 5 * ncart_input; - out_shift = 5 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 3, 0) - inp_shift = 6 * ncart_input; - out_shift = 6 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 2, 1) - inp_shift = 7 * ncart_input; - out_shift = 7 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 1, 2) - inp_shift = 8 * ncart_input; - out_shift = 8 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 0, 3) - inp_shift = 9 * ncart_input; - out_shift = 9 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 4, 0) - inp_shift = 10 * ncart_input; - out_shift = 10 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 3, 1) - inp_shift = 11 * ncart_input; - out_shift = 11 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 2, 2) - inp_shift = 12 * ncart_input; - out_shift = 12 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 1, 3) - inp_shift = 13 * ncart_input; - out_shift = 13 * 
ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 0, 4) - inp_shift = 14 * ncart_input; - out_shift = 14 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 5, 0) - inp_shift = 15 * ncart_input; - out_shift = 15 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 4, 1) - inp_shift = 16 * ncart_input; - out_shift = 16 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 3, 2) - inp_shift = 17 * ncart_input; - out_shift = 17 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 2, 3) - inp_shift = 18 * ncart_input; - out_shift = 18 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 4) - inp_shift = 19 * ncart_input; - out_shift = 19 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 5) - inp_shift = 20 * ncart_input; - out_shift = 20 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_cca_cart_sum_L5(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (5, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (4, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (4, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 2, 0) - in_shift = 3 * ncart_input; - coef = vector[3]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 1, 1) - in_shift = 4 * ncart_input; - coef = vector[4]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 0, 2) - in_shift = 5 * ncart_input; - coef = vector[5]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 3, 0) - in_shift = 6 * ncart_input; - coef = vector[6]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 2, 1) - in_shift = 7 * ncart_input; - coef = vector[7]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 1, 2) - in_shift = 8 * ncart_input; - coef = vector[8]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 0, 3) - in_shift = 9 * ncart_input; - coef = vector[9]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 4, 0) - in_shift = 10 * ncart_input; - coef = vector[10]; - 
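
For orientation, the `gg_*_cart_copy_*` and `gg_*_cart_sum_*` families all share one access pattern: the cartesian components are stored component-major with leading dimensions `ncart_input`/`ncart_out`, the copy variant moves each component to the same slot of the output, and the sum variant contracts the components against `vector` into a single output row. The `PRAGMA_RESTRICT`/`ASSUME_ALIGNED` tokens are presumably portability macros for the `restrict` qualifier and alignment hints; they are omitted from the sketch below, which uses hypothetical helper names rather than functions from this file:

```c
#include <stddef.h>

/* Generic form of the unrolled gg_*_cart_copy_* kernels: ncomp cartesian
 * components, each a row of `size` grid values, with strides ld_in/ld_out. */
static void cart_copy_generic(size_t size, size_t ncomp,
                              const double *in, size_t ld_in,
                              double *out, size_t ld_out) {
    for (size_t c = 0; c < ncomp; c++)
        for (size_t i = 0; i < size; i++)
            out[c * ld_out + i] = in[c * ld_in + i];
}

/* Generic form of the gg_*_cart_sum_* kernels: contract the ncomp component
 * rows against `vector` and accumulate into a single output row.            */
static void cart_sum_generic(size_t size, size_t ncomp,
                             const double *vector,        /* ncomp coefficients */
                             const double *in, size_t ld_in,
                             double *out) {                /* one row of `size`  */
    for (size_t c = 0; c < ncomp; c++) {
        const double coef = vector[c];
        for (size_t i = 0; i < size; i++)
            out[i] += coef * in[c * ld_in + i];
    }
}
```

The generated per-L functions simply unroll these loops over the CCA component list, with ncomp = (L+1)(L+2)/2 (e.g. 28 components for L = 6).
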
for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 3, 1) - in_shift = 11 * ncart_input; - coef = vector[11]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 2, 2) - in_shift = 12 * ncart_input; - coef = vector[12]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 1, 3) - in_shift = 13 * ncart_input; - coef = vector[13]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 0, 4) - in_shift = 14 * ncart_input; - coef = vector[14]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 5, 0) - in_shift = 15 * ncart_input; - coef = vector[15]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 4, 1) - in_shift = 16 * ncart_input; - coef = vector[16]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 3, 2) - in_shift = 17 * ncart_input; - coef = vector[17]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 2, 3) - in_shift = 18 * ncart_input; - coef = vector[18]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 4) - in_shift = 19 * ncart_input; - coef = vector[19]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 5) - in_shift = 20 * ncart_input; - coef = vector[20]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_cca_cart_copy_L6(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (6, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (5, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (5, 0, 1) - inp_shift = 2 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (4, 2, 0) - inp_shift = 3 * ncart_input; - out_shift = 3 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (4, 1, 1) - inp_shift = 4 * ncart_input; - out_shift = 4 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (4, 0, 2) - inp_shift = 5 * ncart_input; - out_shift = 5 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 3, 0) - inp_shift = 6 * ncart_input; - out_shift = 6 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 2, 1) - inp_shift = 7 * ncart_input; - out_shift = 7 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - 
cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 1, 2) - inp_shift = 8 * ncart_input; - out_shift = 8 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 0, 3) - inp_shift = 9 * ncart_input; - out_shift = 9 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 4, 0) - inp_shift = 10 * ncart_input; - out_shift = 10 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 3, 1) - inp_shift = 11 * ncart_input; - out_shift = 11 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 2, 2) - inp_shift = 12 * ncart_input; - out_shift = 12 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 1, 3) - inp_shift = 13 * ncart_input; - out_shift = 13 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 0, 4) - inp_shift = 14 * ncart_input; - out_shift = 14 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 5, 0) - inp_shift = 15 * ncart_input; - out_shift = 15 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 4, 1) - inp_shift = 16 * ncart_input; - out_shift = 16 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 3, 2) - inp_shift = 17 * ncart_input; - out_shift = 17 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 2, 3) - inp_shift = 18 * ncart_input; - out_shift = 18 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 1, 4) - inp_shift = 19 * ncart_input; - out_shift = 19 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 0, 5) - inp_shift = 20 * ncart_input; - out_shift = 20 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 6, 0) - inp_shift = 21 * ncart_input; - out_shift = 21 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 5, 1) - inp_shift = 22 * ncart_input; - out_shift = 22 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 4, 2) - inp_shift = 23 * ncart_input; - out_shift = 23 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 3, 3) - inp_shift = 24 * ncart_input; - out_shift = 24 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 2, 4) - inp_shift = 25 * ncart_input; - out_shift = 25 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 5) - inp_shift = 26 * ncart_input; - out_shift = 26 * ncart_out; - for (unsigned long i = 0; i < 
size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 6) - inp_shift = 27 * ncart_input; - out_shift = 27 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_cca_cart_sum_L6(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (6, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (5, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (5, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (4, 2, 0) - in_shift = 3 * ncart_input; - coef = vector[3]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (4, 1, 1) - in_shift = 4 * ncart_input; - coef = vector[4]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (4, 0, 2) - in_shift = 5 * ncart_input; - coef = vector[5]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 3, 0) - in_shift = 6 * ncart_input; - coef = vector[6]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 2, 1) - in_shift = 7 * ncart_input; - coef = vector[7]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 1, 2) - in_shift = 8 * ncart_input; - coef = vector[8]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 0, 3) - in_shift = 9 * ncart_input; - coef = vector[9]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 4, 0) - in_shift = 10 * ncart_input; - coef = vector[10]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 3, 1) - in_shift = 11 * ncart_input; - coef = vector[11]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 2, 2) - in_shift = 12 * ncart_input; - coef = vector[12]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 1, 3) - in_shift = 13 * ncart_input; - coef = vector[13]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 0, 4) - in_shift = 14 * ncart_input; - coef = vector[14]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 5, 0) - in_shift = 15 * ncart_input; - coef = vector[15]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 4, 1) - in_shift = 16 * ncart_input; - coef = vector[16]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 3, 2) - in_shift = 
17 * ncart_input; - coef = vector[17]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 2, 3) - in_shift = 18 * ncart_input; - coef = vector[18]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 1, 4) - in_shift = 19 * ncart_input; - coef = vector[19]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 0, 5) - in_shift = 20 * ncart_input; - coef = vector[20]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 6, 0) - in_shift = 21 * ncart_input; - coef = vector[21]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 5, 1) - in_shift = 22 * ncart_input; - coef = vector[22]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 4, 2) - in_shift = 23 * ncart_input; - coef = vector[23]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 3, 3) - in_shift = 24 * ncart_input; - coef = vector[24]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 2, 4) - in_shift = 25 * ncart_input; - coef = vector[25]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 5) - in_shift = 26 * ncart_input; - coef = vector[26]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 6) - in_shift = 27 * ncart_input; - coef = vector[27]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_molden_cart_copy_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (0, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_molden_cart_sum_L0(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (0, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_molden_cart_copy_L1(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (1, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 1) - inp_shift = 
2 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_molden_cart_sum_L1(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (1, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_molden_cart_copy_L2(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (2, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 3 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 0, 1) - inp_shift = 2 * ncart_input; - out_shift = 4 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 2, 0) - inp_shift = 3 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 1) - inp_shift = 4 * ncart_input; - out_shift = 5 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 2) - inp_shift = 5 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_molden_cart_sum_L2(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (2, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[3]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[4]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 2, 0) - in_shift = 3 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 1) - in_shift = 4 * ncart_input; - coef = vector[5]; - for (unsigned long i = 0; i < size; i++) { - 
cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 2) - in_shift = 5 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_molden_cart_copy_L3(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (3, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 4 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 0, 1) - inp_shift = 2 * ncart_input; - out_shift = 5 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 2, 0) - inp_shift = 3 * ncart_input; - out_shift = 3 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 1, 1) - inp_shift = 4 * ncart_input; - out_shift = 9 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 0, 2) - inp_shift = 5 * ncart_input; - out_shift = 6 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 3, 0) - inp_shift = 6 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 2, 1) - inp_shift = 7 * ncart_input; - out_shift = 8 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 2) - inp_shift = 8 * ncart_input; - out_shift = 7 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 3) - inp_shift = 9 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_molden_cart_sum_L3(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (3, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[4]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[5]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 2, 0) - in_shift = 3 * ncart_input; - coef = vector[3]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 1, 1) - in_shift = 4 * ncart_input; - coef = vector[9]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * 
cart_input[in_shift + i]; - } - - // Copy (1, 0, 2) - in_shift = 5 * ncart_input; - coef = vector[6]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 3, 0) - in_shift = 6 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 2, 1) - in_shift = 7 * ncart_input; - coef = vector[8]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 2) - in_shift = 8 * ncart_input; - coef = vector[7]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 3) - in_shift = 9 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_molden_cart_copy_L4(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long inp_shift; - unsigned long out_shift; - - // Copy (4, 0, 0) - inp_shift = 0 * ncart_input; - out_shift = 0 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 1, 0) - inp_shift = 1 * ncart_input; - out_shift = 3 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (3, 0, 1) - inp_shift = 2 * ncart_input; - out_shift = 4 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 2, 0) - inp_shift = 3 * ncart_input; - out_shift = 9 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 1, 1) - inp_shift = 4 * ncart_input; - out_shift = 12 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (2, 0, 2) - inp_shift = 5 * ncart_input; - out_shift = 10 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 3, 0) - inp_shift = 6 * ncart_input; - out_shift = 5 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 2, 1) - inp_shift = 7 * ncart_input; - out_shift = 13 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 1, 2) - inp_shift = 8 * ncart_input; - out_shift = 14 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (1, 0, 3) - inp_shift = 9 * ncart_input; - out_shift = 7 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 4, 0) - inp_shift = 10 * ncart_input; - out_shift = 1 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 3, 1) - inp_shift = 11 * ncart_input; - out_shift = 6 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 2, 2) - inp_shift = 12 * ncart_input; - out_shift = 11 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - 
cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 1, 3) - inp_shift = 13 * ncart_input; - out_shift = 8 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } - - // Copy (0, 0, 4) - inp_shift = 14 * ncart_input; - out_shift = 2 * ncart_out; - for (unsigned long i = 0; i < size; i++) { - cart_out[out_shift + i] = cart_input[inp_shift + i]; - } -} -void gg_molden_cart_sum_L4(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { - - ASSUME_ALIGNED(cart_input, 64); - unsigned long in_shift; - unsigned long out_shift; - double coef; - - // Copy (4, 0, 0) - in_shift = 0 * ncart_input; - coef = vector[0]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 1, 0) - in_shift = 1 * ncart_input; - coef = vector[3]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (3, 0, 1) - in_shift = 2 * ncart_input; - coef = vector[4]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 2, 0) - in_shift = 3 * ncart_input; - coef = vector[9]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 1, 1) - in_shift = 4 * ncart_input; - coef = vector[12]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (2, 0, 2) - in_shift = 5 * ncart_input; - coef = vector[10]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 3, 0) - in_shift = 6 * ncart_input; - coef = vector[5]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 2, 1) - in_shift = 7 * ncart_input; - coef = vector[13]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 1, 2) - in_shift = 8 * ncart_input; - coef = vector[14]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (1, 0, 3) - in_shift = 9 * ncart_input; - coef = vector[7]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 4, 0) - in_shift = 10 * ncart_input; - coef = vector[1]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 3, 1) - in_shift = 11 * ncart_input; - coef = vector[6]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 2, 2) - in_shift = 12 * ncart_input; - coef = vector[11]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 1, 3) - in_shift = 13 * ncart_input; - coef = vector[8]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } - - // Copy (0, 0, 4) - in_shift = 14 * ncart_input; - coef = vector[2]; - for (unsigned long i = 0; i < size; i++) { - cart_out[i] += coef * cart_input[in_shift + i]; - } -} -void gg_molden_cart_copy_L5(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { -} 
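The generated `gg_*_cart_copy_*` and `gg_*_cart_sum_*` routines above are all instances of the same two operations on a block of `size` grid points: a permutation of the Cartesian components between CCA and Molden ordering, and, for the `_sum_` variants, a contraction of those components with a per-component coefficient vector. A minimal NumPy sketch of the L = 2 Molden case follows; the function names, the explicit index map, and the assumption of a component-major `(ncomponents, npoints)` layout are illustrative only and are not part of the gau2grid API.

```python
import numpy as np

# Illustrative sketch (assumed names and layout, not gau2grid API).
# CCA order for L = 2:    xx, xy, xz, yy, yz, zz
# Molden order for L = 2: xx, yy, zz, xy, xz, yz
CCA_SLOT_FOR_MOLDEN_L2 = [0, 3, 5, 1, 2, 4]  # Molden slot -> CCA slot

def molden_cart_copy_L2(cart_input):
    """What gg_molden_cart_copy_L2 amounts to: permute the component rows."""
    return cart_input[CCA_SLOT_FOR_MOLDEN_L2, :]

def molden_cart_sum_L2(vector, cart_input, cart_out):
    """What gg_molden_cart_sum_L2 amounts to: accumulate coefficient-weighted
    components into one row, reading the coefficient by its Molden index and
    the input row by its CCA index."""
    for molden_slot, cca_slot in enumerate(CCA_SLOT_FOR_MOLDEN_L2):
        cart_out += vector[molden_slot] * cart_input[cca_slot, :]
    return cart_out

# Example on 4 grid points: 6 Cartesian d components, unit coefficients.
phi_cca = np.arange(24, dtype=float).reshape(6, 4)
out = molden_cart_sum_L2(np.ones(6), phi_cca, np.zeros(4))
```

The CCA-ordered `_sum_` routines (for example `gg_cca_cart_sum_L6` above) perform the same contraction with an identity index map, which is why each of their loops reads `vector[k]` and input component `k` with the same index.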
-void gg_molden_cart_sum_L5(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { -} -void gg_molden_cart_copy_L6(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { -} -void gg_molden_cart_sum_L6(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) { -} -void gg_naive_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, double* PRAGMA_RESTRICT output) { - ASSUME_ALIGNED(input, 64); - for (unsigned long i = 0; i < n; i++) { - for (unsigned long j = 0; j < m; j++) { - output[j * n + i] = input[i * m + j]; - } - } -} -void gg_fast_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, double* PRAGMA_RESTRICT output) { - - // Temps - #ifdef _MSC_VER - __declspec(align(64)) double tmp[64]; - #else - double tmp[64] __attribute__((aligned(64))); - #endif - ASSUME_ALIGNED(input, 64); - // Sizing - unsigned long nblocks = n / 8; - nblocks += (n % 8) ? 1 : 0; - unsigned long mblocks = m / 8; - mblocks += (m % 8) ? 1 : 0; - // Outer blocks - for (unsigned long nb = 0; nb < nblocks; nb++) { - const unsigned long nstart = nb * 8; - unsigned long nremain = ((nstart + 8) > n) ? (n - nstart) : 8; - for (unsigned long mb = 0; mb < mblocks; mb++) { - const unsigned long mstart = mb * 8; - unsigned long mremain = ((mstart + 8) > m) ? (m - mstart) : 8; - // Copy data to inner block - for (unsigned long l = 0; l < nremain; l++) { - const unsigned long start = (nstart + l) * m + mstart; - for (unsigned long k = 0; k < mremain; k++) { - tmp[k * 8 + l] = input[start + k]; - } - } - // Copy data to inner block - for (unsigned long k = 0; k < mremain; k++) { - const unsigned long start = (mstart + k) * n + nstart; - for (unsigned long l = 0; l < nremain; l++) { - output[start + l] = tmp[k * 8 + l]; - } - } - } - } -} -void block_copy(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, unsigned long is, double* PRAGMA_RESTRICT output, unsigned long os, const int trans) { - - ASSUME_ALIGNED(input, 64); - for (unsigned long i = 0; i < n; i++) { - const unsigned long out_shift = i * os; - const unsigned long inp_shift = i * is; - - for (unsigned long j = 0; j < m; j++) { - output[out_shift + j] = input[inp_shift + j]; - } - } -} -void block_matrix_vector(unsigned long n, unsigned long m, const double* vector, const double* PRAGMA_RESTRICT input, unsigned long is, double* PRAGMA_RESTRICT output) { - - ASSUME_ALIGNED(input, 64); - for (unsigned long i = 0; i < n; i++) { - const unsigned long inp_shift = i * is; - const double coef = vector[i]; - - for (unsigned long j = 0; j < m; j++) { - output[j] += coef * input[inp_shift + j]; - } - } -} \ No newline at end of file diff --git a/third_party/gauxc/external/gau2grid/src/CMakeLists.txt b/third_party/gauxc/external/gau2grid/src/CMakeLists.txt deleted file mode 100644 index 2be6f95..0000000 --- a/third_party/gauxc/external/gau2grid/src/CMakeLists.txt +++ /dev/null @@ -1,184 +0,0 @@ -cmake_minimum_required(VERSION 3.1 FATAL_ERROR) - -project(gau2grid - VERSION 2.0.5 - LANGUAGES C) -set(gau2grid_AUTHORS "Daniel G. A. 
Smith") -set(gau2grid_DESCRIPTION "Fast computation of a gaussian and its derivative on a grid") -set(gau2grid_URL "https://github.com/dgasmith/gau2grid") -set(gau2grid_LICENSE "BSD 3-clause") - -list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) - - -############################# Options: Build How? ############################# -include(psi4OptionsTools) -option_with_default(MAX_AM "The maximum gaussian angular momentum to compile" 8) -option_with_default(CMAKE_BUILD_TYPE "Build type (Release or Debug)" Release) -if(CMAKE_CXX_COMPILER_ID MATCHES Intel) -option_with_flags(ENABLE_XHOST "Enables processor-specific optimization (with MSVC, it enables AVX2 instructions)" ON - "-xHost" "-march=native" "/arch:AVX2") -else() -option_with_flags(ENABLE_XHOST "Enables processor-specific optimization (with MSVC, it enables AVX2 instructions)" ON - "-march=native" "-xHost" "/arch:AVX2") -endif() -option_with_default(BUILD_FPIC "Libraries will be compiled with position independent code" ON) -option_with_print(BUILD_SHARED_LIBS "Build final library as shared, not static" ON) -option_with_default(ENABLE_GENERIC "Enables mostly static linking of system libraries for shared library" OFF) -option_with_default(DISABLE_PRAGMA "Disable certain pragma optimizations, appends _GG_NO_PRAGMA to compile flags" OFF ) - -# Warnings -if((${BUILD_SHARED_LIBS}) AND NOT ${BUILD_FPIC}) - message(FATAL_ERROR "BUILD_SHARED_LIBS ON and BUILD_FPIC OFF are incompatible, as shared library requires position independent code") -endif() - -# Install -option_with_default(CMAKE_INSTALL_LIBDIR "Directory to which libraries installed" lib) -option_with_default(PYMOD_INSTALL_LIBDIR "Location within CMAKE_INSTALL_LIBDIR to which python modules are installed - Must start with: / . Used to imitate python install: /python3.6/site-packages ." /) -option_with_print(INSTALL_PYMOD "Additionally installs as independent python module in PYMOD_INSTALL_LIBDIR" OFF) -option_with_default(NATIVE_PYTHON_INSTALL "For INSTALL_PYMOD=ON, install in Python manner to PYTHON_EXECUTABLE's site-packages rather than Linux manner to prefix. Overrides CMAKE_INSTALL_PREFIX, CMAKE_INSTALL_LIBDIR, PYMOD_INSTALL_LIBDIR. Only Py module installed." OFF) -option_with_print(NATIVE_PYTHON_INSTALL_WITH_LIB "Same as NATIVE_PYTHON_INSTALL except installs library, too, _without_ overriding CMAKE_INSTALL_* options." 
OFF) - -######################## Process & Validate Options ########################## -include(autocmake_safeguards) -include(custom_color_messages) -include(custom_static_library) - -if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - set(CMAKE_INSTALL_PREFIX "/usr/local/gau2grid" CACHE PATH "Install path" FORCE) -endif() -message(STATUS "gau2grid install: ${CMAKE_INSTALL_PREFIX}") - -# << Python >> -set(Python_ADDITIONAL_VERSIONS 3.9 3.8 3.7 3.6 3.5) # adjust with CMake minimum FindPythonInterp -find_package(PythonLibsNew 3.6 REQUIRED) -message(STATUS "${Cyan}Found Python ${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}${ColourReset}: ${PYTHON_EXECUTABLE} (found version ${PYTHON_VERSION_STRING})") - - -################################ Main Project ################################ -add_custom_command( - OUTPUT gau2grid/gau2grid.h gau2grid_orbital.c gau2grid_phi.c gau2grid_deriv1.c gau2grid_deriv2.c gau2grid_deriv3.c gau2grid_transform.c gau2grid_helper.c - COMMAND ${PYTHON_EXECUTABLE} -c "import sys; \ - sys.path.append('${PROJECT_SOURCE_DIR}'); \ - import gau2grid as gg; \ - gg.c_gen.generate_c_gau2grid(${MAX_AM}, path='${CMAKE_CURRENT_BINARY_DIR}')" - DEPENDS gau2grid/c_generator.py - gau2grid/c_generator.py - gau2grid/codegen.py - gau2grid/c_pragma.py - gau2grid/c_util_generator.py - gau2grid/c_wrapper.py - gau2grid/docs_generator.py - gau2grid/order.py - gau2grid/python_reference.py - gau2grid/RSH.py - gau2grid/utility.py - VERBATIM) - -set(sources_list ${CMAKE_CURRENT_BINARY_DIR}/gau2grid_phi.c - ${CMAKE_CURRENT_BINARY_DIR}/gau2grid_orbital.c - ${CMAKE_CURRENT_BINARY_DIR}/gau2grid_deriv1.c - ${CMAKE_CURRENT_BINARY_DIR}/gau2grid_deriv2.c - ${CMAKE_CURRENT_BINARY_DIR}/gau2grid_deriv3.c - ${CMAKE_CURRENT_BINARY_DIR}/gau2grid_transform.c - ${CMAKE_CURRENT_BINARY_DIR}/gau2grid_helper.c) - -add_library(gg ${sources_list}) -if ("${CMAKE_C_COMPILER_ID}" STREQUAL "PGI") - set_target_properties(gg PROPERTIES COMPILE_FLAGS "-c11") -else() - set_target_properties(gg PROPERTIES COMPILE_FLAGS "-std=c11") -endif() -set_target_properties(gg PROPERTIES POSITION_INDEPENDENT_CODE ${BUILD_FPIC} - SOVERSION 2) # bump whenever interface has changes or removals - -if( DISABLE_PRAGMA ) - target_compile_definitions( gg PRIVATE $ ) -endif() - -find_package(StandardMathLibraryC) -target_link_libraries(gg PRIVATE ${STANDARD_MATH_LIBRARY}) - -if(${BUILD_SHARED_LIBS}) - target_link_libraries(gg PRIVATE ${LIBC_INTERJECT}) -endif() - - -################################### Install ################################## -include(GNUInstallDirs) -include(CMakePackageConfigHelpers) - -set(PN ${PROJECT_NAME}) - -# Alias to allow for consistent manipulation as a subproject -add_library( ${PN}::gg ALIAS gg ) - -target_include_directories(gg PUBLIC - $ - $) - -# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". 
-set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") -configure_package_config_file(cmake/${PN}Config.cmake.in - "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" - INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) -write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake - VERSION ${${PN}_VERSION} - COMPATIBILITY SameMajorVersion) - -# Install our files -if(${NATIVE_PYTHON_INSTALL_WITH_LIB} OR (NOT(${INSTALL_PYMOD} AND ${NATIVE_PYTHON_INSTALL}))) - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/gau2grid/gau2grid.h - ${CMAKE_CURRENT_BINARY_DIR}/gau2grid/gau2grid_pragma.h - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PN}) - - install(TARGETS gg - EXPORT "${PN}Targets" - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) - - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake - DESTINATION ${CMAKECONFIG_INSTALL_DIR}) - install(EXPORT "${PN}Targets" - NAMESPACE "${PN}::" - DESTINATION ${CMAKECONFIG_INSTALL_DIR}) - export(EXPORT "${PN}Targets" - NAMESPACE "${PN}::" - FILE "${PROJECT_BINARY_DIR}/${PN}Targets.cmake") -endif() - -if(${INSTALL_PYMOD}) - if(${NATIVE_PYTHON_INSTALL}) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "import sys; print(sys.prefix);" - OUTPUT_VARIABLE CMAKE_INSTALL_PREFIX - OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "from distutils import sysconfig as s; import os; import sys; cmake_install_prefix = sys.prefix; prefix_lib = s.get_config_var('LIBDIR'); print(prefix_lib.replace(os.path.commonpath([prefix_lib, cmake_install_prefix]), '').strip('/'));" - OUTPUT_VARIABLE CMAKE_INSTALL_LIBDIR - OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "from distutils import sysconfig as s; import os; prefix_lib = s.get_config_var('LIBDIR'); spdir = s.get_python_lib(plat_specific=True); print(spdir.replace(os.path.commonpath([prefix_lib, spdir]), ''));" - OUTPUT_VARIABLE PYMOD_INSTALL_LIBDIR - OUTPUT_STRIP_TRAILING_WHITESPACE) - endif() - - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "from numpy import distutils; print(distutils.misc_util.get_shared_lib_extension(is_python_ext=False))" - OUTPUT_VARIABLE PYLIB_EXTENSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - - install(DIRECTORY gau2grid - DESTINATION ${CMAKE_INSTALL_LIBDIR}${PYMOD_INSTALL_LIBDIR} - USE_SOURCE_PERMISSIONS - FILES_MATCHING PATTERN "*.py") - - install(FILES $ - DESTINATION ${CMAKE_INSTALL_LIBDIR}${PYMOD_INSTALL_LIBDIR}/gau2grid - RENAME "gg${PYLIB_EXTENSION}") - - install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE - DESTINATION ${CMAKE_INSTALL_LIBDIR}${PYMOD_INSTALL_LIBDIR}/gau2grid) -endif() diff --git a/third_party/gauxc/external/gau2grid/src/LICENSE b/third_party/gauxc/external/gau2grid/src/LICENSE deleted file mode 100644 index 3eba99f..0000000 --- a/third_party/gauxc/external/gau2grid/src/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -BSD 3-Clause License - -Copyright (c) 2017, Daniel Smith -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. 
- -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/gauxc/external/gau2grid/src/MANIFEST.in b/third_party/gauxc/external/gau2grid/src/MANIFEST.in deleted file mode 100644 index d90c871..0000000 --- a/third_party/gauxc/external/gau2grid/src/MANIFEST.in +++ /dev/null @@ -1,9 +0,0 @@ -recursive-include gau2grid *.py - -include setup.py -include README.md -include LICENSE -include MANIFEST.in - -include versioneer.py -include gau2grid/_version.py diff --git a/third_party/gauxc/external/gau2grid/src/README.md b/third_party/gauxc/external/gau2grid/src/README.md deleted file mode 100644 index 756d429..0000000 --- a/third_party/gauxc/external/gau2grid/src/README.md +++ /dev/null @@ -1,95 +0,0 @@ -

- [Badge table removed with this README: Travis CI, Appveyor, Codecov, and Documentation Status badges]

- -# gau2grid -A collocation code for computing gaussians on a grid of the form: -``` -out_Lp = x^l y^m z^n \sum_i coeff_i e^(-exponent_i * (|center - p|)^2) -``` -Where the returned matrix dimensions are the angular momentum (L) by the number of requested points (p). - -```python -import gau2grid -import numpy as np - -# Build coordinates along the Z axis ->>> xyz = np.zeros((3, 5)) ->>> xyz[2] = np.arange(5) - -# Compute an 's' gaussian with a scaling and exponent of one at the origin ->>> ret = gau2grid.collocation(xyz, 0, [1], [1], [0, 0, 0]) ->>> print(ret["PHI"]) -[[ 1.00000e+00 3.67879e-01 1.83156e-02 1.23409e-04 1.12535e-07]] - -# Compute a 'p' gaussian with a scaling and exponent of one at the origin ->>> ret = gau2grid.collocation(xyz, 1, [1], [1], [0, 0, 0], spherical=False) ->>> print(ret["PHI"]) -[[ 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00] - [ 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00] - [ 0.00000e+00 3.67879e-01 3.66312e-02 3.70229e-04 4.50140e-07]] - -# Note that the X and Y components are zero as they are orthogonal to our Z vector. -``` - -The returned matrix can be in either cartesian or regular solid harmonics. There are currently -three algorithms with which to compute these collocation matrices: - Optimized C: An autogenerated C library that optimizes for cache, - vectorization, and sparsity. Fastest, requires compilation, found at - `gau2grid.collocation`. -- Optimized/Generated NumPy: An exploratory tool to - examine the sparsity in the gaussians. No compilation required, found at - `gau2grid.np_gen.collocation`. -- NumPy Reference: A simple NumPy-based loop - code. No compilation required, found at `gau2grid.ref.collocation`. - -See the [documentation](https://gau2grid.readthedocs.io/en/latest/?badge=latest) for more information! - -## Building Gau2Grid -The C library is built with CMake and has no required dependencies other than -the C standard library. A CMake configure and build example can be found below: - -```bash -cmake -H. -Bobjdir -cd objdir; make -j2 -``` - -Several common CMake options are as follows: - - `-DPYTHON_EXECUTABLE` - Path to the desired Python executable - - `-DMAX_AM` - Maximum angular momentum to compile to, default 6 - - `-DCMAKE_INSTALL_PREFIX` - Installation directory - -## Python installation -The gau2grid program (without the optimized C library) can be installed using -the canonical `setup.py` script, -``` -python setup.py install -``` - -# Authors -This code was inspired by a number of folks and quite a few provided excellent advice. - - - Daniel G. A. Smith - Code author - - Rob M. Parrish - Author of the Psi4 section which contains the original equations - - Lori A. Burns - CMake, building, and library linking - - Andy C.
Simmonett - RSH coefficients - - Ben Pritchard - Generator and vectorization recommendations - diff --git a/third_party/gauxc/external/gau2grid/src/appveyor.yml b/third_party/gauxc/external/gau2grid/src/appveyor.yml deleted file mode 100644 index adac81d..0000000 --- a/third_party/gauxc/external/gau2grid/src/appveyor.yml +++ /dev/null @@ -1,34 +0,0 @@ -image: Visual Studio 2017 -clone_depth: 5 - -install: - - call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" - - C:\Miniconda36-x64\Scripts\activate base - - conda install --yes numpy pytest - - conda list - -before_build: - - set SOURCE_FOLDER=%APPVEYOR_BUILD_FOLDER% - - set BUILD_FOLDER=%SOURCE_FOLDER%\build - - set INSTALL_FOLDER=%SOURCE_FOLDER%\install - - mkdir %BUILD_FOLDER% & cd %BUILD_FOLDER% - - cmake -A x64 - -DCMAKE_C_FLAGS="/wd4018 /wd4101 /wd4996" - -DCMAKE_WINDOWS_EXPORT_ALL_SYMBOLS=true - -DCMAKE_INSTALL_PREFIX=%INSTALL_FOLDER% - -DINSTALL_PYMOD=ON - .. - -build_script: - - cmake --build . - -after_build: - - cmake --build . --target install - -before_test: - - cd .. - - set PYTHONPATH=%INSTALL_FOLDER%\lib - -test_script: - - set GAU2GRID_FORCE_C_TEST=1 - - pytest -rws -v %INSTALL_FOLDER% diff --git a/third_party/gauxc/external/gau2grid/src/cmake/FindPythonLibsNew.cmake b/third_party/gauxc/external/gau2grid/src/cmake/FindPythonLibsNew.cmake deleted file mode 100644 index dc44a9d..0000000 --- a/third_party/gauxc/external/gau2grid/src/cmake/FindPythonLibsNew.cmake +++ /dev/null @@ -1,194 +0,0 @@ -# - Find python libraries -# This module finds the libraries corresponding to the Python interpeter -# FindPythonInterp provides. -# This code sets the following variables: -# -# PYTHONLIBS_FOUND - have the Python libs been found -# PYTHON_PREFIX - path to the Python installation -# PYTHON_LIBRARIES - path to the python library -# PYTHON_INCLUDE_DIRS - path to where Python.h is found -# PYTHON_MODULE_EXTENSION - lib extension, e.g. '.so' or '.pyd' -# PYTHON_MODULE_PREFIX - lib name prefix: usually an empty string -# PYTHON_SITE_PACKAGES - path to installation site-packages -# PYTHON_IS_DEBUG - whether the Python interpreter is a debug build -# -# Thanks to talljimbo for the patch adding the 'LDVERSION' config -# variable usage. - -#============================================================================= -# Copyright 2001-2009 Kitware, Inc. -# Copyright 2012 Continuum Analytics, Inc. -# -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# * Neither the names of Kitware, Inc., the Insight Software Consortium, -# nor the names of their contributors may be used to endorse or promote -# products derived from this software without specific prior written -# permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#============================================================================= - -if(PYTHONLIBS_FOUND) - return() -endif() - -# Use the Python interpreter to find the libs. -if(PythonLibsNew_FIND_REQUIRED) - find_package(PythonInterp ${PythonLibsNew_FIND_VERSION} REQUIRED) -else() - find_package(PythonInterp ${PythonLibsNew_FIND_VERSION}) -endif() - -if(NOT PYTHONINTERP_FOUND) - set(PYTHONLIBS_FOUND FALSE) - return() -endif() - -# According to http://stackoverflow.com/questions/646518/python-how-to-detect-debug-interpreter -# testing whether sys has the gettotalrefcount function is a reliable, cross-platform -# way to detect a CPython debug interpreter. -# -# The library suffix is from the config var LDVERSION sometimes, otherwise -# VERSION. VERSION will typically be like "2.7" on unix, and "27" on windows. -execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" - "from distutils import sysconfig as s;import sys;import struct; -print('.'.join(str(v) for v in sys.version_info)); -print(sys.prefix); -print(s.get_python_inc(plat_specific=True)); -print(s.get_python_lib(plat_specific=True)); -print(s.get_config_var('SO')); -print(hasattr(sys, 'gettotalrefcount')+0); -print(struct.calcsize('@P')); -print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); -print(s.get_config_var('LIBDIR') or ''); -print(s.get_config_var('MULTIARCH') or ''); -" - RESULT_VARIABLE _PYTHON_SUCCESS - OUTPUT_VARIABLE _PYTHON_VALUES - ERROR_VARIABLE _PYTHON_ERROR_VALUE) - -if(NOT _PYTHON_SUCCESS MATCHES 0) - if(PythonLibsNew_FIND_REQUIRED) - message(FATAL_ERROR - "Python config failure:\n${_PYTHON_ERROR_VALUE}") - endif() - set(PYTHONLIBS_FOUND FALSE) - return() -endif() - -# Convert the process output into a list -string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) -string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) -list(GET _PYTHON_VALUES 0 _PYTHON_VERSION_LIST) -list(GET _PYTHON_VALUES 1 PYTHON_PREFIX) -list(GET _PYTHON_VALUES 2 PYTHON_INCLUDE_DIR) -list(GET _PYTHON_VALUES 3 PYTHON_SITE_PACKAGES) -list(GET _PYTHON_VALUES 4 PYTHON_MODULE_EXTENSION) -list(GET _PYTHON_VALUES 5 PYTHON_IS_DEBUG) -list(GET _PYTHON_VALUES 6 PYTHON_SIZEOF_VOID_P) -list(GET _PYTHON_VALUES 7 PYTHON_LIBRARY_SUFFIX) -list(GET _PYTHON_VALUES 8 PYTHON_LIBDIR) -list(GET _PYTHON_VALUES 9 PYTHON_MULTIARCH) - -# Make sure the Python has the same pointer-size as the chosen compiler -# Skip if CMAKE_SIZEOF_VOID_P is not defined -if(CMAKE_SIZEOF_VOID_P AND (NOT "${PYTHON_SIZEOF_VOID_P}" STREQUAL "${CMAKE_SIZEOF_VOID_P}")) - if(PythonLibsNew_FIND_REQUIRED) - math(EXPR _PYTHON_BITS "${PYTHON_SIZEOF_VOID_P} * 8") - math(EXPR _CMAKE_BITS "${CMAKE_SIZEOF_VOID_P} * 8") - message(FATAL_ERROR - "Python config failure: Python is ${_PYTHON_BITS}-bit, " - "chosen compiler is ${_CMAKE_BITS}-bit") - endif() - set(PYTHONLIBS_FOUND FALSE) - return() -endif() - -# The built-in FindPython didn't always give the version numbers -string(REGEX REPLACE "\\." 
";" _PYTHON_VERSION_LIST ${_PYTHON_VERSION_LIST}) -list(GET _PYTHON_VERSION_LIST 0 PYTHON_VERSION_MAJOR) -list(GET _PYTHON_VERSION_LIST 1 PYTHON_VERSION_MINOR) -list(GET _PYTHON_VERSION_LIST 2 PYTHON_VERSION_PATCH) - -# Make sure all directory separators are '/' -string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) -string(REGEX REPLACE "\\\\" "/" PYTHON_INCLUDE_DIR ${PYTHON_INCLUDE_DIR}) -string(REGEX REPLACE "\\\\" "/" PYTHON_SITE_PACKAGES ${PYTHON_SITE_PACKAGES}) - -if(CMAKE_HOST_WIN32) - set(PYTHON_LIBRARY - "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") - - # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the - # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. - if(NOT EXISTS "${PYTHON_LIBRARY}") - get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) - set(PYTHON_LIBRARY - "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") - endif() - - # raise an error if the python libs are still not found. - if(NOT EXISTS "${PYTHON_LIBRARY}") - message(FATAL_ERROR "Python libraries not found") - endif() - -else() - if(PYTHON_MULTIARCH) - set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}/${PYTHON_MULTIARCH}" "${PYTHON_LIBDIR}") - else() - set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}") - endif() - #message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}") - # Probably this needs to be more involved. It would be nice if the config - # information the python interpreter itself gave us were more complete. - find_library(PYTHON_LIBRARY - NAMES "python${PYTHON_LIBRARY_SUFFIX}" - PATHS ${_PYTHON_LIBS_SEARCH} - NO_DEFAULT_PATH) - - # If all else fails, just set the name/version and let the linker figure out the path. - if(NOT PYTHON_LIBRARY) - set(PYTHON_LIBRARY python${PYTHON_LIBRARY_SUFFIX}) - endif() -endif() - -MARK_AS_ADVANCED( - PYTHON_LIBRARY - PYTHON_INCLUDE_DIR -) - -# We use PYTHON_INCLUDE_DIR, PYTHON_LIBRARY and PYTHON_DEBUG_LIBRARY for the -# cache entries because they are meant to specify the location of a single -# library. We now set the variables listed by the documentation for this -# module. -SET(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}") -SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") -SET(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}") - -find_package_message(PYTHON - "Found PythonLibs: ${PYTHON_LIBRARY}" - "${PYTHON_EXECUTABLE}${PYTHON_VERSION}") - -set(PYTHONLIBS_FOUND TRUE) diff --git a/third_party/gauxc/external/gau2grid/src/cmake/FindStandardMathLibraryC.cmake b/third_party/gauxc/external/gau2grid/src/cmake/FindStandardMathLibraryC.cmake deleted file mode 100644 index 72310b3..0000000 --- a/third_party/gauxc/external/gau2grid/src/cmake/FindStandardMathLibraryC.cmake +++ /dev/null @@ -1,54 +0,0 @@ -# * downloaded Nov 2016 from https://android.googlesource.com/platform/external/eigen/+/master/cmake/FindStandardMathLibrary.cmake -# * changed CXX to C -# * note that full path to libm *not* detected - -# - Try to find how to link to the standard math library, if anything at all is needed to do. -# On most platforms this is automatic, but for example it's not automatic on QNX. -# -# Once done this will define -# -# STANDARD_MATH_LIBRARY_FOUND - we found how to successfully link to the standard math library -# STANDARD_MATH_LIBRARY - the name of the standard library that one has to link to. -# -- this will be left empty if it's automatic (most platforms). -# -- this will be set to "m" on platforms where one must explicitly -# pass the "-lm" linker flag. 
-# -# Copyright (c) 2010 Benoit Jacob -# Redistribution and use is allowed according to the terms of the 2-clause BSD license. -include(CheckCSourceCompiles) -# a little test program for c++ math functions. -# notice the std:: is required on some platforms such as QNX -set(find_standard_math_library_test_program -"#include -int main() { sin(0.0); log(0.0f); }") -# C++ test program -# "#include -# int main() { std::sin(0.0); std::log(0.0f); }") -# first try compiling/linking the test program without any linker flags -set(CMAKE_REQUIRED_FLAGS "") -set(CMAKE_REQUIRED_LIBRARIES "") -CHECK_C_SOURCE_COMPILES( - "${find_standard_math_library_test_program}" - standard_math_library_linked_to_automatically -) -if(standard_math_library_linked_to_automatically) - # the test program linked successfully without any linker flag. - set(STANDARD_MATH_LIBRARY "") - set(STANDARD_MATH_LIBRARY_FOUND TRUE) -else() - # the test program did not link successfully without any linker flag. - # This is a very uncommon case that so far we only saw on QNX. The next try is the - # standard name 'm' for the standard math library. - set(CMAKE_REQUIRED_LIBRARIES "m") - CHECK_C_SOURCE_COMPILES( - "${find_standard_math_library_test_program}" - standard_math_library_linked_to_as_m) - if(standard_math_library_linked_to_as_m) - # the test program linked successfully when linking to the 'm' library - set(STANDARD_MATH_LIBRARY "m") - set(STANDARD_MATH_LIBRARY_FOUND TRUE) - else() - # the test program still doesn't link successfully - set(STANDARD_MATH_LIBRARY_FOUND FALSE) - endif() -endif() diff --git a/third_party/gauxc/external/gau2grid/src/cmake/autocmake_safeguards.cmake b/third_party/gauxc/external/gau2grid/src/cmake/autocmake_safeguards.cmake deleted file mode 100644 index 7c0a2a9..0000000 --- a/third_party/gauxc/external/gau2grid/src/cmake/autocmake_safeguards.cmake +++ /dev/null @@ -1,26 +0,0 @@ -# Downloaded from -# https://github.com/coderefinery/autocmake/blob/master/modules/safeguards.cmake -# * changed text of in-source message - -#.rst: -# -# Provides safeguards against in-source builds and bad build types. -# -# Variables used:: -# -# PROJECT_SOURCE_DIR -# PROJECT_BINARY_DIR -# CMAKE_BUILD_TYPE - -if(${PROJECT_SOURCE_DIR} STREQUAL ${PROJECT_BINARY_DIR}) - message(FATAL_ERROR "In-source builds not allowed. Please run CMake from top directory and specify a build directory (e.g., cmake -H. -Bbuild).") -endif() - -string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower) -string(TOUPPER "${CMAKE_BUILD_TYPE}" cmake_build_type_toupper) - -if(NOT cmake_build_type_tolower STREQUAL "debug" AND - NOT cmake_build_type_tolower STREQUAL "release" AND - NOT cmake_build_type_tolower STREQUAL "relwithdebinfo") - message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". 
Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).") -endif() diff --git a/third_party/gauxc/external/gau2grid/src/cmake/custom_color_messages.cmake b/third_party/gauxc/external/gau2grid/src/cmake/custom_color_messages.cmake deleted file mode 100644 index 7daf7e6..0000000 --- a/third_party/gauxc/external/gau2grid/src/cmake/custom_color_messages.cmake +++ /dev/null @@ -1,38 +0,0 @@ -# http://stackoverflow.com/a/19578320 - -if(NOT WIN32) - string(ASCII 27 Esc) - set(ColourReset "${Esc}[m") - set(ColourBold "${Esc}[1m") - set(Red "${Esc}[31m") - set(Green "${Esc}[32m") - set(Yellow "${Esc}[33m") - set(Blue "${Esc}[34m") - set(Magenta "${Esc}[35m") - set(Cyan "${Esc}[36m") - set(White "${Esc}[37m") - set(BoldRed "${Esc}[1;31m") - set(BoldGreen "${Esc}[1;32m") - set(BoldYellow "${Esc}[1;33m") - set(BoldBlue "${Esc}[1;34m") - set(BoldMagenta "${Esc}[1;35m") - set(BoldCyan "${Esc}[1;36m") - set(BoldWhite "${Esc}[1;37m") -endif() - -#message("This is normal") -#message("${Red}This is Red${ColourReset}") -#message("${Green}This is Green${ColourReset}") -#message("${Yellow}This is Yellow${ColourReset}") -#message("${Blue}This is Blue${ColourReset}") -#message("${Magenta}This is Magenta${ColourReset}") -#message("${Cyan}This is Cyan${ColourReset}") -#message("${White}This is White${ColourReset}") -#message("${BoldRed}This is BoldRed${ColourReset}") -#message("${BoldGreen}This is BoldGreen${ColourReset}") -#message("${BoldYellow}This is BoldYellow${ColourReset}") -#message("${BoldBlue}This is BoldBlue${ColourReset}") -#message("${BoldMagenta}This is BoldMagenta${ColourReset}") -#message("${BoldCyan}This is BoldCyan${ColourReset}") -#message("${BoldWhite}This is BoldWhite\n\n${ColourReset}") - diff --git a/third_party/gauxc/external/gau2grid/src/cmake/custom_static_library.cmake b/third_party/gauxc/external/gau2grid/src/cmake/custom_static_library.cmake deleted file mode 100644 index d52f98b..0000000 --- a/third_party/gauxc/external/gau2grid/src/cmake/custom_static_library.cmake +++ /dev/null @@ -1,56 +0,0 @@ -# Downloaded from -# https://github.com/PCMSolver/pcmsolver/blob/release/1.Y/cmake/custom/static_library.cmake -# * suppressed STATIC_LIBRARY_ONLY -# * moved option up -# * corrected CXX block matches statements from C --> CXX compiler - -#.rst: -# -# Enables creation of static library. -# If the shared library is created, make it as static as possible. -# -# Variables modified (provided the corresponding language is enabled):: -# -# CMAKE_Fortran_FLAGS -# CMAKE_C_FLAGS -# CMAKE_CXX_FLAGS -# -# autocmake.cfg configuration:: -# -# docopt: --static Create only the static library [default: False]. 
-# define: '-DSTATIC_LIBRARY_ONLY=%s' % arguments['--static'] - -if(ENABLE_GENERIC) - if(DEFINED CMAKE_Fortran_COMPILER_ID) - if(CMAKE_Fortran_COMPILER_ID MATCHES GNU) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -static-libgfortran") - endif() - if(CMAKE_Fortran_COMPILER_ID MATCHES Intel) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -static-libgcc -static-intel") - endif() - endif() - - if(DEFINED CMAKE_C_COMPILER_ID) - if(CMAKE_C_COMPILER_ID MATCHES GNU) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -static-libgcc -fpic") - endif() - if(CMAKE_C_COMPILER_ID MATCHES Intel) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -static-libgcc -static-intel -wd10237") - endif() - if(CMAKE_C_COMPILER_ID MATCHES Clang) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fpic") - endif() - endif() - - if(DEFINED CMAKE_CXX_COMPILER_ID) - if(CMAKE_CXX_COMPILER_ID MATCHES GNU) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++ -static-libgcc") - endif() - if(CMAKE_CXX_COMPILER_ID MATCHES Intel) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--as-needed -static-libstdc++ -static-libgcc -static-intel -wd10237") - endif() - if(CMAKE_CXX_COMPILER_ID MATCHES Clang) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++") - endif() - endif() -endif() diff --git a/third_party/gauxc/external/gau2grid/src/cmake/gau2gridConfig.cmake.in b/third_party/gauxc/external/gau2grid/src/cmake/gau2gridConfig.cmake.in deleted file mode 100644 index fb43424..0000000 --- a/third_party/gauxc/external/gau2grid/src/cmake/gau2gridConfig.cmake.in +++ /dev/null @@ -1,73 +0,0 @@ -# gau2gridConfig.cmake -# -------------------- -# -# GAU2GRID cmake module. -# This module sets the following variables in your project:: -# -# gau2grid_FOUND - true if gau2grid and all required components found on the system -# gau2grid_VERSION - gau2grid version in format Major.Minor.Release -# gau2grid_INCLUDE_DIRS - Directory where gau2grid header is located. -# gau2grid_INCLUDE_DIR - same as DIRS -# gau2grid_LIBRARIES - gau2grid library to link against. -# gau2grid_LIBRARY - same as LIBRARIES -# -# -# Available components: -# -# shared - search for only shared library -# static - search for only static library -# -# -# Exported targets:: -# -# If gau2grid is found, this module defines the following :prop_tgt:`IMPORTED` -# target. Target is shared _or_ static, so, for both, use separate, not -# overlapping, installations. :: -# -# gau2grid::gg - the main gau2grid library with header attached. 
-# -# -# Suggested usage:: -# -# find_package(gau2grid) -# find_package(gau2grid 1.0.1 EXACT CONFIG REQUIRED) -# -# -# The following variables can be set to guide the search for this package:: -# -# gau2grid_DIR - CMake variable, set to directory containing this Config file -# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package -## PATH - environment variable, set to bin directory of this package -# CMAKE_DISABLE_FIND_PACKAGE_gau2grid - CMake variable, disables -# find_package(gau2grid) when not REQUIRED, perhaps to force internal build - -@PACKAGE_INIT@ - -set(PN gau2grid) - -if(@BUILD_SHARED_LIBS@) - set(${PN}_shared_FOUND 1) -else() - set(${PN}_static_FOUND 1) -endif() - -check_required_components(${PN}) - -#----------------------------------------------------------------------------- -# Don't include targets if this file is being picked up by another -# project which has already built this as a subproject -#----------------------------------------------------------------------------- -if(NOT TARGET ${PN}::gg) - include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake") - - get_property(_loc TARGET ${PN}::gg PROPERTY LOCATION) - set(${PN}_LIBRARY ${_loc}) - get_property(_ill TARGET ${PN}::gg PROPERTY INTERFACE_LINK_LIBRARIES) - set(${PN}_LIBRARIES ${_ill}) - - get_property(_id TARGET ${PN}::gg PROPERTY INCLUDE_DIRECTORIES) - set(${PN}_INCLUDE_DIR ${_id}) - get_property(_iid TARGET ${PN}::gg PROPERTY INTERFACE_INCLUDE_DIRECTORIES) - set(${PN}_INCLUDE_DIRS ${_iid}) -endif() - diff --git a/third_party/gauxc/external/gau2grid/src/cmake/psi4OptionsTools.cmake b/third_party/gauxc/external/gau2grid/src/cmake/psi4OptionsTools.cmake deleted file mode 100644 index eb3e58e..0000000 --- a/third_party/gauxc/external/gau2grid/src/cmake/psi4OptionsTools.cmake +++ /dev/null @@ -1,230 +0,0 @@ -###This file contains functions used throughout the Psi4 build. Like source -###code, the build system should be factored and common code extracted out into -###functions/macros. If you find repetitive code throughout the build scripts -###this is the place to add it (make sure you document it too). - -#Macro for printing an option in a consistent manner -# -#Syntax: print_option(