Merge pull request #2051 from devitocodes/add_icx_support

FabioLuporini · web-flow · commit 06786270675b · 2023-06-06T07:43:41.000+02:00
compiler: Add ICX support
diff --git a/.github/workflows/docker-bases.yml b/.github/workflows/docker-bases.yml
@@ -66,7 +66,7 @@ jobs:
             dockerfile: './docker/Dockerfile.cpu'
             runner: ubuntu-latest 
           
-          - tag: 'devitocodes/bases:cpu-icc'
+          - tag: 'devitocodes/bases:cpu-icc, devitocodes/bases:cpu-icx'
             arch: 'arch=icc'
             version: ''
             dockerfile: './docker/Dockerfile.cpu'
diff --git a/.github/workflows/pytest-core-nompi.yml b/.github/workflows/pytest-core-nompi.yml
@@ -38,7 +38,8 @@ jobs:
            pytest-ubuntu-py39-gcc9-omp,
            pytest-osx-py37-clang-omp,
            pytest-docker-py37-gcc-omp,
-           pytest-docker-py37-icc-omp
+           pytest-docker-py37-icc-omp,
+           pytest-docker-py38-icx-omp
         ]
         set: [base, adjoint]
         include:
@@ -105,6 +106,13 @@ jobs:
           language: "openmp"
           sympy: "1.11"
 
+        - name: pytest-docker-py38-icx-omp
+          python-version: '3.8'
+          os: ubuntu-22.04
+          arch: "icx"
+          language: "openmp"
+          sympy: "1.11"
+
         - set: base
           test-set: 'not adjoint'
 
@@ -133,13 +141,13 @@ jobs:
     - name: Set run prefix
       run: |
           if [[ "${{ matrix.name }}" =~ "docker" ]]; then
-              echo "RUN_CMD=docker run --rm -e CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }} --name testrun devito_img"  >> $GITHUB_ENV
+              echo "RUN_CMD=docker run --rm -e CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }} -e DEVITO_ARCH=${{ matrix.arch }} --name testrun devito_img"  >> $GITHUB_ENV
           else
               echo "RUN_CMD=" >> $GITHUB_ENV
           fi
       id: set-run
 
-    - name: Install GCC ${{ matrix.arch }}
+    - name: Install ${{ matrix.arch }} compiler
       if: "runner.os == 'linux' && !contains(matrix.name, 'docker')"
       run : |
         sudo apt-get install -y ${{ matrix.arch }}
diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
@@ -16,12 +16,17 @@
 
 __all__ = ['platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_nvidia_cc',
            'get_cuda_path', 'get_hip_path', 'check_cuda_runtime', 'get_m1_llvm_path',
-           'Platform', 'Cpu64', 'Intel64', 'Amd', 'Arm', 'Power', 'Device',
-           'NvidiaDevice', 'AmdDevice', 'IntelDevice',
-           'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'SKX', 'KNL', 'KNL7210',  # Intel
-           'AMD', 'ARM', 'M1', 'GRAVITON',  # ARM
-           'POWER8', 'POWER9',  # Other loosely supported CPU architectures
-           'AMDGPUX', 'NVIDIAX', 'INTELGPUX']  # GPUs
+           'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power',
+           'Device', 'NvidiaDevice', 'AmdDevice', 'IntelDevice',
+           # Intel
+           'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
+           'SKX', 'KLX', 'CLX', 'CLK',
+           # AMD and ARM
+           'AMD', 'ARM', 'M1', 'GRAVITON',
+           # Other loosely supported CPU architectures
+           'POWER8', 'POWER9',
+           # GPUs
+           'AMDGPUX', 'NVIDIAX', 'INTELGPUX']
 
 
 @memoized_func
@@ -494,7 +499,7 @@ def get_platform():
             if 'phi' in brand:
                 # Intel Xeon Phi?
                 return platform_registry['knl']
-            # Unknown Xeon ? May happen on some virtualizes systems...
+            # Unknown Xeon ? May happen on some virtualized systems...
             return platform_registry['intel64']
         elif 'intel' in brand:
             # Most likely a desktop i3/i5/i7
@@ -607,6 +612,14 @@ class Intel64(Cpu64):
     known_isas = ('cpp', 'sse', 'avx', 'avx2', 'avx512')
 
 
+class IntelSkylake(Intel64):
+    """Marker `Intel64` subclass used to recognise Skylake-family platforms."""
+
+
+class IntelGoldenCode(Intel64):
+    """Marker `Intel64` subclass; presumably "Golden Cove" -- TODO confirm name."""
+
+
 class Arm(Cpu64):
 
     known_isas = ('fp', 'asimd', 'asimdrdm')
@@ -725,11 +738,12 @@ def march(cls):
 IVB = Intel64('ivb')
 HSW = Intel64('hsw')
 BDW = Intel64('bdw', isa='avx2')
-SKX = Intel64('skx')
-KLX = Intel64('klx')
-CLX = Intel64('clx')
 KNL = Intel64('knl')
 KNL7210 = Intel64('knl', cores_logical=256, cores_physical=64, isa='avx512')
+SKX = IntelSkylake('skx')
+KLX = IntelSkylake('klx')
+CLX = IntelSkylake('clx')
+CLK = IntelSkylake('clk')
 
 ARM = Arm('arm')
 GRAVITON = Arm('graviton')
@@ -756,6 +770,7 @@ def march(cls):
     'skx': SKX,  # Skylake
     'klx': KLX,  # Kaby Lake
     'clx': CLX,  # Coffee Lake
+    'clk': CLK,  # Cascade Lake
     'knl': KNL,
     'knl7210': KNL7210,
     'arm': ARM,  # Generic ARM CPU
diff --git a/devito/arch/compiler.py b/devito/arch/compiler.py
@@ -12,8 +12,9 @@
 from codepy.jit import compile_from_string
 from codepy.toolchain import GCCToolchain
 
-from devito.arch import (AMDGPUX, Cpu64, M1, NVIDIAX, SKX, POWER8, POWER9, GRAVITON,
-                         get_nvidia_cc, check_cuda_runtime, get_m1_llvm_path)
+from devito.arch import (AMDGPUX, Cpu64, M1, NVIDIAX, POWER8, POWER9, GRAVITON,
+                         INTELGPUX, IntelSkylake, get_nvidia_cc, check_cuda_runtime,
+                         get_m1_llvm_path)
 from devito.exceptions import CompilationError
 from devito.logger import debug, warning, error
 from devito.parameters import configuration
@@ -375,13 +376,22 @@ class GNUCompiler(Compiler):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        self.cflags += ['-march=native', '-Wno-unused-result', '-Wno-unused-variable',
-                        '-Wno-unused-but-set-variable']
+        platform = kwargs.pop('platform', configuration['platform'])
+
+        self.cflags += ['-march=native', '-Wno-unused-result',
+                        '-Wno-unused-variable', '-Wno-unused-but-set-variable']
+
         if configuration['safe-math']:
             self.cflags.append('-fno-unsafe-math-optimizations')
         else:
             self.cflags.append('-ffast-math')
 
+        if isinstance(platform, IntelSkylake):
+            # The default is `=256` because avx512 slows down the CPU frequency;
+            # however, we empirically found that stencils generally benefit
+            # from `=512`
+            self.cflags.append('-mprefer-vector-width=512')
+
         language = kwargs.pop('language', configuration['language'])
         try:
             if self.version >= Version("4.9.0"):
@@ -414,7 +424,7 @@ def __init__(self, *args, **kwargs):
 class ClangCompiler(Compiler):
 
     def __init__(self, *args, **kwargs):
-        super(ClangCompiler, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
 
         self.cflags += ['-Wno-unused-result', '-Wno-unused-variable']
         if not configuration['safe-math']:
@@ -481,7 +491,7 @@ class AOMPCompiler(Compiler):
     """AMD's fork of Clang for OpenMP offloading on both AMD and NVidia cards."""
 
     def __init__(self, *args, **kwargs):
-        super(AOMPCompiler, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
 
         self.cflags += ['-Wno-unused-result', '-Wno-unused-variable']
         if not configuration['safe-math']:
@@ -531,7 +541,7 @@ def __lookup_cmds__(self):
 class PGICompiler(Compiler):
 
     def __init__(self, *args, **kwargs):
-        super(PGICompiler, self).__init__(*args, cpp=True, **kwargs)
+        super().__init__(*args, cpp=True, **kwargs)
 
         self.cflags.remove('-std=c99')
         self.cflags.remove('-O3')
@@ -671,39 +681,30 @@ def __lookup_cmds__(self):
 class IntelCompiler(Compiler):
 
     def __init__(self, *args, **kwargs):
-        super(IntelCompiler, self).__init__(*args, **kwargs)
-
-        self.cflags.append("-xhost")
+        super().__init__(*args, **kwargs)
 
-        language = kwargs.pop('language', configuration['language'])
         platform = kwargs.pop('platform', configuration['platform'])
+        language = kwargs.pop('language', configuration['language'])
+        self.cflags.append("-xHost")
 
         if configuration['safe-math']:
             self.cflags.append("-fp-model=strict")
         else:
-            self.cflags.append('-fast')
+            self.cflags.append('-fp-model=fast')
 
-        if platform is SKX:
+        if isinstance(platform, IntelSkylake):
             # Systematically use 512-bit vectors on skylake
             self.cflags.append("-qopt-zmm-usage=high")
 
-        try:
-            if self.version >= Version("15.0.0"):
-                # Append the OpenMP flag regardless of configuration['language'],
-                # since icc15 and later versions implement OpenMP 4.0, hence
-                # they support `#pragma omp simd`
-                self.ldflags.append('-qopenmp')
-        except (TypeError, ValueError):
-            if language == 'openmp':
-                # Note: fopenmp, not qopenmp, is what is needed by icc versions < 15.0
-                self.ldflags.append('-fopenmp')
+        if language == 'openmp':
+            self.ldflags.append('-qopenmp')
 
         # Make sure the MPI compiler uses `icc` underneath -- whatever the MPI distro is
         if kwargs.get('mpi'):
-            ver = check_output([self.MPICC, "--version"]).decode("utf-8")
-            if not ver.startswith("icc"):
-                warning("The MPI compiler `%s` doesn't use the Intel "
-                        "C/C++ compiler underneath" % self.MPICC)
+            mpi_distro = sniff_mpi_distro('mpiexec')
+            if mpi_distro != 'IntelMPI':
+                warning("Expected Intel MPI distribution with `%s`, but found `%s`"
+                        % (self.__class__.__name__, mpi_distro))
 
     def __lookup_cmds__(self):
         self.CC = 'icc'
@@ -727,16 +728,55 @@ def __lookup_cmds__(self):
 class IntelKNLCompiler(IntelCompiler):
 
     def __init__(self, *args, **kwargs):
-        super(IntelKNLCompiler, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
 
-        self.cflags += ["-xMIC-AVX512"]
+        self.cflags.append('-xMIC-AVX512')
 
         language = kwargs.pop('language', configuration['language'])
 
         if language != 'openmp':
             warning("Running on Intel KNL without OpenMP is highly discouraged")
 
 
+class OneapiCompiler(IntelCompiler):
+    """Intel oneAPI compilers (`icx`/`icpx`), layered on `IntelCompiler`."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        platform = kwargs.pop('platform', configuration['platform'])
+        language = kwargs.pop('language', configuration['language'])
+
+        if language == 'openmp':
+            # The parent class appended `-qopenmp`; replace it with `-fopenmp`
+            self.ldflags.remove('-qopenmp')
+            self.ldflags.append('-fopenmp')
+            # Offloading targets are OpenMP flags, so set them only with OpenMP
+            if platform is NVIDIAX:
+                self.cflags.append('-fopenmp-targets=nvptx64-cuda')
+            elif platform is INTELGPUX:
+                self.cflags.append('-fopenmp-targets=spir64')
+                self.cflags.append('-fopenmp-target-simd')
+
+        if language == 'sycl':
+            self.cflags.append('-fsycl')
+            if platform is NVIDIAX:
+                self.cflags.append('-fsycl-targets=nvptx64-cuda')
+            else:
+                self.cflags.append('-fsycl-targets=spir64')
+
+        if platform is INTELGPUX:
+            self.cflags.remove('-g')  # -g disables some optimizations in IGC
+            self.cflags.append('-gline-tables-only')
+            self.cflags.append('-fdebug-info-for-profiling')
+
+    def __lookup_cmds__(self):
+        # OneAPI HPC ToolKit comes with icx (clang) and icpx (clang++)
+        self.CC = 'icx'
+        self.CXX = 'icpx'
+        self.MPICC = 'mpicc'
+        self.MPICXX = 'mpicxx'
+
+
 class CustomCompiler(Compiler):
 
     """
@@ -800,9 +840,11 @@ def __lookup_cmds__(self):
     'nvidia': NvidiaCompiler,
     'cuda': CudaCompiler,
     'osx': ClangCompiler,
-    'intel': IntelCompiler,
-    'icpc': IntelCompiler,
+    'intel': OneapiCompiler,
+    'icx': OneapiCompiler,
+    'icpx': OneapiCompiler,
     'icc': IntelCompiler,
+    'icpc': IntelCompiler,
     'intel-knl': IntelKNLCompiler,
     'knl': IntelKNLCompiler,
     'dpcpp': DPCPPCompiler,
diff --git a/devito/parameters.py b/devito/parameters.py
@@ -235,8 +235,11 @@ class switchconfig(object):
     Decorator to temporarily change `configuration` parameters.
     """
 
-    def __init__(self, **params):
-        self.params = {k.replace('_', '-'): v for k, v in params.items()}
+    def __init__(self, condition=True, **params):
+        """Record the overrides to apply; none are kept if `condition` is falsy."""
+        # Underscores become dashes, e.g. `safe_math` -> `safe-math`
+        chosen = params if condition else {}
+        self.params = {name.replace('_', '-'): val for name, val in chosen.items()}
 
     def __call__(self, func, *args, **kwargs):
         @wraps(func)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -9,7 +9,8 @@
 from devito.checkpointing import NoopRevolver
 from devito.finite_differences.differentiable import EvalDerivative
 from devito.arch import Cpu64, Device, sniff_mpi_distro, Arm
-from devito.arch.compiler import compiler_registry, IntelCompiler, NvidiaCompiler
+from devito.arch.compiler import (compiler_registry, IntelCompiler, OneapiCompiler,
+                                  NvidiaCompiler)
 from devito.ir.iet import (FindNodes, FindSymbols, Iteration, ParallelBlock,
                            retrieve_iteration_tree)
 from devito.tools import as_tuple
@@ -26,7 +27,8 @@ def skipif(items, whole_module=False):
     # Sanity check
     accepted = set()
     accepted.update({'device', 'device-C', 'device-openmp', 'device-openacc',
-                     'device-aomp', 'cpu64-icc', 'cpu64-nvc', 'cpu64-arm', 'chkpnt'})
+                     'device-aomp', 'cpu64-icc', 'cpu64-icx', 'cpu64-nvc', 'cpu64-arm',
+                     'cpu64-icpx', 'chkpnt'})
     accepted.update({'nompi', 'nodevice'})
     unknown = sorted(set(items) - accepted)
     if unknown:
@@ -70,6 +72,12 @@ def skipif(items, whole_module=False):
            isinstance(configuration['platform'], Cpu64):
             skipit = "`icc+cpu64` won't work with this test"
             break
+        # Skip if it won't run with OneapiCompiler
+        if i == 'cpu64-icx' and \
+           isinstance(configuration['compiler'], OneapiCompiler) and \
+           isinstance(configuration['platform'], Cpu64):
+            skipit = "`icx+cpu64` won't work with this test"
+            break
         # Skip if it won't run on Arm
         if i == 'cpu64-arm' and isinstance(configuration['platform'], Arm):
             skipit = "Arm doesn't support x86-specific instructions"
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
@@ -4,9 +4,11 @@
 
 from benchmarks.user.benchmark import run
 from devito import configuration, switchconfig
+from conftest import skipif
 from subprocess import check_call
 
 
+@skipif('cpu64-icx')
 @pytest.mark.parametrize('mode, problem, op', [
     ('run', 'acoustic', 'forward'), ('run', 'acoustic', 'adjoint'),
     ('run', 'acoustic', 'jacobian'), ('run', 'acoustic', 'jacobian_adjoint'),
diff --git a/tests/test_buffering.py b/tests/test_buffering.py
@@ -701,7 +701,6 @@ def test_everything():
     assert np.all(u.data == u1.data)
 
 
-@skipif('cpu64-icc')
 @pytest.mark.parametrize('subdomain', ['domain', 'interior'])
 def test_stencil_issue_1915(subdomain):
     nt = 5
diff --git a/tests/test_dimension.py b/tests/test_dimension.py
@@ -9,7 +9,8 @@
                     SparseFunction, SparseTimeFunction, Eq, Operator, Constant,
                     Dimension, DefaultDimension, SubDimension, switchconfig,
                     SubDomain, Lt, Le, Gt, Ge, Ne, Buffer, sin, SpaceDimension,
-                    CustomDimension, dimensions)
+                    CustomDimension, dimensions, configuration)
+from devito.arch.compiler import IntelCompiler, OneapiCompiler
 from devito.ir.iet import (Conditional, Expression, Iteration, FindNodes,
                            FindSymbols, retrieve_iteration_tree)
 from devito.symbolics import indexify, retrieve_functions, IntDiv
@@ -1382,6 +1383,8 @@ def test_affiness(self):
         iterations = [i for i in FindNodes(Iteration).visit(op) if i.dim is not time]
         assert all(i.is_Affine for i in iterations)
 
+    @switchconfig(condition=isinstance(configuration['compiler'],
+                  (IntelCompiler, OneapiCompiler)), safe_math=True)
     def test_sparse_time_function(self):
         nt = 20