diff --git a/lisa/tools/lspci.py b/lisa/tools/lspci.py index c023b763f4..879b9b745f 100644 --- a/lisa/tools/lspci.py +++ b/lisa/tools/lspci.py @@ -8,14 +8,7 @@ from lisa.executable import Tool from lisa.operating_system import Posix from lisa.tools import Echo -from lisa.util import ( - LisaException, - constants, - find_group_in_lines, - find_groups_in_lines, - find_patterns_in_lines, - get_matched_str, -) +from lisa.util import LisaException, constants, find_patterns_in_lines, get_matched_str # Example output of lspci command - # lspci -m @@ -43,12 +36,16 @@ re.MULTILINE, ) -# With -mnn option, result would be with vendor/device id -# d8:00.0 "Ethernet controller [0200]" "Mellanox Technologies [15b3]" -# "MT27520 Family [ConnectX-3 Pro] [1007]" "Mellanox Technologies [15b3]" -# "Mellanox Technologies ConnectX-3 Pro Stand-up dual-port 40GbE MCX314A-BCCT [0006]" -PATTERN_DEVICE_ID = re.compile(r"\[(?P[^\]]{4})\]") - +# lspci -n +# 19e3:00:00.0 0108: 1414:b111 (rev 01) +# 2b5c:00:00.0 0108: 1414:b111 (rev 01) +# d2e9:00:00.0 0108: 1414:00a9 +# d3f4:00:02.0 0200: 15b3:101a (rev 80) +PATTERN_PCI_DEVICE_ID = re.compile( + r"^(?P[^\s]+)\s+(?P[0-9a-fA-F]{4}):\s+" + r"(?P[0-9a-fA-F]{4}):(?P[0-9a-fA-F]{4})", + re.MULTILINE, +) DEVICE_TYPE_DICT: Dict[str, List[str]] = { constants.DEVICE_TYPE_SRIOV: ["Ethernet controller"], @@ -60,6 +57,51 @@ constants.DEVICE_TYPE_GPU: ["NVIDIA Corporation"], } +DEVICE_ID_DICT: Dict[str, List[str]] = { + constants.DEVICE_TYPE_SRIOV: [ + "1004", # Mellanox Technologies MT27500/MT27520 Family [ConnectX-3/ConnectX-3 Pro Virtual Function] # noqa: E501 + "1016", # Mellanox Technologies MT27710 Family [ConnectX-4 Lx Virtual Function] + "101a", # Mellanox Technologies MT28800 Family [ConnectX-5 Ex Virtual Function] + "101e", # Mellanox Technologies [ConnectX Family mlx5Gen Virtual Function] + ], + constants.DEVICE_TYPE_NVME: [ + "b111" # Microsoft Corporation Device, Local NVMe discs + ], + constants.DEVICE_TYPE_ASAP: [ + "00a9" # Remote discs connected using NVMe disc controller + ], + constants.DEVICE_TYPE_GPU: [ + "1db4", # NVIDIA Corporation GV100GL [Tesla V100 PCIe 16GB] + "1eb8", # NVIDIA Corporation TU104GL [Tesla T4] + "13f2", # NVIDIA Corporation GM204GL [Tesla M60] + "74b5", # Advanced Micro Devices, Inc. [AMD/ATI] + ], +} + +VENDOR_ID_DICT: Dict[str, List[str]] = { + constants.DEVICE_TYPE_SRIOV: [ + "1414", # Microsoft Corporation + "15b3", # Mellanox Technologies + ], + constants.DEVICE_TYPE_NVME: ["1414"], # Microsoft Corporation + constants.DEVICE_TYPE_GPU: ["10de"], # NVIDIA Corporation + constants.DEVICE_TYPE_AMD_GPU: ["1002"], # Advanced Micro Devices, Inc. [AMD/ATI] +} + +CONTROLLER_ID_DICT: Dict[str, List[str]] = { + constants.DEVICE_TYPE_SRIOV: [ + "0200", # Ethernet controller + "0207", # Infiniband controller + ], + constants.DEVICE_TYPE_NVME: [ + "0108", # Non-Volatile memory controller + ], + constants.DEVICE_TYPE_GPU: [ + "0302", # VGA compatible controller + "1200", # Processing accelerators (AMD GPU) + ], +} + # Kernel driver in use: mlx4_core # Kernel driver in use: mlx5_core # Kernel driver in use: mlx4_core\r @@ -78,43 +120,25 @@ def __str__(self) -> str: f"vendor: {self.vendor}, " f"info: {self.device_info}, " f"vendor_id: {self.vendor_id}, " - f"device_id: {self.device_id}" + f"device_id: {self.device_id}, " + f"controller_id: {self.controller_id} " ) def parse(self, raw_str: str) -> None: - matched_pci_device_info_list = find_groups_in_lines( - lines=raw_str, - pattern=PATTERN_PCI_DEVICE, - ) - if matched_pci_device_info_list: - matched_pci_device_info = matched_pci_device_info_list[0] - self.slot = matched_pci_device_info.get("slot", "").strip() + matched_pci_device_info = PATTERN_PCI_DEVICE.match(raw_str) + if matched_pci_device_info: + self.slot = matched_pci_device_info.group("slot") assert self.slot, f"Can not find slot info for: {raw_str}" - - device_class = matched_pci_device_info.get("device_class", "") - assert device_class, f"Can not find device class for: {raw_str}" - self.device_class = PATTERN_DEVICE_ID.sub("", device_class).strip() - - vendor = matched_pci_device_info.get("vendor", "") - assert vendor, f"Can not find vendor info for: {raw_str}" - vendor_id_raw = find_group_in_lines( - lines=vendor, - pattern=PATTERN_DEVICE_ID, - single_line=False, - ) - self.vendor_id = vendor_id_raw.get("id", "") - assert self.vendor_id, f"cannot find vendor id from {raw_str}" - self.vendor = PATTERN_DEVICE_ID.sub("", vendor).strip() - - self.device_info = matched_pci_device_info.get("device", "") + self.device_class = matched_pci_device_info.group("device_class") + assert self.device_class, f"Can not find device class for: {raw_str}" + self.vendor = matched_pci_device_info.group("vendor") + assert self.vendor, f"Can not find vendor info for: {raw_str}" + self.device_info = matched_pci_device_info.group("device") assert self.device_info, f"Can not find device info for: {raw_str}" - device_id_raw = find_group_in_lines( - lines=self.device_info, - pattern=PATTERN_DEVICE_ID, - single_line=False, - ) - self.device_id = device_id_raw.get("id", "") - assert self.device_id, f"cannot find device id from {raw_str}" + # Initialize the device_id, vendor_id and controller_id to None + self.vendor_id = "" + self.device_id = "" + self.controller_id = "" else: raise LisaException("cannot find any matched pci devices") @@ -146,9 +170,12 @@ def get_device_names_by_type( ) -> List[str]: if device_type.upper() not in DEVICE_TYPE_DICT.keys(): raise LisaException(f"pci_type '{device_type}' is not recognized.") - class_names = DEVICE_TYPE_DICT[device_type.upper()] devices_list = self.get_devices(force_run) - devices_slots = [x.slot for x in devices_list if x.device_class in class_names] + devices_slots = [] + + for device in devices_list: + if device.device_id in DEVICE_ID_DICT[device_type.upper()]: + devices_slots.append(device.slot) return devices_slots def get_devices_by_type( @@ -158,18 +185,52 @@ def get_devices_by_type( raise LisaException( f"pci_type '{device_type}' is not supported to be searched." ) - class_names = DEVICE_TYPE_DICT[device_type.upper()] devices_list = self.get_devices(force_run) - device_type_list = [x for x in devices_list if x.device_class in class_names] + device_type_list = [] + for device in devices_list: + if device.device_id in DEVICE_ID_DICT[device_type.upper()]: + device_type_list.append(device) + return device_type_list + @retry(KeyError, tries=10, delay=10) def get_devices(self, force_run: bool = False) -> List[PciDevice]: if (not self._pci_devices) or force_run: self._pci_devices = [] + self._pci_ids = {} # Ensure pci device ids and name mappings are updated. self.node.execute("update-pciids", sudo=True, shell=True) + + # Fetching the id information using 'lspci -nnm' is not reliable + # due to inconsistencies in device id patterns. + # Example output of 'lspci -nnm': + # d2e9:00:00.0 "Non-Volatile memory controller [0108]" "Microsoft Corporation [1414]" "Device [00a9]" -p02 "Microsoft Corporation [1414]" "Device [0000]" # noqa: E501 + # d3f4:00:02.0 "Ethernet controller [0200]" "Mellanox Technologies [15b3]" "MT28800 Family [ConnectX-5 Ex Virtual Function] [101a]" -r80 "Mellanox Technologies [15b3]" "MT28800 Family [ConnectX-5 Ex Virtual Function] [0127]" # noqa: E501 + # Sample 'lspci -n' output for above devices: + # d2e9:00:00.0 0108: 1414:00a9 + # d3f4:00:02.0 0200: 15b3:101a (rev 80) + # Fetch pci ids using 'lspci -n': result = self.run( - "-Dmnn", + "-n", + force_run=force_run, + shell=True, + expected_exit_code=0, + sudo=True, + ) + for pci_raw in result.stdout.splitlines(): + pci_device_id_info = {} + matched_pci_device_info = PATTERN_PCI_DEVICE_ID.match(pci_raw) + if matched_pci_device_info: + pci_device_id_info[matched_pci_device_info.group("slot")] = { + "device_id": matched_pci_device_info.group("device_id"), + "vendor_id": matched_pci_device_info.group("vendor_id"), + "controller_id": matched_pci_device_info.group("controller_id"), + } + self._pci_ids.update(pci_device_id_info) + + # Fetching the device information using 'lspci -m': + result = self.run( + "-m", force_run=force_run, shell=True, expected_exit_code=0, @@ -179,6 +240,21 @@ def get_devices(self, force_run: bool = False) -> List[PciDevice]: pci_device = PciDevice(pci_raw) self._pci_devices.append(pci_device) + for i in range(len(self._pci_devices)): + pci_slot_id = self._pci_devices[i].slot + # Sometimes the list of devices is not same from above 'lspci -n' output + # and 'lspci -m' outputs. + # It usually happens when the VM is just finished booting and not + # all PCI devices are detected. For example SRIOV devices. + # In such cases we need to retry after a short delay. + if pci_slot_id not in self._pci_ids: + raise KeyError(f"cannot find device id from {pci_slot_id}") + self._pci_devices[i].device_id = self._pci_ids[pci_slot_id]["device_id"] + self._pci_devices[i].vendor_id = self._pci_ids[pci_slot_id]["vendor_id"] + self._pci_devices[i].controller_id = self._pci_ids[pci_slot_id][ + "controller_id" + ] + return self._pci_devices def disable_devices_by_type(self, device_type: str) -> int: diff --git a/lisa/util/constants.py b/lisa/util/constants.py index 0ff2349cfa..771c5317eb 100644 --- a/lisa/util/constants.py +++ b/lisa/util/constants.py @@ -159,6 +159,8 @@ DEVICE_TYPE_SRIOV = "SRIOV" DEVICE_TYPE_NVME = "NVME" DEVICE_TYPE_GPU = "GPU" +DEVICE_TYPE_AMD_GPU = "AMD_GPU" +DEVICE_TYPE_ASAP = "ASAP" DISK_PERFORMANCE_TOOL_FIO = "fio" NETWORK_PERFORMANCE_TOOL_NTTTCP = "ntttcp"