Skip to content

Commit

Permalink
lspci changes
Browse files Browse the repository at this point in the history
  • Loading branch information
SRIKKANTH committed Oct 23, 2024
1 parent fc6c474 commit 1264464
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 51 deletions.
178 changes: 127 additions & 51 deletions lisa/tools/lspci.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,7 @@
from lisa.executable import Tool
from lisa.operating_system import Posix
from lisa.tools import Echo
from lisa.util import (
LisaException,
constants,
find_group_in_lines,
find_groups_in_lines,
find_patterns_in_lines,
get_matched_str,
)
from lisa.util import LisaException, constants, find_patterns_in_lines, get_matched_str

# Example output of lspci command -
# lspci -m
Expand Down Expand Up @@ -43,12 +36,16 @@
re.MULTILINE,
)

# With -mnn option, result would be with vendor/device id
# d8:00.0 "Ethernet controller [0200]" "Mellanox Technologies [15b3]"
# "MT27520 Family [ConnectX-3 Pro] [1007]" "Mellanox Technologies [15b3]"
# "Mellanox Technologies ConnectX-3 Pro Stand-up dual-port 40GbE MCX314A-BCCT [0006]"
PATTERN_DEVICE_ID = re.compile(r"\[(?P<id>[^\]]{4})\]")

# Matches one line of 'lspci -n' output and captures four named groups:
#   slot          - everything up to the first whitespace (the PCI address)
#   controller_id - the 4 hex digits followed by ':' (the device class id)
#   vendor_id     - the 4 hex digits before the ':' of the id pair
#   device_id     - the 4 hex digits after the ':' of the id pair
# A trailing revision such as "(rev 01)" is optional and is not captured.
# Example output of 'lspci -n':
# 19e3:00:00.0 0108: 1414:b111 (rev 01)
# 2b5c:00:00.0 0108: 1414:b111 (rev 01)
# d2e9:00:00.0 0108: 1414:00a9
# d3f4:00:02.0 0200: 15b3:101a (rev 80)
PATTERN_PCI_DEVICE_ID = re.compile(
r"^(?P<slot>[^\s]+)\s+(?P<controller_id>[0-9a-fA-F]{4}):\s+"
r"(?P<vendor_id>[0-9a-fA-F]{4}):(?P<device_id>[0-9a-fA-F]{4})",
re.MULTILINE,
)

DEVICE_TYPE_DICT: Dict[str, List[str]] = {
constants.DEVICE_TYPE_SRIOV: ["Ethernet controller"],
Expand All @@ -60,6 +57,51 @@
constants.DEVICE_TYPE_GPU: ["NVIDIA Corporation"],
}

# PCI device ids (lower-case hex, as printed by 'lspci -n') that identify each
# device type. A device is treated as being of a given type when its device id
# appears in the list stored under that type's key.
DEVICE_ID_DICT: Dict[str, List[str]] = {
constants.DEVICE_TYPE_SRIOV: [
"1004", # Mellanox Technologies MT27500/MT27520 Family [ConnectX-3/ConnectX-3 Pro Virtual Function] # noqa: E501
"1016", # Mellanox Technologies MT27710 Family [ConnectX-4 Lx Virtual Function]
"101a", # Mellanox Technologies MT28800 Family [ConnectX-5 Ex Virtual Function]
"101e", # Mellanox Technologies [ConnectX Family mlx5Gen Virtual Function]
],
constants.DEVICE_TYPE_NVME: [
"b111" # Microsoft Corporation Device, local NVMe disks
],
constants.DEVICE_TYPE_ASAP: [
"00a9" # Remote disks connected using the NVMe disk controller
],
constants.DEVICE_TYPE_GPU: [
"1db4", # NVIDIA Corporation GV100GL [Tesla V100 PCIe 16GB]
"1eb8", # NVIDIA Corporation TU104GL [Tesla T4]
"13f2", # NVIDIA Corporation GM204GL [Tesla M60]
# NOTE(review): this is an AMD device id listed under the generic GPU type,
# while VENDOR_ID_DICT keys AMD under DEVICE_TYPE_AMD_GPU - confirm intended.
"74b5", # Advanced Micro Devices, Inc. [AMD/ATI]
],
}

# PCI vendor ids (lower-case hex, as printed by 'lspci -n') associated with
# each device type.
# NOTE(review): no code visible in this change reads this mapping; presumably
# it is meant for vendor-based filtering - confirm against callers.
VENDOR_ID_DICT: Dict[str, List[str]] = {
constants.DEVICE_TYPE_SRIOV: [
"1414", # Microsoft Corporation
"15b3", # Mellanox Technologies
],
constants.DEVICE_TYPE_NVME: ["1414"], # Microsoft Corporation
constants.DEVICE_TYPE_GPU: ["10de"], # NVIDIA Corporation
constants.DEVICE_TYPE_AMD_GPU: ["1002"], # Advanced Micro Devices, Inc. [AMD/ATI]
}

# PCI class ids (the 4 hex digit field printed before the vendor:device pair
# by 'lspci -n') associated with each device type.
# NOTE(review): no code visible in this change reads this mapping - confirm
# against callers.
CONTROLLER_ID_DICT: Dict[str, List[str]] = {
constants.DEVICE_TYPE_SRIOV: [
"0200", # Ethernet controller
"0207", # Infiniband controller
],
constants.DEVICE_TYPE_NVME: [
"0108", # Non-Volatile memory controller
],
constants.DEVICE_TYPE_GPU: [
"0302", # 3D controller (VGA compatible controller would be 0300)
"1200", # Processing accelerators (AMD GPU)
],
}

# Kernel driver in use: mlx4_core
# Kernel driver in use: mlx5_core
# Kernel driver in use: mlx4_core\r
Expand All @@ -78,43 +120,25 @@ def __str__(self) -> str:
f"vendor: {self.vendor}, "
f"info: {self.device_info}, "
f"vendor_id: {self.vendor_id}, "
f"device_id: {self.device_id}"
f"device_id: {self.device_id}, "
f"controller_id: {self.controller_id} "
)

def parse(self, raw_str: str) -> None:
matched_pci_device_info_list = find_groups_in_lines(
lines=raw_str,
pattern=PATTERN_PCI_DEVICE,
)
if matched_pci_device_info_list:
matched_pci_device_info = matched_pci_device_info_list[0]
self.slot = matched_pci_device_info.get("slot", "").strip()
matched_pci_device_info = PATTERN_PCI_DEVICE.match(raw_str)
if matched_pci_device_info:
self.slot = matched_pci_device_info.group("slot")
assert self.slot, f"Can not find slot info for: {raw_str}"

device_class = matched_pci_device_info.get("device_class", "")
assert device_class, f"Can not find device class for: {raw_str}"
self.device_class = PATTERN_DEVICE_ID.sub("", device_class).strip()

vendor = matched_pci_device_info.get("vendor", "")
assert vendor, f"Can not find vendor info for: {raw_str}"
vendor_id_raw = find_group_in_lines(
lines=vendor,
pattern=PATTERN_DEVICE_ID,
single_line=False,
)
self.vendor_id = vendor_id_raw.get("id", "")
assert self.vendor_id, f"cannot find vendor id from {raw_str}"
self.vendor = PATTERN_DEVICE_ID.sub("", vendor).strip()

self.device_info = matched_pci_device_info.get("device", "")
self.device_class = matched_pci_device_info.group("device_class")
assert self.device_class, f"Can not find device class for: {raw_str}"
self.vendor = matched_pci_device_info.group("vendor")
assert self.vendor, f"Can not find vendor info for: {raw_str}"
self.device_info = matched_pci_device_info.group("device")
assert self.device_info, f"Can not find device info for: {raw_str}"
device_id_raw = find_group_in_lines(
lines=self.device_info,
pattern=PATTERN_DEVICE_ID,
single_line=False,
)
self.device_id = device_id_raw.get("id", "")
assert self.device_id, f"cannot find device id from {raw_str}"
# Initialize device_id, vendor_id and controller_id to empty strings;
# get_devices fills them in later from the 'lspci -n' output.
self.vendor_id = ""
self.device_id = ""
self.controller_id = ""
else:
raise LisaException("cannot find any matched pci devices")

Expand Down Expand Up @@ -146,9 +170,12 @@ def get_device_names_by_type(
) -> List[str]:
if device_type.upper() not in DEVICE_TYPE_DICT.keys():
raise LisaException(f"pci_type '{device_type}' is not recognized.")
class_names = DEVICE_TYPE_DICT[device_type.upper()]
devices_list = self.get_devices(force_run)
devices_slots = [x.slot for x in devices_list if x.device_class in class_names]
devices_slots = []

for device in devices_list:
if device.device_id in DEVICE_ID_DICT[device_type.upper()]:
devices_slots.append(device.slot)
return devices_slots

def get_devices_by_type(
Expand All @@ -158,18 +185,52 @@ def get_devices_by_type(
raise LisaException(
f"pci_type '{device_type}' is not supported to be searched."
)
class_names = DEVICE_TYPE_DICT[device_type.upper()]
devices_list = self.get_devices(force_run)
device_type_list = [x for x in devices_list if x.device_class in class_names]
device_type_list = []
for device in devices_list:
if device.device_id in DEVICE_ID_DICT[device_type.upper()]:
device_type_list.append(device)

return device_type_list

@retry(KeyError, tries=10, delay=10)
def get_devices(self, force_run: bool = False) -> List[PciDevice]:
if (not self._pci_devices) or force_run:
self._pci_devices = []
self._pci_ids = {}
# Ensure pci device ids and name mappings are updated.
self.node.execute("update-pciids", sudo=True, shell=True)

# Fetching the id information using 'lspci -nnm' is not reliable
# due to inconsistencies in device id patterns.
# Example output of 'lspci -nnm':
# d2e9:00:00.0 "Non-Volatile memory controller [0108]" "Microsoft Corporation [1414]" "Device [00a9]" -p02 "Microsoft Corporation [1414]" "Device [0000]" # noqa: E501
# d3f4:00:02.0 "Ethernet controller [0200]" "Mellanox Technologies [15b3]" "MT28800 Family [ConnectX-5 Ex Virtual Function] [101a]" -r80 "Mellanox Technologies [15b3]" "MT28800 Family [ConnectX-5 Ex Virtual Function] [0127]" # noqa: E501
# Sample 'lspci -n' output for above devices:
# d2e9:00:00.0 0108: 1414:00a9
# d3f4:00:02.0 0200: 15b3:101a (rev 80)
# Fetch pci ids using 'lspci -n':
result = self.run(
"-Dmnn",
"-n",
force_run=force_run,
shell=True,
expected_exit_code=0,
sudo=True,
)
for pci_raw in result.stdout.splitlines():
pci_device_id_info = {}
matched_pci_device_info = PATTERN_PCI_DEVICE_ID.match(pci_raw)
if matched_pci_device_info:
pci_device_id_info[matched_pci_device_info.group("slot")] = {
"device_id": matched_pci_device_info.group("device_id"),
"vendor_id": matched_pci_device_info.group("vendor_id"),
"controller_id": matched_pci_device_info.group("controller_id"),
}
self._pci_ids.update(pci_device_id_info)

# Fetching the device information using 'lspci -m':
result = self.run(
"-m",
force_run=force_run,
shell=True,
expected_exit_code=0,
Expand All @@ -179,6 +240,21 @@ def get_devices(self, force_run: bool = False) -> List[PciDevice]:
pci_device = PciDevice(pci_raw)
self._pci_devices.append(pci_device)

for i in range(len(self._pci_devices)):
pci_slot_id = self._pci_devices[i].slot
# Sometimes the device lists in the 'lspci -n' and 'lspci -m' outputs
# above do not match.
# This usually happens when the VM has just finished booting and not
# all PCI devices have been detected yet, for example SRIOV devices.
# In such cases, raise KeyError so that the retry decorator runs
# this method again after a short delay.
if pci_slot_id not in self._pci_ids:
raise KeyError(f"cannot find device id from {pci_slot_id}")
self._pci_devices[i].device_id = self._pci_ids[pci_slot_id]["device_id"]
self._pci_devices[i].vendor_id = self._pci_ids[pci_slot_id]["vendor_id"]
self._pci_devices[i].controller_id = self._pci_ids[pci_slot_id][
"controller_id"
]

return self._pci_devices

def disable_devices_by_type(self, device_type: str) -> int:
Expand Down
2 changes: 2 additions & 0 deletions lisa/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@
DEVICE_TYPE_SRIOV = "SRIOV"
DEVICE_TYPE_NVME = "NVME"
DEVICE_TYPE_GPU = "GPU"
DEVICE_TYPE_AMD_GPU = "AMD_GPU"
DEVICE_TYPE_ASAP = "ASAP"

DISK_PERFORMANCE_TOOL_FIO = "fio"
NETWORK_PERFORMANCE_TOOL_NTTTCP = "ntttcp"
Expand Down

0 comments on commit 1264464

Please sign in to comment.