Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions qubes/tests/integ/dispvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,9 @@ def cleanup_preload(self):
)
logger.info("deleting global feature")
del self.app.domains["dom0"].features["preload-dispvm-max"]
if "preload-dispvm-threshold" in self.app.domains["dom0"].features:
logger.info("deleting global threshold feature")
del self.app.domains["dom0"].features["preload-dispvm-threshold"]
logger.info("end")

async def no_preload(self):
Expand All @@ -327,9 +330,11 @@ def log_preload(self):
global_preload_max = None
if default_dispvm:
global_preload_max = default_dispvm.get_feat_global_preload_max()
threshold = self.adminvm.features.get("preload-dispvm-threshold", None)
preload_dict["global"] = {
"name": default_dispvm.name if default_dispvm else None,
"max": global_preload_max,
"threshold": threshold,
}
for qube in [self.disp_base, self.disp_base_alt]:
preload = qube.get_feat_preload()
Expand Down Expand Up @@ -739,6 +744,10 @@ async def _test_019_preload_refresh(self):
await self.wait_preload(preload_max)
old_preload = self.disp_base.get_feat_preload()
await qube.start()
# If services are still starting, it may delay shutdown longer than
# the default timeout. Because we can't just kill default
# templates, wait gracefully for system services to have started.
await qube.run_service_for_stdio("qubes.WaitForRunningSystem")
logger.info("shutdown '%s'", qube.name)
await qube.shutdown(wait=True)
await self.wait_preload(preload_max)
Expand Down
7 changes: 0 additions & 7 deletions qubes/tests/integ/dispvm_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,6 @@ def tearDown(self: qubes.tests.SystemTestCase):
self.vm2.shutdown(),
)
)
if not os.getenv("QUBES_TEST_SKIP_TEARDOWN_SLEEP"):
# Avoid previous test load interfering with new test.
if self._testMethodName.startswith("vm"):
delay = 5
else:
delay = 15
time.sleep(delay)

def run_test(self, name):
dvm = self.dvm.name
Expand Down
157 changes: 127 additions & 30 deletions tests/dispvm_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,23 @@
import asyncio
import concurrent.futures
import dataclasses
import logging
import os
import subprocess
import time


import qubesadmin

# nose will duplicate this logger.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s: %(levelname)s: %(funcName)s: %(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)


@dataclasses.dataclass
class TestConfig:
Expand All @@ -61,7 +71,7 @@ class TestConfig:
via wrappers such as qvm-run.
- vm-dispvm:
- qrexec-client-vm: scripts
- *-vm: Fast way to test if the tests itself are working.
- *-vm: Fast way to test if the tests are working.
-
GUI VS non-GUI:
- GUI tests workflows that uses disposables to open untrusted files or
Expand All @@ -81,7 +91,7 @@ class TestConfig:
the reverse is not true. Normal disposables are a control group for
preloaded disposables.
- Preloading more than 2 is not useful on sequential calls as long as
long as on the next call, there is a preload that has been completed.
on the next call, there is a preload that has been completed.
"""

name: str
Expand Down Expand Up @@ -170,10 +180,17 @@ class TestConfig:
preload_max=MAX_CONCURRENCY,
),
TestConfig("dispvm-api", admin_api=True),
TestConfig("dispvm-concurrent-api", concurrent=True, admin_api=True),
TestConfig(
"dispvm-concurrent-api",
concurrent=True,
admin_api=True,
),
TestConfig("dispvm-gui-api", gui=True, admin_api=True),
TestConfig(
"dispvm-gui-concurrent-api", gui=True, concurrent=True, admin_api=True
"dispvm-gui-concurrent-api",
gui=True,
concurrent=True,
admin_api=True,
),
TestConfig(
"dispvm-preload-more-api",
Expand All @@ -189,7 +206,7 @@ class TestConfig:
TestConfig(
"dispvm-preload-concurrent-api",
concurrent=True,
preload_max=MAX_PRELOAD,
preload_max=MAX_CONCURRENCY,
admin_api=True,
),
TestConfig(
Expand All @@ -202,12 +219,18 @@ class TestConfig:
"dispvm-preload-gui-concurrent-api",
gui=True,
concurrent=True,
preload_max=MAX_PRELOAD,
preload_max=MAX_CONCURRENCY,
admin_api=True,
),
]


def get_load() -> str:
    """Return the current system load averages as one stripped line.

    Reads ``/proc/loadavg`` (Linux-only) and strips the trailing
    newline; used by the tests purely for logging context.
    """
    with open("/proc/loadavg", "r", encoding="ascii") as loadavg:
        return loadavg.read().rstrip()


def get_time():
    """Return a monotonic timestamp in seconds.

    Uses ``CLOCK_MONOTONIC`` so elapsed-time measurements are immune
    to wall-clock adjustments (NTP steps, manual changes).
    """
    now = time.clock_gettime(time.CLOCK_MONOTONIC)
    return now

Expand All @@ -218,6 +241,8 @@ def __init__(self, dom0, dvm, vm1, vm2):
self.dvm = dvm
self.vm1 = vm1
self.vm2 = vm2
self.app = self.dom0.app
self.adminvm = self.dom0
self.iterations = ITERATIONS

async def wait_preload(
Expand All @@ -229,6 +254,7 @@ async def wait_preload(
timeout=60,
):
"""Waiting for completion avoids coroutine objects leaking."""
logger.info("preload_max: '%s'", preload_max)
if not appvm:
appvm = self.dvm
for _ in range(timeout):
Expand All @@ -241,14 +267,15 @@ async def wait_preload(
if fail_on_timeout:
raise Exception("didn't preload in time")
if not wait_completion:
logger.info("end")
return
preload_dispvm = appvm.features.get("preload-dispvm", "")
preload_dispvm = preload_dispvm.split(" ") or []
preload_unfinished = preload_dispvm
for _ in range(timeout):
for qube in preload_unfinished.copy():
self.dom0.app.domains.refresh_cache(force=True)
qube = self.dom0.app.domains[qube]
self.app.domains.refresh_cache(force=True)
qube = self.app.domains[qube]
completed = qube.features.get("preload-dispvm-completed")
if completed:
preload_unfinished.remove(qube)
Expand All @@ -259,12 +286,14 @@ async def wait_preload(
else:
if fail_on_timeout:
raise Exception("last preloaded didn't complete in time")
logger.info("end")

def wait_for_dispvm_destroy(self, dispvm_names):
logger.info("Waiting for destruction of disposables: %s", dispvm_names)
timeout = 60
while True:
self.dom0.app.domains.refresh_cache(force=True)
if set(dispvm_names).isdisjoint(self.dom0.app.domains):
self.app.domains.refresh_cache(force=True)
if set(dispvm_names).isdisjoint(self.app.domains):
break
time.sleep(1)
timeout -= 1
Expand Down Expand Up @@ -303,7 +332,8 @@ def run_latency_calls(self, test):
"set -eu --; "
f'max_concurrency="{MAX_CONCURRENCY}"; '
f"for i in $(seq {self.iterations}); do "
f" out=$({cmd}) {term}"
' echo "$i"; '
f" {cmd} {term}"
' pid="${!-}"; '
' if test -n "${pid}"; then '
' set -- "${@}" "${pid}"; '
Expand All @@ -318,7 +348,13 @@ def run_latency_calls(self, test):
start_time = get_time()
try:
if test.from_dom0:
subprocess.run(code, shell=True, check=True, timeout=timeout)
subprocess.run(
code,
shell=True,
check=True,
capture_output=True,
timeout=timeout,
)
else:
self.vm1.run(code, timeout=timeout)
except subprocess.CalledProcessError as e:
Expand All @@ -327,32 +363,46 @@ def run_latency_calls(self, test):
f" {e.stdout},"
f" {e.stderr}"
)
except subprocess.TimeoutExpired:
raise Exception(f"service '{cmd}' failed: timeout expired")
except subprocess.TimeoutExpired as e:
raise Exception(
f"service '{cmd}' failed: timeout expired:"
f" {e.stdout},"
f" {e.stderr}"
)
end_time = get_time()
return round(end_time - start_time, ROUND_PRECISION)

def call_api(self, test, service, qube):
start_time = get_time()
app = qubesadmin.Qubes()
domains = app.domains
appvm = domains[qube]
domain_time = get_time()
if test.non_dispvm:
target_qube = self.vm1
# Even though we already have the qube object passed from the
# class, assume we don't so we can calculate gathering.
target_qube = domains[self.vm1.name]
domain_time = get_time()
else:
appvm = domains[qube]
domain_time = get_time()
target_qube = qubesadmin.vm.DispVM.from_appvm(app, appvm)
name = target_qube.name
# A very small number, if it appears, it will show a bottleneck at
# DispVM.from_appvm.
target_time = get_time()
try:
target_qube.run_service_for_stdio(service, timeout=60)
except subprocess.CalledProcessError as e:
raise Exception(
f"service '{service}' failed ({e.returncode}):"
f"'{name}': service '{service}' failed ({e.returncode}):"
f" {e.stdout},"
f" {e.stderr}"
)
except subprocess.TimeoutExpired as e:
raise Exception(
f"'{name}': service '{service}' failed: timeout expired:"
f" {e.stdout},"
f" {e.stderr}"
)
except subprocess.TimeoutExpired:
raise Exception(f"service '{service}' failed: timeout expired")
run_service_time = get_time()
if not test.non_dispvm:
target_qube.cleanup()
Expand Down Expand Up @@ -505,35 +555,81 @@ def run_test(self, test: TestConfig):
f"{gui_prefix} {target} allow\n"
f"{nogui_prefix} {target} allow\n"
)
if test.preload_max:
orig_preload_max = self.dom0.features.get("preload-dispvm-max")
if orig_preload_max is not None:
del self.dom0.features["preload-dispvm-max"]
orig_preload_threshold = self.dom0.features.get(
"preload-dispvm-threshold"
)
orig_preload_max = self.dom0.features.get("preload-dispvm-max")
if orig_preload_threshold is not None:
logger.info("Deleting threshold feature")
del self.dom0.features["preload-dispvm-threshold"]
if orig_preload_max is not None:
logger.info("Deleting global max feature")
del self.dom0.features["preload-dispvm-max"]
try:
if test.preload_max:
preload_max = test.preload_max
logger.info("Setting local max feature: '%s'", preload_max)
self.dvm.features["preload-dispvm-max"] = str(preload_max)
asyncio.run(self.wait_preload(preload_max))
for qube in [self.vm1, self.vm2]:
if not qube:
# Might be an empty string.
continue
logger.info(
"Waiting for VM '%s' to finish startup",
qube.name,
)
qube.run_service_for_stdio("qubes.WaitForSession", timeout=60)
logger.info("Load before test: '%s'", get_load())
if test.admin_api:
result = self.run_latency_api_calls(test)
else:
result = self.run_latency_calls(test)
self.report_result(test, result)
except:
logger.error("Failed to run test: '%s'", test.name)
raise
finally:
if test.preload_max:
old_preload_max = int(
self.dvm.features.get("preload-dispvm-max", 0) or 0
)
logger.info(
"Waiting to preload the old test setting: '%s'",
old_preload_max,
)
asyncio.run(self.wait_preload(old_preload_max))
old_preload = self.dvm.features.get("preload-dispvm", "")
old_preload = old_preload.split(" ") or []
logger.info("Deleting local max feature")
del self.dvm.features["preload-dispvm-max"]
self.wait_for_dispvm_destroy(old_preload)
if orig_preload_max is not None:
self.dom0.features["preload-dispvm-max"] = orig_preload_max
if orig_preload_max != 0:
asyncio.run(self.wait_preload(orig_preload_max))
if orig_preload_threshold is not None:
logger.info(
"Setting the original threshold feature: '%s'",
orig_preload_threshold,
)
self.dom0.features["preload-dispvm-threshold"] = (
orig_preload_threshold
)
if orig_preload_max is not None:
logger.info(
"Setting the global max feature: '%s'", orig_preload_max
)
self.dom0.features["preload-dispvm-max"] = orig_preload_max
os.unlink(POLICY_FILE)
if not os.getenv("QUBES_TEST_SKIP_TEARDOWN_SLEEP"):
logger.info("Load before sleep: '%s'", get_load())
delay = 5
if not test.non_dispvm:
delay += 10
if test.gui:
delay += 2
if test.concurrent:
delay += 8
logger.info("Sleeping for '%d' seconds", delay)
time.sleep(delay)
logger.info("Load after sleep: '%s'", get_load())


def main():
Expand Down Expand Up @@ -567,7 +663,8 @@ def main():
if args.iterations:
run.iterations = args.iterations

for index, test in enumerate(tests):
for test in tests:
logger.info("Running test '%s'", test.name)
run.run_test(test)


Expand Down