diff --git a/qubes/tests/integ/dispvm.py b/qubes/tests/integ/dispvm.py
index 891bba608..b9f6e63b6 100644
--- a/qubes/tests/integ/dispvm.py
+++ b/qubes/tests/integ/dispvm.py
@@ -314,6 +314,9 @@ def cleanup_preload(self):
                 )
         logger.info("deleting global feature")
         del self.app.domains["dom0"].features["preload-dispvm-max"]
+        if "preload-dispvm-threshold" in self.app.domains["dom0"].features:
+            logger.info("deleting global threshold feature")
+            del self.app.domains["dom0"].features["preload-dispvm-threshold"]
         logger.info("end")
 
     async def no_preload(self):
@@ -327,9 +330,11 @@ def log_preload(self):
         global_preload_max = None
         if default_dispvm:
             global_preload_max = default_dispvm.get_feat_global_preload_max()
+        threshold = self.adminvm.features.get("preload-dispvm-threshold", None)
         preload_dict["global"] = {
             "name": default_dispvm.name if default_dispvm else None,
             "max": global_preload_max,
+            "threshold": threshold,
         }
         for qube in [self.disp_base, self.disp_base_alt]:
             preload = qube.get_feat_preload()
@@ -739,6 +744,10 @@ async def _test_019_preload_refresh(self):
         await self.wait_preload(preload_max)
         old_preload = self.disp_base.get_feat_preload()
         await qube.start()
+        # If services are still starting, it may delay shutdown longer than
+        # the default timeout. Because we can't just kill default
+        # templates, wait gracefully for system services to have started.
+        await qube.run_service_for_stdio("qubes.WaitForRunningSystem")
         logger.info("shutdown '%s'", qube.name)
         await qube.shutdown(wait=True)
         await self.wait_preload(preload_max)
diff --git a/qubes/tests/integ/dispvm_perf.py b/qubes/tests/integ/dispvm_perf.py
index 083f50b83..54825ad95 100644
--- a/qubes/tests/integ/dispvm_perf.py
+++ b/qubes/tests/integ/dispvm_perf.py
@@ -73,13 +73,6 @@ def tearDown(self: qubes.tests.SystemTestCase):
                 self.vm2.shutdown(),
             )
         )
-        if not os.getenv("QUBES_TEST_SKIP_TEARDOWN_SLEEP"):
-            # Avoid previous test load interfering with new test.
-            if self._testMethodName.startswith("vm"):
-                delay = 5
-            else:
-                delay = 15
-            time.sleep(delay)
 
     def run_test(self, name):
         dvm = self.dvm.name
diff --git a/tests/dispvm_perf.py b/tests/dispvm_perf.py
index f62b20068..9aed8e731 100755
--- a/tests/dispvm_perf.py
+++ b/tests/dispvm_perf.py
@@ -30,13 +30,23 @@
 
 import asyncio
 import concurrent.futures
 import dataclasses
+import logging
 import os
 import subprocess
 import time
-
 import qubesadmin
 
+# nose will duplicate this logger.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+handler = logging.StreamHandler()
+formatter = logging.Formatter(
+    "%(asctime)s: %(levelname)s: %(funcName)s: %(message)s"
+)
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
 
 @dataclasses.dataclass
 class TestConfig:
@@ -61,7 +71,7 @@ class TestConfig:
         via wrappers such as qvm-run.
       - vm-dispvm:
         - qrexec-client-vm: scripts
-      - *-vm: Fast way to test if the tests itself are working.
+      - *-vm: Fast way to test if the tests are working.
 
     - GUI VS non-GUI:
       - GUI tests workflows that uses disposables to open untrusted files or
@@ -81,7 +91,7 @@ class TestConfig:
       the reverse is not true. Normal disposables are a control group for
      preloaded disposables.
     - Preloading more than 2 is not useful on sequential calls as long as
-      long as on the next call, there is a preload that has been completed.
+      on the next call, there is a preload that has been completed.
     """
 
     name: str
@@ -170,10 +180,17 @@ class TestConfig:
         preload_max=MAX_CONCURRENCY,
     ),
     TestConfig("dispvm-api", admin_api=True),
-    TestConfig("dispvm-concurrent-api", concurrent=True, admin_api=True),
+    TestConfig(
+        "dispvm-concurrent-api",
+        concurrent=True,
+        admin_api=True,
+    ),
     TestConfig("dispvm-gui-api", gui=True, admin_api=True),
     TestConfig(
-        "dispvm-gui-concurrent-api", gui=True, concurrent=True, admin_api=True
+        "dispvm-gui-concurrent-api",
+        gui=True,
+        concurrent=True,
+        admin_api=True,
     ),
     TestConfig(
         "dispvm-preload-more-api",
@@ -189,7 +206,7 @@ class TestConfig:
     TestConfig(
         "dispvm-preload-concurrent-api",
         concurrent=True,
-        preload_max=MAX_PRELOAD,
+        preload_max=MAX_CONCURRENCY,
         admin_api=True,
     ),
     TestConfig(
@@ -202,12 +219,18 @@ class TestConfig:
         "dispvm-preload-gui-concurrent-api",
         gui=True,
         concurrent=True,
-        preload_max=MAX_PRELOAD,
+        preload_max=MAX_CONCURRENCY,
         admin_api=True,
     ),
 ]
 
 
+def get_load() -> str:
+    with open("/proc/loadavg", "r", encoding="ascii") as file:
+        load = file.read()
+    return load.rstrip()
+
+
 def get_time():
     return time.clock_gettime(time.CLOCK_MONOTONIC)
 
@@ -218,6 +241,8 @@ def __init__(self, dom0, dvm, vm1, vm2):
         self.dvm = dvm
         self.vm1 = vm1
         self.vm2 = vm2
+        self.app = self.dom0.app
+        self.adminvm = self.dom0
         self.iterations = ITERATIONS
 
     async def wait_preload(
@@ -229,6 +254,7 @@ async def wait_preload(
         timeout=60,
     ):
         """Waiting for completion avoids coroutine objects leaking."""
+        logger.info("preload_max: '%s'", preload_max)
         if not appvm:
             appvm = self.dvm
         for _ in range(timeout):
@@ -241,14 +267,15 @@ async def wait_preload(
             if fail_on_timeout:
                 raise Exception("didn't preload in time")
         if not wait_completion:
+            logger.info("end")
             return
         preload_dispvm = appvm.features.get("preload-dispvm", "")
         preload_dispvm = preload_dispvm.split(" ") or []
         preload_unfinished = preload_dispvm
         for _ in range(timeout):
             for qube in preload_unfinished.copy():
-                self.dom0.app.domains.refresh_cache(force=True)
-                qube = self.dom0.app.domains[qube]
+                self.app.domains.refresh_cache(force=True)
+                qube = self.app.domains[qube]
                 completed = qube.features.get("preload-dispvm-completed")
                 if completed:
                     preload_unfinished.remove(qube)
@@ -259,12 +286,14 @@ async def wait_preload(
         else:
             if fail_on_timeout:
                 raise Exception("last preloaded didn't complete in time")
+        logger.info("end")
 
     def wait_for_dispvm_destroy(self, dispvm_names):
+        logger.info("Waiting for destruction of disposables: %s", dispvm_names)
         timeout = 60
         while True:
-            self.dom0.app.domains.refresh_cache(force=True)
-            if set(dispvm_names).isdisjoint(self.dom0.app.domains):
+            self.app.domains.refresh_cache(force=True)
+            if set(dispvm_names).isdisjoint(self.app.domains):
                 break
             time.sleep(1)
             timeout -= 1
@@ -303,7 +332,8 @@ def run_latency_calls(self, test):
             "set -eu --; "
             f'max_concurrency="{MAX_CONCURRENCY}"; '
             f"for i in $(seq {self.iterations}); do "
-            f" out=$({cmd}) {term}"
+            ' echo "$i"; '
+            f" {cmd} {term}"
             ' pid="${!-}"; '
             ' if test -n "${pid}"; then '
             ' set -- "${@}" "${pid}"; '
@@ -318,7 +348,13 @@ def run_latency_calls(self, test):
         start_time = get_time()
         try:
             if test.from_dom0:
-                subprocess.run(code, shell=True, check=True, timeout=timeout)
+                subprocess.run(
+                    code,
+                    shell=True,
+                    check=True,
+                    capture_output=True,
+                    timeout=timeout,
+                )
             else:
                 self.vm1.run(code, timeout=timeout)
         except subprocess.CalledProcessError as e:
@@ -327,8 +363,12 @@ def run_latency_calls(self, test):
                 f" {e.stdout},"
                 f" {e.stderr}"
             )
-        except subprocess.TimeoutExpired:
-            raise Exception(f"service '{cmd}' failed: timeout expired")
+        except subprocess.TimeoutExpired as e:
+            raise Exception(
+                f"service '{cmd}' failed: timeout expired:"
+                f" {e.stdout},"
+                f" {e.stderr}"
+            )
         end_time = get_time()
         return round(end_time - start_time, ROUND_PRECISION)
 
@@ -336,23 +376,33 @@ def call_api(self, test, service, qube):
         start_time = get_time()
         app = qubesadmin.Qubes()
         domains = app.domains
-        appvm = domains[qube]
-        domain_time = get_time()
         if test.non_dispvm:
-            target_qube = self.vm1
+            # Even though we already have the qube object passed from the
+            # class, assume we don't so we can calculate gathering.
+            target_qube = domains[self.vm1.name]
+            domain_time = get_time()
         else:
+            appvm = domains[qube]
+            domain_time = get_time()
             target_qube = qubesadmin.vm.DispVM.from_appvm(app, appvm)
+        name = target_qube.name
+        # A very small number, if it appears, it will show a bottleneck at
+        # DispVM.from_appvm.
         target_time = get_time()
         try:
             target_qube.run_service_for_stdio(service, timeout=60)
         except subprocess.CalledProcessError as e:
             raise Exception(
-                f"service '{service}' failed ({e.returncode}):"
+                f"'{name}': service '{service}' failed ({e.returncode}):"
+                f" {e.stdout},"
+                f" {e.stderr}"
+            )
+        except subprocess.TimeoutExpired as e:
+            raise Exception(
+                f"'{name}': service '{service}' failed: timeout expired:"
                 f" {e.stdout},"
                 f" {e.stderr}"
             )
-        except subprocess.TimeoutExpired:
-            raise Exception(f"service '{service}' failed: timeout expired")
         run_service_time = get_time()
         if not test.non_dispvm:
             target_qube.cleanup()
@@ -505,35 +555,81 @@ def run_test(self, test: TestConfig):
             f"{gui_prefix} {target} allow\n"
             f"{nogui_prefix} {target} allow\n"
         )
-        if test.preload_max:
-            orig_preload_max = self.dom0.features.get("preload-dispvm-max")
-            if orig_preload_max is not None:
-                del self.dom0.features["preload-dispvm-max"]
+        orig_preload_threshold = self.dom0.features.get(
+            "preload-dispvm-threshold"
+        )
+        orig_preload_max = self.dom0.features.get("preload-dispvm-max")
+        if orig_preload_threshold is not None:
+            logger.info("Deleting threshold feature")
+            del self.dom0.features["preload-dispvm-threshold"]
+        if orig_preload_max is not None:
+            logger.info("Deleting global max feature")
+            del self.dom0.features["preload-dispvm-max"]
         try:
             if test.preload_max:
                 preload_max = test.preload_max
+                logger.info("Setting local max feature: '%s'", preload_max)
                 self.dvm.features["preload-dispvm-max"] = str(preload_max)
                 asyncio.run(self.wait_preload(preload_max))
+            for qube in [self.vm1, self.vm2]:
+                if not qube:
+                    # Might be an empty string.
+                    continue
+                logger.info(
+                    "Waiting for VM '%s' to finish startup",
+                    qube.name,
+                )
+                qube.run_service_for_stdio("qubes.WaitForSession", timeout=60)
+            logger.info("Load before test: '%s'", get_load())
             if test.admin_api:
                 result = self.run_latency_api_calls(test)
             else:
                 result = self.run_latency_calls(test)
             self.report_result(test, result)
+        except:
+            logger.error("Failed to run test: '%s'", test.name)
+            raise
         finally:
             if test.preload_max:
                 old_preload_max = int(
                     self.dvm.features.get("preload-dispvm-max", 0) or 0
                 )
+                logger.info(
+                    "Waiting to preload the old test setting: '%s'",
+                    old_preload_max,
+                )
                 asyncio.run(self.wait_preload(old_preload_max))
                 old_preload = self.dvm.features.get("preload-dispvm", "")
                 old_preload = old_preload.split(" ") or []
+                logger.info("Deleting local max feature")
                 del self.dvm.features["preload-dispvm-max"]
                 self.wait_for_dispvm_destroy(old_preload)
-            if orig_preload_max is not None:
-                self.dom0.features["preload-dispvm-max"] = orig_preload_max
-                if orig_preload_max != 0:
-                    asyncio.run(self.wait_preload(orig_preload_max))
+            if orig_preload_threshold is not None:
+                logger.info(
+                    "Setting the original threshold feature: '%s'",
+                    orig_preload_threshold,
+                )
+                self.dom0.features["preload-dispvm-threshold"] = (
+                    orig_preload_threshold
+                )
+            if orig_preload_max is not None:
+                logger.info(
+                    "Setting the global max feature: '%s'", orig_preload_max
+                )
+                self.dom0.features["preload-dispvm-max"] = orig_preload_max
             os.unlink(POLICY_FILE)
+            if not os.getenv("QUBES_TEST_SKIP_TEARDOWN_SLEEP"):
+                logger.info("Load before sleep: '%s'", get_load())
+                delay = 5
+                if not test.non_dispvm:
+                    delay += 10
+                if test.gui:
+                    delay += 2
+                if test.concurrent:
+                    delay += 8
+                logger.info("Sleeping for '%d' seconds", delay)
+                time.sleep(delay)
+                logger.info("Load after sleep: '%s'", get_load())
 
 
 def main():
@@ -567,7 +663,8 @@ def main():
     if args.iterations:
         run.iterations = args.iterations
 
-    for index, test in enumerate(tests):
+    for test in tests:
+        logger.info("Running test '%s'", test.name)
         run.run_test(test)