1
+ import argparse
2
+ import queue
3
+ import subprocess
4
+ import signal
5
+ import os
6
+ import sys
7
+ import glob
8
+ import logging
9
+ import socket
10
+ import time
11
+ import tempfile
12
+
13
+ from flask import Flask , request
14
+ from flask_restful import reqparse , abort , Api , Resource , inputs
15
+
16
+ import proteopt .client
17
+
18
+ from proteopt .common import serialize , deserialize
19
+
20
# Flask application and the flask_restful API wrapper that the proxy
# resources below register themselves onto.
app = Flask(__name__)
api = Api(app)
22
+
23
+
24
class Proxy(Resource):
    """Management resource: maintains the set of backend API-server endpoints
    and a lazily constructed proteopt client that fans requests out to them.

    All state is class-level (shared) because flask_restful instantiates a
    fresh Resource per request.
    """

    endpoints = set()    # registered backend base URLs
    max_retries = None   # set from --max-retries in __main__
    client = None        # lazily created shared proteopt.client.Client

    @classmethod
    def get_client(cls):
        """Return the shared client, creating it on first use.

        Raises:
            ValueError: if no backend endpoints have been registered yet.
        """
        if cls.client is None:
            if not cls.endpoints:
                raise ValueError("No endpoints")
            cls.client = proteopt.client.Client(
                endpoints=[e + "/tool" for e in cls.endpoints],
                max_retries=cls.max_retries)
        return cls.client

    def get(self, action, name=None):
        """Dispatch a management action.

        Supported actions: add-endpoint, remove-endpoint (both take an
        'endpoint' query arg), status, clear.

        NOTE(review): the registered route is '/proxy/<action>', which supplies
        only 'action'. The original signature required a 'name' argument too,
        so every request raised TypeError; 'name' now defaults to None for
        backward compatibility.
        """
        if action == "add-endpoint":
            endpoint = request.args.get('endpoint')
            self.endpoints.add(endpoint)
            return f"Added endpoint {endpoint}"
        elif action == "remove-endpoint":
            endpoint = request.args.get('endpoint')
            if endpoint in self.endpoints:
                self.endpoints.remove(endpoint)
                return f"Removed endpoint {endpoint}"
            else:
                return f"No such endpoint {endpoint}"
        elif action == "status":
            # One endpoint per line, sorted for deterministic output.
            return "\n".join(sorted(self.endpoints))
        elif action == "clear":
            self.endpoints.clear()
            return "Cleared endpoints"
        # Fix: the original fell through to str(self.MODEL_CACHE.keys()), but
        # MODEL_CACHE is not defined anywhere -> AttributeError (HTTP 500).
        # Report the unknown action explicitly instead.
        abort(404, message=f"Unknown action: {action}")
59
+
60
class Tool(Resource):
    """Forwards tool requests to the backend API servers via the shared client."""

    def get(self, tool_name):
        """Describe the proxy: its endpoints and available parallelism.

        Falls back to a parallelism of 8 when the client cannot be queried
        (e.g. no endpoints registered yet).
        """
        try:
            parallelism = Proxy.get_client().max_parallelism
        except Exception as e:
            logging.warning("Couldn't get parallelism: %s", e)
            parallelism = 8
        info = {
            'description': 'proxy',
            'endpoints': sorted(Proxy.endpoints),
            'max_parallelism': parallelism,
        }
        return info, 200

    def post(self, tool_name):
        """Submit one tool invocation and block until its result arrives.

        The payload is tagged with id 0 and pushed onto the client's work
        queue together with a private reply queue; we then wait on that queue
        for the matching response.
        """
        payload = request.get_json()
        payload['tool_name'] = tool_name

        reply_queue = queue.Queue()
        Proxy.get_client().work_queue.put((0, payload, reply_queue))
        payload_id, reply = reply_queue.get()
        assert payload_id == 0
        return reply, 200
84
+
85
+
86
# Route registration: management actions and tool invocation.
api.add_resource(Proxy, '/proxy/<action>')
api.add_resource(Tool, '/tool/<tool_name>')
88
+
89
+
90
# Command-line interface for the proxy server.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--no-cleanup", action="store_true", default=False)
arg_parser.add_argument("--max-retries", default=2, type=int)
arg_parser.add_argument("--endpoints", nargs="+")
arg_parser.add_argument("--host", default="127.0.0.1")
arg_parser.add_argument("--port", type=int)
arg_parser.add_argument("--write-endpoint-to-file")
arg_parser.add_argument("--debug", default=False, action="store_true")
arg_parser.add_argument(
    "--launch-servers",
    metavar="N",
    type=int,
    help="Launch N API servers. If N=-1, then one server is launched per GPU and "
    "the CUDA_VISIBLE_DEVICES parameter is set accordingly for each server.")
arg_parser.add_argument(
    "--launch-args",
    nargs=argparse.REMAINDER,
    help="All following args are args for launched API servers.")
113
+
114
if __name__ == '__main__':
    args = arg_parser.parse_args(sys.argv[1:])
    logging.basicConfig(level=logging.INFO)

    endpoint_to_process = {}
    work_dir = None
    if args.launch_servers:
        print(args)
        num_to_launch = args.launch_servers
        set_cuda_visible_devices = False
        if args.launch_servers == -1:
            # One server per GPU: count the "GPU N: ..." lines from nvidia-smi.
            gpu_lines = subprocess.check_output(
                ["nvidia-smi", "-L"]).decode().split("\n")
            gpu_lines = [g.strip() for g in gpu_lines]
            gpu_lines = [g for g in gpu_lines if g.startswith("GPU ")]
            num_to_launch = len(gpu_lines)
            print(f"Detected {num_to_launch} GPUs.")
            set_cuda_visible_devices = True

        work_dir = tempfile.TemporaryDirectory(prefix="proteopt_proxy_")
        for i in range(num_to_launch):
            endpoint_file = os.path.join(work_dir.name, f"endpoint.{i}.txt")
            sub_args = [
                "python",
                os.path.join(os.path.dirname(__file__), "api.py"),
            ]
            # Fix: --launch-args is optional; args.launch_args is None when it
            # was not given, and extend(None) raises TypeError.
            sub_args.extend(args.launch_args or [])
            sub_args.extend(["--write-endpoint-to-file", endpoint_file])
            if set_cuda_visible_devices:
                sub_args.extend(["--cuda-visible-devices", str(i)])
            print(f"Launching API server {i} / {num_to_launch} with args:")
            print(sub_args)

            logfile = os.path.join(work_dir.name, f"log.{i}.txt")
            logfile_fd = open(logfile, "w+b")
            process = subprocess.Popen(
                sub_args, stderr=logfile_fd, stdout=logfile_fd)
            # Block until the child writes its endpoint file or exits.
            while process.poll() is None and not os.path.exists(endpoint_file):
                time.sleep(0.1)
            try:
                endpoint = open(endpoint_file).read().strip()
            except IOError:
                # Child died before publishing its endpoint: show its log.
                print("Failed to load endpoint file. Process log:")
                logfile_fd.seek(0)
                for line in logfile_fd.readlines():
                    print(line)
                raise
            print(f"API server {i} at endpoint {endpoint} will log to {logfile}")
            endpoint_to_process[endpoint] = process
        Proxy.endpoints.update(list(endpoint_to_process))

    Proxy.max_retries = args.max_retries
    if args.endpoints:
        Proxy.endpoints.update(args.endpoints)

    print("Initialized proxy with endpoints: ", Proxy.endpoints)

    port = args.port
    if not port:
        # Identify an available port
        # Based on https://stackoverflow.com/questions/5085656/how-to-select-random-port-number-in-flask
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.bind((args.host, 0))
        port = sock.getsockname()[1]
        sock.close()

    endpoint = "http://%s:%d" % (args.host, port)
    print("Endpoint will be", endpoint)
    if args.write_endpoint_to_file:
        with open(args.write_endpoint_to_file, "w") as fd:
            fd.write(endpoint)
            fd.write("\n")
        print("Wrote", args.write_endpoint_to_file)

    def cleanup(sig, frame):
        """SIGINT handler: dump child logs (debug mode), remove the temp
        directory, and terminate all launched API server processes.

        Fix: removed leftover `import ipdb; ipdb.set_trace()`, which made
        every Ctrl-C drop into a debugger (or crash if ipdb was absent).
        """
        # Guard work_dir: it is None when no servers were launched.
        if args.debug and work_dir is not None:
            print("Dumping logs.")
            for g in glob.glob(os.path.join(work_dir.name, "*.txt")):
                print("*" * 40)
                print(g)
                print("*" * 40)
                for line in open(g).readlines():
                    print("---", line.rstrip())

        if work_dir is not None and not args.no_cleanup:
            print(f"Cleaning up {work_dir.name}")
            work_dir.cleanup()

        while endpoint_to_process:
            endpoint, process = endpoint_to_process.popitem()
            print(f"Terminating process with endpoint {endpoint}")
            process.terminate()
            try:
                # Fix: poll() immediately after terminate() is nearly always
                # None, so the old code hard-killed every child. Give it a
                # grace period to exit before resorting to SIGKILL.
                process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                process.kill()
        print("Done.")
        sys.exit(0)

    signal.signal(signal.SIGINT, cleanup)

    app.run(
        host=args.host,
        port=port,
        debug=args.debug,
        use_reloader=False,
        threaded=True)
0 commit comments