Skip to content

Commit f36ca9d

Browse files
authored
Fix request filtering for service stats (#2678)
Instead of relying on status codes or other heuristics, have Nginx log a flag for each request forwarded, or intended to be forwarded, to a service replica. Use these flags to determine which requests to include in service stats. This reliably excludes requests that are terminated by Nginx and hence don't add load to replicas. These include HTTP to HTTPS redirects, unauthenticated requests, and requests that violate rate or body size limits.
1 parent f0ddae8 commit f36ca9d

File tree

4 files changed

+72
-25
lines changed

4 files changed

+72
-25
lines changed
Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,11 @@
1-
log_format dstack_stat '$time_iso8601 $host $status $request_time';
1+
log_format dstack_stat '$time_iso8601 $host $status $request_time $dstack_replica_hit';
2+
3+
4+
# A hack to avoid this Nginx reload error when no services are registered:
5+
# nginx: [emerg] unknown "dstack_replica_hit" variable
6+
server {
7+
listen unix:/tmp/dstack-dummy-nginx.sock;
8+
server_name placeholder.local;
9+
deny all;
10+
set $dstack_replica_hit 0;
11+
}

src/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ upstream {{ domain }}.upstream {
1414
server {
1515
server_name {{ domain }};
1616
limit_req_status 429;
17+
set $dstack_replica_hit 0;
1718
access_log {{ access_log_path }} dstack_stat;
1819
client_max_body_size {{ client_max_body_size }};
1920

@@ -23,40 +24,45 @@ server {
2324
auth_request /_dstack_auth;
2425
{% endif %}
2526

26-
{% if replicas %}
2727
try_files /nonexistent @$http_upgrade;
28-
{% else %}
29-
return 503;
30-
{% endif %}
3128

3229
{% if location.limit_req %}
3330
limit_req zone={{ location.limit_req.zone }}{% if location.limit_req.burst %} burst={{ location.limit_req.burst }} nodelay{% endif %};
3431
{% endif %}
3532
}
3633
{% endfor %}
3734

38-
{% if replicas %}
3935
location @websocket {
36+
set $dstack_replica_hit 1;
37+
{% if replicas %}
4038
proxy_pass http://{{ domain }}.upstream;
4139
proxy_set_header X-Real-IP $remote_addr;
4240
proxy_set_header Host $host;
4341
proxy_http_version 1.1;
4442
proxy_set_header Upgrade $http_upgrade;
4543
proxy_set_header Connection "Upgrade";
4644
proxy_read_timeout 300s;
45+
{% else %}
46+
return 503;
47+
{% endif %}
4748
}
4849
location @ {
50+
set $dstack_replica_hit 1;
51+
{% if replicas %}
4952
proxy_pass http://{{ domain }}.upstream;
5053
proxy_set_header X-Real-IP $remote_addr;
5154
proxy_set_header Host $host;
5255
proxy_read_timeout 300s;
56+
{% else %}
57+
return 503;
58+
{% endif %}
5359
}
54-
{% endif %}
5560

5661
{% if auth %}
5762
location = /_dstack_auth {
5863
internal;
5964
if ($remote_addr = 127.0.0.1) {
65+
# for requests from the gateway app, e.g. from the OpenAI-compatible API
6066
return 200;
6167
}
6268
proxy_pass http://localhost:{{ proxy_port }}/api/auth/{{ project_name }};

src/dstack/_internal/proxy/gateway/services/stats.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@
1111

1212
from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo
1313
from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats, ServiceStats, Stat
14+
from dstack._internal.proxy.lib.errors import UnexpectedProxyError
1415
from dstack._internal.utils.common import run_async
1516

1617
logger = logging.getLogger(__name__)
17-
IGNORE_STATUSES = {403, 404}
1818
WINDOWS = (30, 60, 300)
1919
TTL = WINDOWS[-1]
2020
EMPTY_STATS = {window: Stat(requests=0, request_time=0.0) for window in WINDOWS}
@@ -35,6 +35,7 @@ class LogEntry(BaseModel):
3535
host: str
3636
status: int
3737
request_time: float
38+
is_replica_hit: bool
3839

3940

4041
class StatsCollector:
@@ -87,7 +88,8 @@ def _collect(self) -> None:
8788
now = datetime.datetime.now(tz=datetime.timezone.utc)
8889

8990
for entry in self._read_access_log(now - datetime.timedelta(seconds=TTL)):
90-
if entry.status in IGNORE_STATUSES:
91+
# only include requests that hit or should hit a service replica
92+
if not entry.is_replica_hit:
9193
continue
9294

9395
frame_timestamp = int(entry.timestamp.timestamp())
@@ -119,7 +121,10 @@ def _read_access_log(self, after: datetime.datetime) -> Iterable[LogEntry]:
119121
line = self._file.readline()
120122
if not line:
121123
break
122-
timestamp_str, host, status, request_time = line.split()
124+
cells = line.split()
125+
if len(cells) == 4: # compatibility with pre-0.19.11 logs
126+
cells.append("0" if cells[2] in ["403", "404"] else "1")
127+
timestamp_str, host, status, request_time, dstack_replica_hit = cells
123128
timestamp = datetime.datetime.fromisoformat(timestamp_str)
124129
if timestamp < after:
125130
continue
@@ -128,6 +133,7 @@ def _read_access_log(self, after: datetime.datetime) -> Iterable[LogEntry]:
128133
host=host,
129134
status=int(status),
130135
request_time=float(request_time),
136+
is_replica_hit=_parse_nginx_bool(dstack_replica_hit),
131137
)
132138
if os.fstat(self._file.fileno()).st_ino != st_ino:
133139
# file was rotated
@@ -154,3 +160,11 @@ async def get_service_stats(
154160
)
155161
for service in services
156162
]
163+
164+
165+
def _parse_nginx_bool(v: str) -> bool:
166+
if v == "0":
167+
return False
168+
if v == "1":
169+
return True
170+
raise UnexpectedProxyError(f"Cannot parse boolean value: expected '0' or '1', got {v!r}")

src/tests/_internal/proxy/gateway/services/test_stats.py

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@
1717
pytest.param(
1818
dedent(
1919
"""
20-
2024-12-06T12:08:00+00:00 srv-0.gtw.test 200 0.100
21-
2024-12-06T12:08:00+00:00 srv-1.gtw.test 200 1.100
22-
2024-12-06T12:09:15+00:00 srv-0.gtw.test 200 0.200
23-
2024-12-06T12:09:15+00:00 srv-1.gtw.test 200 1.200
24-
2024-12-06T12:09:45+00:00 srv-0.gtw.test 200 0.300
20+
2024-12-06T12:08:00+00:00 srv-0.gtw.test 200 0.100 1
21+
2024-12-06T12:08:00+00:00 srv-1.gtw.test 200 1.100 1
22+
2024-12-06T12:09:15+00:00 srv-0.gtw.test 200 0.200 1
23+
2024-12-06T12:09:15+00:00 srv-1.gtw.test 200 1.200 1
24+
2024-12-06T12:09:45+00:00 srv-0.gtw.test 200 0.300 1
2525
"""
2626
),
2727
{
@@ -41,11 +41,11 @@
4141
pytest.param(
4242
dedent(
4343
"""
44-
2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.100
45-
2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.200
46-
2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.300
47-
2024-12-06T12:08:01+00:00 srv.gtw.test 200 0.400
48-
2024-12-06T12:08:01+00:00 srv.gtw.test 200 0.500
44+
2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.100 1
45+
2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.200 1
46+
2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.300 1
47+
2024-12-06T12:08:01+00:00 srv.gtw.test 200 0.400 1
48+
2024-12-06T12:08:01+00:00 srv.gtw.test 200 0.500 1
4949
"""
5050
),
5151
{
@@ -60,10 +60,10 @@
6060
pytest.param(
6161
dedent(
6262
"""
63-
2024-12-06T12:04:50+00:00 srv.gtw.test 200 0.400
64-
2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.300
65-
2024-12-06T12:09:15+00:00 srv.gtw.test 200 0.200
66-
2024-12-06T12:09:45+00:00 srv.gtw.test 200 0.100
63+
2024-12-06T12:04:50+00:00 srv.gtw.test 200 0.400 1
64+
2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.300 1
65+
2024-12-06T12:09:15+00:00 srv.gtw.test 200 0.200 1
66+
2024-12-06T12:09:45+00:00 srv.gtw.test 200 0.100 1
6767
"""
6868
),
6969
{
@@ -75,6 +75,23 @@
7575
},
7676
id="ignores-out-of-window",
7777
),
78+
pytest.param(
79+
dedent(
80+
"""
81+
2024-12-06T12:08:01+00:00 srv.gtw.test 200 0.100 1
82+
2024-12-06T12:08:02+00:00 srv.gtw.test 200 0.200 0
83+
2024-12-06T12:08:03+00:00 srv.gtw.test 200 0.300 1
84+
"""
85+
),
86+
{
87+
"srv.gtw.test": {
88+
30: Stat(requests=0, request_time=0.0),
89+
60: Stat(requests=0, request_time=0.0),
90+
300: Stat(requests=2, request_time=0.2),
91+
},
92+
},
93+
id="ignores-replica-not-hit",
94+
),
7895
pytest.param(
7996
dedent(
8097
"""
@@ -93,7 +110,7 @@
93110
300: Stat(requests=4, request_time=0.25),
94111
},
95112
},
96-
id="ignores-irrelevant-statuses",
113+
id="ignores-irrelevant-statuses-in-legacy-pre-0.19.11-log",
97114
),
98115
pytest.param(
99116
"",

0 commit comments

Comments
 (0)