Skip to content

Commit a09a9bb

Browse files
fix(cmapi): MCOL-6091 CMAPI gives DMLProc only 10 seconds for a graceful stop
Fix the default timeout for stop node, stop DMLProc, shutdown controller, the put_config handler, etc. Every place that could shorten the DMLProc graceful-stop timeout is fixed: - cluster and node shutdown - stop_dmlproc - start_transaction - put_config - toggle_cluster_state
1 parent 90cac6b commit a09a9bb

File tree

8 files changed

+98
-49
lines changed

8 files changed

+98
-49
lines changed

cmapi/cmapi_server/constants.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,10 @@ class ProgInfo(NamedTuple):
116116
CMAPI_PORT = 8640 #TODO: use it in all places
117117
CURRENT_NODE_CMAPI_URL = f'https://localhost:{CMAPI_PORT}'
118118
REQUEST_TIMEOUT: float = 30.0
119-
TRANSACTION_TIMEOUT: float = 300.0 # 5 minutes
119+
120+
DMLPROC_SHUTDOWN_TIMEOUT: float = 300.0 # 5 minutes, should be less than LONG_REQUEST_TIMEOUT
121+
LONG_REQUEST_TIMEOUT: float = 400.0 # should be less than TRANSACTION_TIMEOUT
122+
TRANSACTION_TIMEOUT: float = 600.0 # 10 minutes
120123

121124
# API version
122125
_version = '0.4.0'

cmapi/cmapi_server/controllers/endpoints.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,18 @@
1717
from mcs_node_control.models.node_config import NodeConfig
1818
from mcs_node_control.models.node_status import NodeStatus
1919
from cmapi_server.constants import (
20-
CMAPI_PACKAGE_NAME, CMAPI_PORT, DEFAULT_MCS_CONF_PATH,
21-
DEFAULT_SM_CONF_PATH, EM_PATH_SUFFIX, MCS_BRM_CURRENT_PATH, MCS_EM_PATH,
22-
MDB_CS_PACKAGE_NAME, MDB_SERVER_PACKAGE_NAME, REQUEST_TIMEOUT,
23-
S3_BRM_CURRENT_PATH, SECRET_KEY,
20+
CMAPI_PACKAGE_NAME,
21+
CMAPI_PORT,
22+
DEFAULT_MCS_CONF_PATH,
23+
DMLPROC_SHUTDOWN_TIMEOUT,
24+
EM_PATH_SUFFIX,
25+
MCS_BRM_CURRENT_PATH,
26+
MCS_EM_PATH,
27+
MDB_CS_PACKAGE_NAME,
28+
MDB_SERVER_PACKAGE_NAME,
29+
REQUEST_TIMEOUT,
30+
S3_BRM_CURRENT_PATH,
31+
SECRET_KEY,
2432
)
2533
from cmapi_server.controllers.api_clients import NodeControllerClient
2634
from cmapi_server import helpers
@@ -728,7 +736,7 @@ def put_shutdown(self):
728736
req = cherrypy.request
729737
use_sudo = get_use_sudo(req.app.config)
730738
request_body = cherrypy.request.json
731-
timeout = request_body.get('timeout', 0)
739+
timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT)
732740
node_config = NodeConfig()
733741
try:
734742
MCSProcessManager.stop_node(
@@ -897,7 +905,7 @@ def put_shutdown(self):
897905

898906
request = cherrypy.request
899907
request_body = request.json
900-
timeout = request_body.get('timeout', None)
908+
timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT)
901909
force = request_body.get('force', False)
902910
config = request_body.get('config', DEFAULT_MCS_CONF_PATH)
903911
in_transaction = request_body.get('in_transaction', False)
@@ -907,7 +915,7 @@ def put_shutdown(self):
907915
with TransactionManager():
908916
response = ClusterHandler.shutdown(config, timeout)
909917
else:
910-
response = ClusterHandler.shutdown(config)
918+
response = ClusterHandler.shutdown(config, timeout)
911919
except CMAPIBasicError as err:
912920
raise_422_error(module_logger, func_name, err.message)
913921

@@ -1597,7 +1605,7 @@ def put_stop_dmlproc(self):
15971605

15981606
request = cherrypy.request
15991607
request_body = request.json
1600-
timeout = request_body.get('timeout', 10)
1608+
timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT)
16011609
force = request_body.get('force', False)
16021610

16031611
if force:

cmapi/cmapi_server/handlers/cluster.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@
1616
from tracing.traced_session import get_traced_session
1717

1818
from cmapi_server.constants import (
19-
CMAPI_CONF_PATH, CMAPI_PORT, DEFAULT_MCS_CONF_PATH, REQUEST_TIMEOUT,
19+
CMAPI_CONF_PATH,
20+
CMAPI_PORT,
21+
DEFAULT_MCS_CONF_PATH,
22+
DMLPROC_SHUTDOWN_TIMEOUT,
23+
REQUEST_TIMEOUT,
2024
)
2125
from cmapi_server.exceptions import CMAPIBasicError, exc_to_cmapi_error
2226
from cmapi_server.controllers.api_clients import NodeControllerClient
@@ -44,7 +48,7 @@ class ClusterAction(Enum):
4448

4549

4650
def toggle_cluster_state(
47-
action: ClusterAction, config: str) -> dict:
51+
action: ClusterAction, config: str, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT) -> dict:
4852
"""Toggle the state of the cluster (start or stop).
4953
5054
:param action: The cluster action to perform.
@@ -64,7 +68,7 @@ def toggle_cluster_state(
6468

6569
switch_node_maintenance(maintainance_flag)
6670
update_revision_and_manager()
67-
broadcast_new_config(config, distribute_secrets=True)
71+
broadcast_new_config(config, distribute_secrets=True, timeout=timeout)
6872

6973

7074
class ClusterHandler:
@@ -161,15 +165,15 @@ def start(config: str = DEFAULT_MCS_CONF_PATH) -> dict:
161165

162166
@staticmethod
163167
def shutdown(
164-
config: str = DEFAULT_MCS_CONF_PATH, timeout: Optional[int] = None
168+
config: str = DEFAULT_MCS_CONF_PATH, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT,
165169
) -> dict:
166170
"""Method to stop the MCS Cluster.
167171
168172
:param config: columnstore xml config file path,
169173
defaults to DEFAULT_MCS_CONF_PATH
170174
:type config: str, optional
171175
:param timeout: timeout in seconds to gracefully stop DMLProc,
172-
defaults to None
176+
defaults to DMLPROC_SHUTDOWN_TIMEOUT
173177
:type timeout: int, optional
174178
:raises CMAPIBasicError: if no nodes in the cluster
175179
:return: start timestamp
@@ -180,7 +184,7 @@ def shutdown(
180184
'Cluster shutdown command called. Shutting down the cluster.'
181185
)
182186
operation_start_time = str(datetime.now())
183-
toggle_cluster_state(ClusterAction.STOP, config)
187+
toggle_cluster_state(ClusterAction.STOP, config, timeout=timeout)
184188
logger.debug('Successfully finished shutting down the cluster.')
185189
return {'timestamp': operation_start_time}
186190

cmapi/cmapi_server/helpers.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,15 @@
2828
requests.packages.urllib3.disable_warnings() # pylint: disable=no-member
2929

3030
from cmapi_server.constants import (
31-
CMAPI_CONF_PATH, CMAPI_DEFAULT_CONF_PATH, DEFAULT_MCS_CONF_PATH,
32-
DEFAULT_SM_CONF_PATH, LOCALHOSTS, _version
31+
CMAPI_CONF_PATH,
32+
CMAPI_DEFAULT_CONF_PATH,
33+
DEFAULT_MCS_CONF_PATH,
34+
DEFAULT_SM_CONF_PATH,
35+
DMLPROC_SHUTDOWN_TIMEOUT,
36+
LOCALHOSTS,
37+
LONG_REQUEST_TIMEOUT,
38+
TRANSACTION_TIMEOUT,
39+
_version
3340
)
3441
from cmapi_server.handlers.cej import CEJPasswordHandler
3542
from cmapi_server.managers.process import MCSProcessManager
@@ -63,7 +70,7 @@ def start_transaction(
6370
remove_nodes: Optional[list] = None,
6471
optional_nodes: Optional[list] = None,
6572
txn_id: Optional[int] = None,
66-
timeout: float = 300.0
73+
timeout: float = TRANSACTION_TIMEOUT
6774
):
6875
"""Start internal CMAPI transaction.
6976
@@ -87,7 +94,7 @@ def start_transaction(
8794
:param txn_id: id for transaction to start, defaults to None
8895
:type txn_id: Optional[int], optional
8996
:param timeout: time in seconds for cmapi transaction lock before it ends
90-
automatically, defaults to 300
97+
automatically, defaults to TRANSACTION_TIMEOUT
9198
:type timeout: float, optional
9299
:return: (success, txn_id, nodes)
93100
:rtype: tuple[bool, int, list[str]]
@@ -324,8 +331,7 @@ def broadcast_new_config(
324331
defaults to DEFAULT_SM_CONF_PATH
325332
:param test_mode: for test purposes, defaults to False TODO: remove
326333
:param nodes: nodes list for config put, defaults to None
327-
:param timeout: timeout passing to gracefully stop DMLProc TODO: for next
328-
releases. Could affect all logic of broadcacting new config
334+
:param timeout: timeout passing to gracefully stop DMLProc process,
329335
:param distribute_secrets: flag to distribute secrets to nodes
330336
:param stateful_config_dict: stateful config update dict to distribute to nodes
331337
:raises CMAPIBasicError: If Broadcasting config to nodes failed with errors
@@ -341,7 +347,7 @@ def broadcast_new_config(
341347
headers = {'x-api-key': key}
342348
if stateful_config_dict:
343349
body = {
344-
'timeout': 300,
350+
'timeout': DMLPROC_SHUTDOWN_TIMEOUT if timeout is None else timeout,
345351
'stateful_config_dict': stateful_config_dict,
346352
'only_stateful_config': True,
347353
}
@@ -357,7 +363,7 @@ def broadcast_new_config(
357363
body = {
358364
'manager': root.find('./ClusterManager').text,
359365
'revision': root.find('./ConfigRevision').text,
360-
'timeout': 300,
366+
'timeout': DMLPROC_SHUTDOWN_TIMEOUT if timeout is None else timeout,
361367
'config': config_text,
362368
'mcs_config_filename': cs_config_filename,
363369
'sm_config_filename': sm_config_filename,
@@ -395,7 +401,7 @@ async def update_config(node: str, headers: dict, body: dict) -> None:
395401
async with create_traced_async_session() as session:
396402
try:
397403
async with session.put(
398-
url, headers=headers, json=body, ssl=False, timeout=120
404+
url, headers=headers, json=body, ssl=False, timeout=LONG_REQUEST_TIMEOUT
399405
) as response:
400406
resp_json = await response.json(encoding='utf-8')
401407
response.raise_for_status()

cmapi/cmapi_server/managers/process.py

Lines changed: 45 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,18 @@
33
import logging
44
import os.path
55
import socket
6+
import time
67
from time import sleep
78

89
import psutil
910

10-
from cmapi_server.constants import ALL_MCS_PROGS, MCS_INSTALL_BIN, MCSProgs, ProgInfo
11+
from cmapi_server.constants import (
12+
ALL_MCS_PROGS,
13+
DMLPROC_SHUTDOWN_TIMEOUT,
14+
MCS_INSTALL_BIN,
15+
MCSProgs,
16+
ProgInfo,
17+
)
1118
from cmapi_server.exceptions import CMAPIBasicError
1219
from cmapi_server.process_dispatchers.base import BaseDispatcher
1320
from cmapi_server.process_dispatchers.container import ContainerDispatcher
@@ -238,32 +245,49 @@ def _wait_for_controllernode(cls) -> bool:
238245
return True
239246

240247
@classmethod
241-
def _wait_for_DMLProc_stop(cls, timeout: int = 10) -> bool:
248+
def _wait_for_DMLProc_stop(cls, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT) -> bool:
242249
"""Waiting DMLProc process to stop.
243250
244-
:param timeout: timeout to wait, defaults to 10
251+
:param timeout: timeout to wait in seconds, defaults to DMLPROC_SHUTDOWN_TIMEOUT
245252
:type timeout: int, optional
246253
:return: True on success
247254
:rtype: bool
248255
"""
249256
logging.info(f'Waiting for DMLProc to stop in {timeout} seconds')
250-
dmlproc_stopped = False
251-
while timeout > 0:
252-
logging.info(
253-
f'Waiting for DMLProc to stop. Seconds left {timeout}.'
254-
)
257+
# Use a deadline-based loop with throttled logging to reduce noise.
258+
deadline = time.monotonic() + max(1, int(timeout))
259+
LOG_INTERVAL = 30 # seconds
260+
next_log_in = 0 # log immediately on first iteration
261+
262+
while True:
263+
remaining = int(deadline - time.monotonic())
264+
if remaining <= 0:
265+
break
266+
255267
if not Process.check_process_alive('DMLProc'):
256268
logging.info('DMLProc gracefully stopped by DBRM command.')
257-
dmlproc_stopped = True
258-
break
259-
sleep(1)
260-
timeout -= 1
261-
else:
262-
logging.error(
263-
f'DMLProc did not stopped gracefully by DBRM command within '
264-
f'{timeout} seconds. Will be stopped directly.'
265-
)
266-
return dmlproc_stopped
269+
return True
270+
271+
# Throttle waiting logs to roughly once every LOG_INTERVAL seconds
272+
if next_log_in <= 0:
273+
sleep_for = min(10, remaining)
274+
logging.info(
275+
(
276+
f'Waiting for DMLProc to stop. Seconds left ~{remaining}. '
277+
f'Sleeping {sleep_for} seconds before next check.'
278+
)
279+
)
280+
next_log_in = LOG_INTERVAL
281+
282+
sleep_for = min(10, remaining)
283+
sleep(sleep_for)
284+
next_log_in -= sleep_for
285+
286+
logging.error(
287+
'DMLProc didn\'t stop gracefully by DBRM command within '
288+
f'{timeout} seconds. Will be stopped directly.'
289+
)
290+
return False
267291

268292
@classmethod
269293
def noop(cls, *args, **kwargs):
@@ -324,7 +348,7 @@ def start(cls, name: str, is_primary: bool, use_sudo: bool) -> bool:
324348

325349
@classmethod
326350
def stop(
327-
cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = 10
351+
cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT
328352
) -> bool:
329353
"""Stop mcs process.
330354
@@ -455,7 +479,7 @@ def stop_node(
455479
cls,
456480
is_primary: bool,
457481
use_sudo: bool = True,
458-
timeout: int = 10,
482+
timeout: int = DMLPROC_SHUTDOWN_TIMEOUT,
459483
):
460484
"""Stop mcs node processes.
461485
@@ -472,7 +496,7 @@ def stop_node(
472496
# undefined behaviour when primary gone and then recovers (failover
473497
# triggered 2 times).
474498
for prog_name in cls._get_sorted_progs(is_primary=True, reverse=True):
475-
if not cls.stop(prog_name, is_primary, use_sudo):
499+
if not cls.stop(prog_name, is_primary, use_sudo, timeout=timeout):
476500
logging.error(f'Process "{prog_name}" not stopped properly.')
477501
raise CMAPIBasicError(f'Error while stopping "{prog_name}"')
478502

cmapi/cmapi_server/managers/upgrade/repo.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,6 @@ def get_latest_tested_mdb_version(cls) -> str:
154154
:raises CMAPIBasicError: no latest version matched with latest tested
155155
:raises CMAPIBasicError: if request error
156156
:return: latest MDB version matched with latest tested major
157-
:rtype: str
158157
"""
159158
try:
160159
# Download the keyring file
@@ -174,7 +173,7 @@ def get_latest_tested_mdb_version(cls) -> str:
174173
)
175174
latest_version_num = sorted(latest_version_nums, reverse=True)[0]
176175
logging.debug(
177-
'Succesfully got latest MBD version number: '
176+
'Succesfully got latest MDB version number: '
178177
f'{latest_version_num}'
179178
)
180179
except requests.RequestException as exc:

cmapi/mcs_cluster_tool/cluster_app.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ def stop(
159159
# could affect put_config (helpers.py broadcast_config) operation
160160
timeout = 0
161161

162+
#TODO: bypass timeout here
162163
resp = client.shutdown_cluster({'in_transaction': True})
163164
return {'timestamp': start_time}
164165

cmapi/mcs_cluster_tool/tools_commands.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,12 @@
1919

2020

2121
from cmapi_server.constants import (
22-
MCS_DATA_PATH, MCS_SECRETS_FILENAME, REQUEST_TIMEOUT, TRANSACTION_TIMEOUT,
23-
CMAPI_CONF_PATH, CMAPI_PORT,
22+
CMAPI_CONF_PATH,
23+
CMAPI_PORT,
24+
MCS_DATA_PATH,
25+
MCS_SECRETS_FILENAME,
26+
REQUEST_TIMEOUT,
27+
TRANSACTION_TIMEOUT,
2428
)
2529
from cmapi_server.controllers.api_clients import (
2630
AppControllerClient, ClusterControllerClient, NodeControllerClient

0 commit comments

Comments
 (0)