Skip to content

Commit 384dd56

Browse files
Moved locks cleanup into cmapi
1 parent c75a095 commit 384dd56

File tree

7 files changed

+75
-65
lines changed

7 files changed

+75
-65
lines changed

cmapi/cmapi_server/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ class ProgInfo(NamedTuple):
9292
LIBJEMALLOC_DEFAULT_PATH = os.path.join(MCS_DATA_PATH, 'libjemalloc.so.2')
9393
MCS_LOG_PATH = '/var/log/mariadb/columnstore'
9494

95+
# tools for BRM shmem lock inspection/reset
96+
SHMEM_LOCKS_PATH = os.path.join(MCS_INSTALL_BIN, 'mcs-shmem-locks')
97+
RESET_LOCKS_PATH = os.path.join(MCS_INSTALL_BIN, 'reset_locks')
9598

9699
# client constants
97100
CMAPI_PORT = 8640 #TODO: use it in all places

cmapi/cmapi_server/process_dispatchers/container.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from cmapi_server.constants import (
1414
IFLAG, LIBJEMALLOC_DEFAULT_PATH, MCS_INSTALL_BIN, ALL_MCS_PROGS,
1515
)
16+
from cmapi_server.process_dispatchers import utils as dispatcher_utils
1617
from cmapi_server.exceptions import CMAPIBasicError
1718
from cmapi_server.process_dispatchers.base import BaseDispatcher
1819

@@ -223,15 +224,7 @@ def stop(
223224
# These stale locks can occur if the controllernode couldn't stop correctly
224225
# and they cause mcs-savebrm.py to hang
225226

226-
logger.debug('Pre-stop: inspecting and resetting shmem locks.')
227-
prestop_path = os.path.join(MCS_INSTALL_BIN, 'mcs-prestop-workernode.sh')
228-
prestop_logpath = cls._create_mcs_process_logfile(
229-
'mcs-prestop-workernode.log'
230-
)
231-
with open(prestop_logpath, 'a', encoding='utf-8') as prestop_logfh:
232-
_success, _ = cls.exec_command(
233-
prestop_path, stdout=prestop_logfh
234-
)
227+
dispatcher_utils.reset_shmem_locks(logger)
235228

236229
# start mcs-savebrm.py before stopping workernode
237230
logger.debug('Waiting to save BRM.')

cmapi/cmapi_server/process_dispatchers/systemd.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import Union, Tuple
66

77
from cmapi_server.process_dispatchers.base import BaseDispatcher
8+
from cmapi_server.process_dispatchers import utils as dispatcher_utils
89

910

1011
class SystemdDispatcher(BaseDispatcher):
@@ -164,6 +165,11 @@ def stop(
164165
"""
165166
service_name = service
166167
if service_name == 'mcs-workernode':
168+
# Run pre-stop lock reset before saving BRM
169+
# These stale locks can occur if the controllernode couldn't stop correctly
170+
# and they cause mcs-savebrm.py to hang
171+
dispatcher_utils.reset_shmem_locks(logging.getLogger(__name__))
172+
167173
service_name = f'{service_name}@1.service {service_name}@2.service'
168174
cls._workernode_enable(False, use_sudo)
169175

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import logging
2+
import re
3+
from time import sleep
4+
from typing import Optional
5+
from cmapi_server.constants import SHMEM_LOCKS_PATH, RESET_LOCKS_PATH
6+
from cmapi_server.process_dispatchers.base import BaseDispatcher
7+
8+
9+
def parse_locks_num(cmd_output: str) -> int:
    """Sum the reader and writer counts found in mcs-shmem-locks output.

    Each line of ``cmd_output`` is checked on its own. A line contributes
    to the total only when it starts (after optional whitespace) with
    ``readers = N`` or ``writers = N``; every other line is ignored.

    Args:
        cmd_output: Text to scan, one field per line.

    Returns:
        The sum of all matched reader and writer counts, ``0`` if no line
        matched.
    """
    counter_re = re.compile(r'^\s*(readers|writers)\s*=\s*(\d+)')
    total = 0
    for row in cmd_output.splitlines():
        found = counter_re.search(row)
        if found is None:
            continue
        try:
            total += int(found.group(2))
        except ValueError:
            # Best effort: skip a count that int() refuses to convert.
            continue
    return total
20+
21+
22+
def get_active_shmem_locks_num(logger: logging.Logger) -> Optional[int]:
    """Return the number of active shmem locks, or None if unknown.

    Runs ``SHMEM_LOCKS_PATH --lock-id 0`` through
    ``BaseDispatcher.exec_command`` and feeds its output to
    ``parse_locks_num``.

    Args:
        logger: Logger that receives the error and debug messages.

    Returns:
        The summed readers + writers count, or ``None`` when the command
        reported failure or produced no output (an error is logged in
        both cases).
    """
    inspect_cmd = f'{SHMEM_LOCKS_PATH} --lock-id 0'
    ok, output = BaseDispatcher.exec_command(inspect_cmd)

    if not ok:
        logger.error('Failed to inspect shmem locks (command failed)')
        return None

    if not output:
        logger.error('Failed to inspect shmem locks (empty output)')
        return None

    # output is known to be non-empty here, so it can be stripped directly.
    logger.debug('Current lock state:\n%s', output.strip())
    return parse_locks_num(output)
36+
37+
38+
def reset_shmem_locks(logger: logging.Logger) -> None:
    """Inspect BRM shmem locks and reset them if any are active.

    Steps:
      1. Query the current lock count; give up if it cannot be obtained.
      2. If no locks are active, log that and return.
      3. Otherwise run ``RESET_LOCKS_PATH -s``, wait one second and query
         the count again, logging an error if locks are still active.

    Args:
        logger: Logger that receives all progress and error messages.
    """
    logger.debug('Inspecting and resetting shmem locks.')

    # Current lock state; None means the inspection itself failed.
    locks_before = get_active_shmem_locks_num(logger)
    if locks_before is None:
        return

    # Nothing is held, so no reset is needed.
    if locks_before <= 0:
        logger.info('No active shmem locks detected.')
        return

    logger.info(
        'Detected active shmem locks (sum readers+writers=%d). Attempting reset.',
        locks_before,
    )

    # Reset locks; only the success flag of the command is used.
    reset_ok, _ = BaseDispatcher.exec_command(f'{RESET_LOCKS_PATH} -s')
    if not reset_ok:
        logger.error('Failed to reset shmem locks (command failed)')
        return

    # Pause for a second, then check that the locks were reset.
    sleep(1)
    locks_after = get_active_shmem_locks_num(logger)
    still_locked = locks_after is not None and locks_after > 0
    if still_locked:
        logger.error('Failed to reset shmem locks (locks are still active)')

oam/install_scripts/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,6 @@ set(SHMEM_FILE_GLOB "MCS-shm-")
163163
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py" @ONLY)
164164
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py" @ONLY)
165165
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcssavebrm.py" @ONLY)
166-
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-prestop-workernode.sh.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-prestop-workernode.sh" @ONLY)
167166
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog.in" "${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog" @ONLY)
168167

169168
columnstore_install_program(columnstore-post-install ${ENGINE_BINDIR})
@@ -175,7 +174,6 @@ columnstore_install_program(columnstoreSyslogSetup.sh ${ENGINE_BINDIR})
175174
columnstore_install_program(mcs-stop-controllernode.sh ${ENGINE_BINDIR})
176175
columnstore_install_program(mcs-loadbrm.py ${ENGINE_BINDIR})
177176
columnstore_install_program(mcs-savebrm.py ${ENGINE_BINDIR})
178-
columnstore_install_program(mcs-prestop-workernode.sh ${ENGINE_BINDIR})
179177
columnstore_install_program(mariadb-columnstore-start.sh ${ENGINE_BINDIR})
180178
columnstore_install_program(mariadb-columnstore-stop.sh ${ENGINE_BINDIR})
181179
columnstore_install_program(loop_process_starter.sh ${ENGINE_BINDIR})

oam/install_scripts/mcs-prestop-workernode.sh.in

Lines changed: 0 additions & 53 deletions
This file was deleted.

oam/install_scripts/mcs-workernode.service.in

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ LimitCORE=@CORE_DUMPS@
1313

1414
Environment="@WORKERNODE_ALLOC_CONFIG@"
1515
ExecStart=@ENGINE_BINDIR@/workernode DBRM_Worker%i
16-
ExecStop=/usr/bin/env bash -c "@ENGINE_BINDIR@/mcs-prestop-workernode.sh; /bin/kill -TERM $MAINPID"
1716
ExecStopPost=@ENGINE_BINDIR@/mcs-savebrm.py
1817
ExecStopPost=/usr/bin/env bash -c "clearShm > /dev/null 2>&1"
1918

0 commit comments

Comments
 (0)