Skip to content

Commit

Permalink
adding more changes
Browse files: browse the repository at this point in the history
Signed-off-by: Mahesh Shetty <[email protected]>
  • Loading branch information
Mahesh Shetty authored and Mahesh Shetty committed Oct 24, 2024
1 parent 30d703f commit d755020
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 44 deletions.
80 changes: 50 additions & 30 deletions ocs_ci/ocs/resources/stretchcluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,28 +202,36 @@ def check_for_read_pause(self, label, start_time, end_time):
"""
paused = 0
max_fail_expected = len(self.workload_map[label][0]) - 2
failed = 0
for pod_obj in self.workload_map[label][0]:
if get_pod_node(pod_obj).name in self.non_quorum_nodes:
logger.info(
f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
# if get_pod_node(pod_obj).name in self.non_quorum_nodes:
# logger.info(
# f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
# )
# continue
try:
pause_count = 0
time_var = start_time
pod_log = get_pod_logs(
pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
)
continue
pause_count = 0
time_var = start_time
pod_log = get_pod_logs(
pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
)
logger.info(f"Current pod: {pod_obj.name}")
while time_var <= (end_time + timedelta(minutes=1)):
t_time = time_var.strftime("%H:%M")
if f" {t_time}" not in pod_log:
pause_count += 1
logger.info(f"Read pause: {t_time}")
logger.info(f"Current pod: {pod_obj.name}")
while time_var <= (end_time + timedelta(minutes=1)):
t_time = time_var.strftime("%H:%M")
if f" {t_time}" not in pod_log:
pause_count += 1
logger.info(f"Read pause: {t_time}")
else:
logger.info(f"Read success: {t_time}")
time_var = time_var + timedelta(minutes=1)
if pause_count > 5:
paused += 1
except CommandFailed:
if failed <= max_fail_expected:
failed += 1
else:
logger.info(f"Read success: {t_time}")
time_var = time_var + timedelta(minutes=1)
if pause_count > 5:
paused += 1
raise
return paused

@retry(CommandFailed, tries=6, delay=10)
Expand All @@ -241,13 +249,19 @@ def check_for_write_pause(self, label, start_time, end_time):
"""
paused = 0
max_fail_expected = (
len(self.workload_map[label][0]) - 2
if label == constants.LOGWRITER_CEPHFS_LABEL
else 1
)
failed = 0
for pod_obj in self.workload_map[label][0]:
if get_pod_node(pod_obj).name in self.non_quorum_nodes:
logger.info(
f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
)
continue
excepted = 0
# if get_pod_node(pod_obj).name in self.non_quorum_nodes:
# logger.info(
# f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
# )
# continue
no_such_file_expected = 1
for file_name in self.logfile_map[label][0]:
pause_count = 0
try:
Expand All @@ -269,13 +283,16 @@ def check_for_write_pause(self, label, start_time, end_time):
"No such file or directory" in err.args[0]
and label == constants.LOGWRITER_RBD_LABEL
):
if excepted == 0:
if no_such_file_expected == 1:
logger.info(
f"Seems like file {file_name} is not in RBD pod {pod_obj.name}"
)
excepted += 1
no_such_file_expected += 1
else:
raise UnexpectedBehaviour
failed += 1
elif failed <= max_fail_expected:
failed += 1
else:
raise

Expand Down Expand Up @@ -469,7 +486,10 @@ def check_ceph_accessibility(self, timeout, delay=5, grace=120):
if "TimeoutExpired" in err.args[0]:
logger.error("Ceph status check got timed out. maybe ceph is hung.")
return False
elif "connect: no route to host" in err.args[0]:
elif (
"connect: no route to host" in err.args[0]
or "error dialing backend" in err.args[0]
):
ceph_tools_pod.delete(wait=False)
raise

Expand Down Expand Up @@ -634,7 +654,7 @@ def cephfs_failure_checks(
self.check_for_read_pause(
constants.LOGREADER_CEPHFS_LABEL, start_time, end_time
)
== 0
<= 2
), "Read operations are paused for CephFS workloads even for the ones in available zones"
logger.info("All read operations are successful for CephFs workload")

Expand All @@ -653,7 +673,7 @@ def rbd_failure_checks(self, start_time, end_time, **kwargs):
start_time,
end_time,
)
== 0
<= 1
), "Write operations paused for RBD workloads even for the ones in available zone"
logger.info("all write operations are successful for RBD workloads")

Expand Down
28 changes: 14 additions & 14 deletions tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,14 @@ def finalizer():
@pytest.mark.parametrize(
argnames="zones, duration",
argvalues=[
# pytest.param(
# constants.NETSPLIT_DATA_1_DATA_2,
# 30,
# marks=[
# pytest.mark.polarion_id("OCS-5069"),
# pytest.mark.polarion_id("OCS-5071"),
# ],
# ),
pytest.param(
constants.NETSPLIT_DATA_1_DATA_2,
30,
marks=[
pytest.mark.polarion_id("OCS-5069"),
pytest.mark.polarion_id("OCS-5071"),
],
),
pytest.param(
constants.NETSPLIT_ARBITER_DATA_1,
15,
Expand All @@ -118,7 +118,7 @@ def finalizer():
),
],
ids=[
# "Data-1-Data-2",
"Data-1-Data-2",
"Arbiter-Data-1",
"Arbiter-Data-1-and-Arbiter-Data-2",
"Arbiter-Data-1-and-Data-1-Data-2",
Expand Down Expand Up @@ -197,11 +197,11 @@ def test_netsplit(

# get the nodes which are present in the
# out of quorum zone
if (
zones != constants.NETSPLIT_ARBITER_DATA_1
and zones != constants.NETSPLIT_ARBITER_DATA_1_AND_ARBITER_DATA_2
):
sc_obj.get_out_of_quorum_nodes()
# if (
# zones != constants.NETSPLIT_ARBITER_DATA_1
# and zones != constants.NETSPLIT_ARBITER_DATA_1_AND_ARBITER_DATA_2
# ):
# sc_obj.get_out_of_quorum_nodes()

# check for ceph accessibility and note the end time (UTC)
timeout = (end_time - datetime.now(timezone.utc)).total_seconds()
Expand Down

0 comments on commit d755020

Please sign in to comment.