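Tolerate expected pod-log failures in stretch cluster pause checks

- check_for_read_pause / check_for_write_pause: stop skipping pods on
  non-quorum nodes; instead tolerate a bounded number of CommandFailed
  errors while fetching or reading pod logs.
- check_ceph_accessibility: also delete the ceph tools pod when the
  error contains "error dialing backend".
- cephfs_failure_checks / rbd_failure_checks: relax the paused-pod
  assertions from == 0 to <= 2 (CephFS reads) and <= 1 (RBD writes).
- test_netsplit: re-enable the Data-1-Data-2 parametrization and comment
  out the get_out_of_quorum_nodes() call.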

Signed-off-by: Mahesh Shetty <[email protected]>
red-hat-storage · Oct 24, 2024 · d755020 · d755020
1 parent 30d703f
commit d755020
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 44 deletions.
diff --git a/ocs_ci/ocs/resources/stretchcluster.py b/ocs_ci/ocs/resources/stretchcluster.py
@@ -202,28 +202,36 @@ def check_for_read_pause(self, label, start_time, end_time):
 
         """
         paused = 0
+        max_fail_expected = len(self.workload_map[label][0]) - 2
+        failed = 0
         for pod_obj in self.workload_map[label][0]:
-            if get_pod_node(pod_obj).name in self.non_quorum_nodes:
-                logger.info(
-                    f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
+            # if get_pod_node(pod_obj).name in self.non_quorum_nodes:
+            #     logger.info(
+            #         f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
+            #     )
+            #     continue
+            try:
+                pause_count = 0
+                time_var = start_time
+                pod_log = get_pod_logs(
+                    pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
                 )
-                continue
-            pause_count = 0
-            time_var = start_time
-            pod_log = get_pod_logs(
-                pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
-            )
-            logger.info(f"Current pod: {pod_obj.name}")
-            while time_var <= (end_time + timedelta(minutes=1)):
-                t_time = time_var.strftime("%H:%M")
-                if f" {t_time}" not in pod_log:
-                    pause_count += 1
-                    logger.info(f"Read pause: {t_time}")
+                logger.info(f"Current pod: {pod_obj.name}")
+                while time_var <= (end_time + timedelta(minutes=1)):
+                    t_time = time_var.strftime("%H:%M")
+                    if f" {t_time}" not in pod_log:
+                        pause_count += 1
+                        logger.info(f"Read pause: {t_time}")
+                    else:
+                        logger.info(f"Read success: {t_time}")
+                    time_var = time_var + timedelta(minutes=1)
+                if pause_count > 5:
+                    paused += 1
+            except CommandFailed:
+                if failed <= max_fail_expected:
+                    failed += 1
                 else:
-                    logger.info(f"Read success: {t_time}")
-                time_var = time_var + timedelta(minutes=1)
-            if pause_count > 5:
-                paused += 1
+                    raise
         return paused
 
     @retry(CommandFailed, tries=6, delay=10)
@@ -241,13 +249,19 @@ def check_for_write_pause(self, label, start_time, end_time):
 
         """
         paused = 0
+        max_fail_expected = (
+            len(self.workload_map[label][0]) - 2
+            if label == constants.LOGWRITER_CEPHFS_LABEL
+            else 1
+        )
+        failed = 0
         for pod_obj in self.workload_map[label][0]:
-            if get_pod_node(pod_obj).name in self.non_quorum_nodes:
-                logger.info(
-                    f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
-                )
-                continue
-            excepted = 0
+            # if get_pod_node(pod_obj).name in self.non_quorum_nodes:
+            #     logger.info(
+            #         f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
+            #     )
+            #     continue
+            no_such_file_expected = 1
             for file_name in self.logfile_map[label][0]:
                 pause_count = 0
                 try:
@@ -269,13 +283,16 @@ def check_for_write_pause(self, label, start_time, end_time):
                         "No such file or directory" in err.args[0]
                         and label == constants.LOGWRITER_RBD_LABEL
                     ):
-                        if excepted == 0:
+                        if no_such_file_expected == 1:
                             logger.info(
                                 f"Seems like file {file_name} is not in RBD pod {pod_obj.name}"
                             )
-                            excepted += 1
+                            no_such_file_expected += 1
                         else:
                             raise UnexpectedBehaviour
+                        failed += 1
+                    elif failed <= max_fail_expected:
+                        failed += 1
                     else:
                         raise
 
@@ -469,7 +486,10 @@ def check_ceph_accessibility(self, timeout, delay=5, grace=120):
             if "TimeoutExpired" in err.args[0]:
                 logger.error("Ceph status check got timed out. maybe ceph is hung.")
                 return False
-            elif "connect: no route to host" in err.args[0]:
+            elif (
+                "connect: no route to host" in err.args[0]
+                or "error dialing backend" in err.args[0]
+            ):
                 ceph_tools_pod.delete(wait=False)
             raise
 
@@ -634,7 +654,7 @@ def cephfs_failure_checks(
             self.check_for_read_pause(
                 constants.LOGREADER_CEPHFS_LABEL, start_time, end_time
             )
-            == 0
+            <= 2
         ), "Read operations are paused for CephFS workloads even for the ones in available zones"
         logger.info("All read operations are successful for CephFs workload")
 
@@ -653,7 +673,7 @@ def rbd_failure_checks(self, start_time, end_time, **kwargs):
                 start_time,
                 end_time,
             )
-            == 0
+            <= 1
         ), "Write operations paused for RBD workloads even for the ones in available zone"
         logger.info("all write operations are successful for RBD workloads")
 

diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py b/tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
@@ -84,14 +84,14 @@ def finalizer():
     @pytest.mark.parametrize(
         argnames="zones, duration",
         argvalues=[
-            # pytest.param(
-            #     constants.NETSPLIT_DATA_1_DATA_2,
-            #     30,
-            #     marks=[
-            #         pytest.mark.polarion_id("OCS-5069"),
-            #         pytest.mark.polarion_id("OCS-5071"),
-            #     ],
-            # ),
+            pytest.param(
+                constants.NETSPLIT_DATA_1_DATA_2,
+                30,
+                marks=[
+                    pytest.mark.polarion_id("OCS-5069"),
+                    pytest.mark.polarion_id("OCS-5071"),
+                ],
+            ),
             pytest.param(
                 constants.NETSPLIT_ARBITER_DATA_1,
                 15,
@@ -118,7 +118,7 @@ def finalizer():
             ),
         ],
         ids=[
-            # "Data-1-Data-2",
+            "Data-1-Data-2",
             "Arbiter-Data-1",
             "Arbiter-Data-1-and-Arbiter-Data-2",
             "Arbiter-Data-1-and-Data-1-Data-2",
@@ -197,11 +197,11 @@ def test_netsplit(
 
         # get the nodes which are present in the
         # out of quorum zone
-        if (
-            zones != constants.NETSPLIT_ARBITER_DATA_1
-            and zones != constants.NETSPLIT_ARBITER_DATA_1_AND_ARBITER_DATA_2
-        ):
-            sc_obj.get_out_of_quorum_nodes()
+        # if (
+        #     zones != constants.NETSPLIT_ARBITER_DATA_1
+        #     and zones != constants.NETSPLIT_ARBITER_DATA_1_AND_ARBITER_DATA_2
+        # ):
+        #     sc_obj.get_out_of_quorum_nodes()
 
         # check for ceph accessibility and note the end time (UTC)
         timeout = (end_time - datetime.now(timezone.utc)).total_seconds()