
Commit c72e8e3

feat: add engine/memory_format os env variables and delay engine initialization (#2285)
1 parent ca3d50b

File tree

7 files changed: +77 -18 lines
New file (+22 lines)

@@ -0,0 +1,22 @@
+# 9. Engine selection and lazy initialization
+
+Date: 2023-05-17
+
+## Status
+
+Accepted
+
+## Context
+
+In distributed mode, three approaches are possible when it comes to selecting and initializing a Ray engine:
+1. Initialize the Ray runtime at import (current default). This option causes the least friction for the user, but assumes that installing Ray as an optional dependency is enough to enable distributed mode. Moreover, the user cannot prevent or delay Ray initialization, since it happens at import.
+2. Initialize the Ray runtime on the first distributed API call. The user can prevent Ray initialization by switching the engine/memory format with environment variables, or between import and the first awswrangler distributed API call. However, by default this approach still assumes that installing Ray is equivalent to enabling distributed mode.
+3. Wait for the user to enable distributed mode, via environment variables and/or via `wr.engine.set`. This option makes no assumption about which mode to use (distributed vs non-distributed). Non-distributed would be the default, and it would be up to the user to switch the engine/memory format.
+
+## Decision
+
+Option #1 is inflexible and gives the user little control, while option #3 introduces too much friction and puts the burden on the user. Option #2, on the other hand, gives the user full flexibility while providing a sane default.
+
+## Consequences
+
+The only difference between the current default and the suggested approach is the delayed engine initialization, which is not a breaking change. However, it means that in certain situations more than one Ray instance is initialized. For instance, when running tests across multiple threads, each thread runs its own Ray runtime.
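To make option #2 concrete, here is a minimal, hypothetical sketch of the pattern (simplified names, not the library's actual implementation): the engine is resolved from an environment variable at import, but nothing heavy runs until the first decorated API call.

    import os
    from enum import Enum, unique
    from functools import wraps
    from typing import Any, Callable, Optional


    @unique
    class EngineEnum(Enum):
        RAY = "ray"
        PYTHON = "python"


    class LazyEngine:
        """Sketch only: resolve the engine from WR_ENGINE, initialize it on first use."""

        _engine: Optional[EngineEnum] = (
            EngineEnum[os.environ["WR_ENGINE"].upper()] if os.getenv("WR_ENGINE") else None
        )
        _initialized: bool = False

        @classmethod
        def get(cls) -> EngineEnum:
            # Default to the non-distributed engine when nothing was configured
            return cls._engine or EngineEnum.PYTHON

        @classmethod
        def initialize(cls) -> None:
            if not cls._initialized:
                if cls.get() == EngineEnum.RAY:
                    print("ray.init() would run here")  # placeholder for the real Ray start-up
                cls._initialized = True

        @classmethod
        def dispatch_on_engine(cls, func: Callable[..., Any]) -> Callable[..., Any]:
            @wraps(func)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                cls.initialize()  # lazy: runs on the first wrapped call, not at import
                return func(*args, **kwargs)

            return wrapper


    @LazyEngine.dispatch_on_engine
    def read_parquet(path: str) -> str:  # stand-in for a distributed API
        return f"reading {path} with the {LazyEngine.get().value} engine"


    print(read_parquet("s3://bucket/key"))  # engine initialization is triggered here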

awswrangler/__init__.py  (+1 -1)

@@ -36,7 +36,7 @@
 from awswrangler._config import config  # noqa
 from awswrangler._distributed import EngineEnum, MemoryFormatEnum, engine, memory_format  # noqa

-engine.initialize()
+engine.register()

 __all__ = [
     "athena",

awswrangler/_distributed.py  (+10 -8)

@@ -3,13 +3,17 @@
 # pylint: disable=import-outside-toplevel

 import importlib.util
+import os
 import threading
 from collections import defaultdict
 from enum import Enum, unique
 from functools import wraps
 from importlib import reload
 from typing import Any, Callable, Dict, Literal, Optional, TypeVar, cast

+WR_ENGINE = os.getenv("WR_ENGINE")
+WR_MEMORY_FORMAT = os.getenv("WR_MEMORY_FORMAT")
+

 @unique
 class EngineEnum(Enum):

@@ -35,7 +39,7 @@ class MemoryFormatEnum(Enum):
 class Engine:
     """Execution engine configuration class."""

-    _engine: Optional[EngineEnum] = None
+    _engine: Optional[EngineEnum] = EngineEnum[WR_ENGINE.upper()] if WR_ENGINE else None
     _initialized_engine: Optional[EngineEnum] = None
     _registry: Dict[EngineLiteral, Dict[str, Callable[..., Any]]] = defaultdict(dict)
     _lock: threading.RLock = threading.RLock()

@@ -73,9 +77,7 @@ def get(cls) -> EngineEnum:
     def set(cls, name: EngineLiteral) -> None:
         """Set the distribution engine."""
         with cls._lock:
-            cls._engine = EngineEnum._member_map_[  # type: ignore[assignment]  # pylint: disable=protected-access,no-member
-                name.upper()
-            ]
+            cls._engine = EngineEnum[name.upper()]

     @classmethod
     def dispatch_func(cls, source_func: FunctionType, value: Optional[EngineLiteral] = None) -> FunctionType:

@@ -99,6 +101,7 @@ def dispatch_on_engine(cls, func: FunctionType) -> FunctionType:

         @wraps(func)
         def wrapper(*args: Any, **kw: Dict[str, Any]) -> Any:
+            cls.initialize(name=cls.get().value)
             return cls.dispatch_func(func)(*args, **kw)

         # Save the original function

@@ -127,8 +130,7 @@ def initialize(cls, name: Optional[EngineLiteral] = None) -> None:
                 from awswrangler.distributed.ray import initialize_ray

                 initialize_ray()
-            cls.register(engine_name)
-            cls._initialized_engine = cls.get()
+            cls._initialized_engine = EngineEnum[engine_name.upper()]

     @classmethod
     def is_initialized(cls, name: Optional[EngineLiteral] = None) -> bool:

@@ -142,7 +144,7 @@ def is_initialized(cls, name: Optional[EngineLiteral] = None) -> bool:
 class MemoryFormat:
     """Memory format configuration class."""

-    _enum: Optional[MemoryFormatEnum] = None
+    _enum: Optional[MemoryFormatEnum] = MemoryFormatEnum[WR_MEMORY_FORMAT.upper()] if WR_MEMORY_FORMAT else None
     _lock: threading.RLock = threading.RLock()

     @classmethod

@@ -178,7 +180,7 @@ def get(cls) -> MemoryFormatEnum:
     def set(cls, name: EngineLiteral) -> None:
         """Set the memory format."""
         with cls._lock:
-            cls._enum = MemoryFormatEnum._member_map_[name.upper()]  # type: ignore[assignment]  # pylint: disable=protected-access,no-member
+            cls._enum = MemoryFormatEnum[name.upper()]

            _reload()
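As a usage illustration (not part of the diff), the new module-level variables mean the engine and memory format can be chosen through the environment before the library is imported; the values shown are the ones used elsewhere in this commit, and the expected results are indicative:

    import os

    # Choose the engine/memory format before importing awswrangler, e.g. to stay
    # in non-distributed mode even when ray and modin are installed.
    os.environ["WR_ENGINE"] = "python"
    os.environ["WR_MEMORY_FORMAT"] = "pandas"

    import awswrangler as wr  # noqa: E402

    print(wr.engine.get())         # expected: EngineEnum.PYTHON
    print(wr.memory_format.get())  # expected: MemoryFormatEnum.PANDAS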

awswrangler/distributed/ray/_core.py  (+2 -2)

@@ -164,7 +164,7 @@ def initialize_ray(
     address = ray_address

     if address:
-        _logger.info("Connecting to a Ray cluster at: %s", address)
+        _logger.info("Connecting to a Ray instance at: %s", address)
         ray.init(
             address=address,
             include_dashboard=include_dashboard,

@@ -193,5 +193,5 @@ def initialize_ray(
                 "env_vars": {var: os.environ.get(var) for var in ray_runtime_env_vars if os.environ.get(var)}
             },
         }
-        _logger.info("Starting a Ray cluster")
+        _logger.info("Initializing a Ray instance")
         ray.init(**ray_init_kwargs)

docs/source/api.rst  (+12)

@@ -25,6 +25,7 @@ API Reference
 * `Amazon Chime`_
 * `Typing`_
 * `Global Configurations`_
+* `Engine and Memory Format`_
 * `Distributed - Ray`_

 Amazon S3

@@ -482,6 +483,17 @@ Global Configurations
     reset
     to_pandas

+Engine and Memory Format
+-------------------------
+
+.. currentmodule:: awswrangler._distributed
+
+.. autosummary::
+    :toctree: stubs
+
+    Engine
+    MemoryFormat
+
 Distributed - Ray
 ---------------------

docs/source/scale.rst  (+28 -5)

@@ -16,24 +16,30 @@ Once installed, you can use the library in your code as usual:

 >>> import awswrangler as wr

-At import, SDK for pandas looks for an environmental variable called ``WR_ADDRESS``.
-If found, it is used to send commands to a remote cluster.
-If not found, a local Ray runtime is initialized on your machine instead.
-
+At import, SDK for pandas checks if ``ray`` and ``modin`` are in the installation path and enables distributed mode.
 To confirm that you are in distributed mode, run:

 >>> print(f"Execution Engine: {wr.engine.get()}")
 >>> print(f"Memory Format: {wr.memory_format.get()}")

 which show that both Ray and Modin are enabled as an execution engine and memory format, respectively.
+You can switch back to non-distributed mode at any point (See `Switching modes <scale.rst#switching-modes>`__ below).
+
+Initialization of the Ray cluster is lazy and only triggered when the first distributed API is executed.
+At that point, SDK for pandas looks for an environment variable called ``WR_ADDRESS``.
+If found, it is used to send commands to a remote cluster.
+If not found, a local Ray runtime is initialized on your machine instead.
+Alternatively, you can trigger Ray initialization with:
+
+>>> wr.engine.initialize()

 In distributed mode, the same ``awswrangler`` APIs can now handle much larger datasets:

 .. code-block:: python

     # Read Parquet data (1.2 Gb Parquet compressed)
     df = wr.s3.read_parquet(
-        path=f"s3://amazon-reviews-pds/parquet/product_category={category.title()}/",
+        path=f"s3://amazon-reviews-pds/parquet/product_category=Toys/",
     )

     # Drop the customer_id column

@@ -135,6 +141,23 @@ This table lists the ``awswrangler`` APIs available in distributed mode (i.e. th
 |                   | ``unload``                   |                  |
 +-------------------+------------------------------+------------------+

+Switching modes
+----------------
+The following commands showcase how to switch between distributed and non-distributed modes:
+
+.. code-block:: python
+
+    # Switch to non-distributed
+    wr.engine.set("python")
+    wr.memory_format.set("pandas")
+
+    # Switch to distributed
+    wr.engine.set("ray")
+    wr.memory_format.set("modin")
+
+Similarly, you can set the ``WR_ENGINE`` and ``WR_MEMORY_FORMAT`` environment variables
+to the desired engine and memory format, respectively.
+
 Caveats
 --------
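A usage note on the behaviour documented above (sketch only; the cluster address below is a placeholder, not something defined in this commit):

    import os

    # Point the lazy Ray initialization at an existing cluster. The address is a
    # placeholder and must match your actual Ray head node / Ray Client endpoint.
    os.environ["WR_ADDRESS"] = "ray://10.0.0.1:10001"

    import awswrangler as wr  # noqa: E402

    # Either call any distributed API (initialization happens on the first call) ...
    # df = wr.s3.read_parquet(path="s3://my-bucket/my-prefix/")
    # ... or trigger initialization explicitly:
    wr.engine.initialize()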

tests/unit/test_distributed.py  (+2 -2)

@@ -23,7 +23,7 @@ def wr() -> Iterator[ModuleType]:
     yield reload(awswrangler)

     # Reset for future tests
-    awswrangler.engine.initialize()
+    awswrangler.engine.register()


 @pytest.mark.skipif(condition=not is_ray_modin, reason="ray not available")

@@ -39,7 +39,7 @@ def test_engine_python(wr: ModuleType) -> None:
     assert wr.engine.get_installed() == EngineEnum.RAY
     assert wr.engine.get() == EngineEnum.RAY

-    wr.engine.initialize(EngineEnum.PYTHON.value)
+    wr.engine.set(EngineEnum.PYTHON.value)

     assert wr.engine.get() == EngineEnum.PYTHON