From f0e8a90fdf073f964c3a5e11412e36cc655e1e81 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 6 Mar 2025 11:09:25 +0100 Subject: [PATCH 01/37] fix run_experiments --- deathstar_movie_review/demo.py | 7 +-- deathstar_movie_review/entities/frontend.py | 4 +- deathstar_movie_review/start_benchmark.py | 35 +++++++++++---- run_experiments.py | 48 +++++++++++++++------ 4 files changed, 68 insertions(+), 26 deletions(-) diff --git a/deathstar_movie_review/demo.py b/deathstar_movie_review/demo.py index 60a623b..4b05261 100644 --- a/deathstar_movie_review/demo.py +++ b/deathstar_movie_review/demo.py @@ -57,6 +57,8 @@ def main(): runtime = FlinkRuntime(IN_TOPIC, OUT_TOPIC, internal_topic=INTERNAL_TOPIC) runtime.init(kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10) + + print(f"Creating dataflow [{EXPERIMENT}]") if EXPERIMENT == "baseline": frontend_op.dataflow = frontend_df_serial() @@ -65,9 +67,8 @@ def main(): dead_node_elimination([], [frontend_op]) elif EXPERIMENT == "parallel": frontend_op.dataflow = frontend_df_parallel() - - print(frontend_op.dataflow.to_dot()) - print(f"Creating dataflow [{EXPERIMENT}]") + else: + raise RuntimeError(f"EXPERIMENT is not set correctly: {EXPERIMENT}") runtime.add_operator(compose_review_op) runtime.add_operator(user_op) diff --git a/deathstar_movie_review/entities/frontend.py b/deathstar_movie_review/entities/frontend.py index db75bc2..da1976f 100644 --- a/deathstar_movie_review/entities/frontend.py +++ b/deathstar_movie_review/entities/frontend.py @@ -1,8 +1,10 @@ -from typing import Any +import os +from typing import Any, Literal import uuid from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, Edge, InvokeMethod, OpNode, StatelessOpNode from cascade.dataflow.operator import StatelessOperator +from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination from deathstar_movie_review.entities.compose_review import ComposeReview from deathstar_movie_review.entities.movie import MovieId from deathstar_movie_review.entities.user import User diff --git a/deathstar_movie_review/start_benchmark.py b/deathstar_movie_review/start_benchmark.py index 7664b86..088bce9 100644 --- a/deathstar_movie_review/start_benchmark.py +++ b/deathstar_movie_review/start_benchmark.py @@ -1,8 +1,10 @@ import hashlib import time +from typing import Literal import uuid import pandas as pd import random + from .movie_data import movie_data from .workload_data import movie_titles, charset import sys @@ -13,11 +15,12 @@ # import cascade sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) +from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination from cascade.dataflow.dataflow import Event, EventResult, InitClass, OpNode from cascade.runtime.flink_runtime import FlinkClientSync from .entities.user import User -from .entities.frontend import frontend_op +from .entities.frontend import frontend_df_parallel, frontend_df_serial, frontend_op from .entities.movie import MovieInfo, Plot, MovieId IN_TOPIC = "ds-movie-in" @@ -75,7 +78,7 @@ def populate_movie(client: FlinkClientSync): client.send(event) -def compose_review(req_id): +def compose_review(req_id, op): user_index = random.randint(0, 999) username = f"username_{user_index}" password = f"password_{user_index}" @@ -83,7 +86,7 @@ def compose_review(req_id): rating = random.randint(0, 10) text = ''.join(random.choice(charset) for _ in range(256)) - return 
frontend_op.dataflow.generate_event({ + return op.dataflow.generate_event({ "review": req_id, "user": username, "title": title, @@ -91,17 +94,17 @@ def compose_review(req_id): "text": text }) -def deathstar_workload_generator(): +def deathstar_workload_generator(op): c = 1 while True: - yield compose_review(c) + yield compose_review(c, op) c += 1 -def benchmark_runner(proc_num, messages_per_burst, sleeps_per_burst, sleep_time, seconds_per_burst, bursts) -> dict[int, dict]: +def benchmark_runner(proc_num, op, messages_per_burst, sleeps_per_burst, sleep_time, seconds_per_burst, bursts) -> dict[int, dict]: print(f'Generator: {proc_num} starting') client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) - deathstar_generator = deathstar_workload_generator() + deathstar_generator = deathstar_workload_generator(op) start = timer() for b in range(bursts): @@ -190,10 +193,26 @@ def main(): parser.add_argument("--sleep_time", type=float, default=0.08, help="Sleep time between messages") parser.add_argument("--seconds_per_burst", type=int, default=1, help="Seconds per burst") parser.add_argument("--bursts", type=int, default=100, help="Number of bursts") + parser.add_argument("--experiment", type=str, default="baseline", help="Experiment type") args = parser.parse_args() + EXPERIMENT = args.experiment + + print(f"Experiment [{EXPERIMENT}]") print(f"Starting with args:\n{args}") + + if EXPERIMENT == "baseline": + frontend_op.dataflow = frontend_df_serial() + elif EXPERIMENT == "pipelined": + frontend_op.dataflow = frontend_df_serial() + dead_node_elimination([], [frontend_op]) + elif EXPERIMENT == "parallel": + frontend_op.dataflow = frontend_df_parallel() + else: + raise RuntimeError(f"EXPERIMENT is not set correctly: {EXPERIMENT}") + + init_client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) print("Populating...") @@ -210,7 +229,7 @@ def main(): # results = p.map(benchmark_runner, range(threads)) # results = {k: v for d in results for k, v in d.items()} - results = benchmark_runner(0, args.messages_per_burst, args.sleeps_per_burst, args.sleep_time, args.seconds_per_burst, args.bursts) + results = benchmark_runner(0, frontend_op, args.messages_per_burst, args.sleeps_per_burst, args.sleep_time, args.seconds_per_burst, args.bursts) print("last result:") print(list(results.values())[-1]) diff --git a/run_experiments.py b/run_experiments.py index 3cf327e..3c1137c 100755 --- a/run_experiments.py +++ b/run_experiments.py @@ -10,6 +10,13 @@ "bursts": 100 } +mps_1 = { + **args, + "messages_per_burst": 1, + "sleeps_per_burst": 1, + "sleep_time": 0.8, +} + mps_20 = { **args, "messages_per_burst": 20, @@ -26,26 +33,39 @@ # Define experiment parameters as a list of dictionaries experiments = [ + {"parallelism": 16, "benchmark_args": {**mps_1}}, {"parallelism": 16, "benchmark_args": {**args}}, - {"parallelism": 16, "benchmark_args": {**mps_20}}, - {"parallelism": 16, "benchmark_args": {**mps_50}}, + {"parallelism": 8, "benchmark_args": {**mps_1}}, {"parallelism": 8, "benchmark_args": {**args}}, - {"parallelism": 8, "benchmark_args": {**mps_20}}, - {"parallelism": 4, "benchmark_args": {**mps_20}}, + {"parallelism": 4, "benchmark_args": {**mps_1}}, {"parallelism": 4, "benchmark_args": {**args}}, + {"parallelism": 2, "benchmark_args": {**mps_1}}, {"parallelism": 2, "benchmark_args": {**args}}, - {"parallelism": 2, "benchmark_args": {**mps_20}}, - + + {"parallelism": 1, "benchmark_args": {**mps_1}}, {"parallelism": 1, "benchmark_args": {**args}}, - {"parallelism": 1, "benchmark_args": {**mps_20}}, + # {"parallelism": 16, 
"benchmark_args": {**mps_20}}, + # {"parallelism": 16, "benchmark_args": {**mps_50}}, + + # {"parallelism": 8, "benchmark_args": {**args}}, + # {"parallelism": 8, "benchmark_args": {**mps_20}}, + + # {"parallelism": 4, "benchmark_args": {**mps_20}}, + # {"parallelism": 4, "benchmark_args": {**args}}, + + # {"parallelism": 2, "benchmark_args": {**args}}, + # {"parallelism": 2, "benchmark_args": {**mps_20}}, + + # {"parallelism": 1, "benchmark_args": {**args}}, + # {"parallelism": 1, "benchmark_args": {**mps_20}}, - {"parallelism": 8, "benchmark_args": {**mps_50}}, - {"parallelism": 4, "benchmark_args": {**mps_50}}, - {"parallelism": 2, "benchmark_args": {**mps_50}}, - {"parallelism": 1, "benchmark_args": {**mps_50}}, + # {"parallelism": 8, "benchmark_args": {**mps_50}}, + # {"parallelism": 4, "benchmark_args": {**mps_50}}, + # {"parallelism": 2, "benchmark_args": {**mps_50}}, + # {"parallelism": 1, "benchmark_args": {**mps_50}}, ] @@ -54,7 +74,7 @@ print("Tearing down docker containers") subprocess.run(["docker", "compose", "down"], check=True) -for e in ["parallel", "base", "piplined"]: +for e in ["pipelined", "parallel", "baseline"]: for exp in experiments: print(f"Starting experiment {exp}") @@ -74,9 +94,9 @@ subprocess.run(flink_cmd, check=True, env=env) # Start benchmark - filename = f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['messages_per_burst']}.plk" + filename = f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['messages_per_burst']}.pkl" benchmark_cmd = [ - "python", "-u", "-m", "deathstar_movie_review.start_benchmark", "--output", filename + "python", "-u", "-m", "deathstar_movie_review.start_benchmark", "--output", filename, "--experiment", e ] for arg, val in exp['benchmark_args'].items(): From 174d85b3595900ba7efaf9112685aceae7802128 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Fri, 7 Mar 2025 16:39:07 +0100 Subject: [PATCH 02/37] Run python in thread mode --- src/cascade/runtime/flink_runtime.py | 54 +++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index 5afd53f..be1925b 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -27,6 +27,9 @@ # Required if SelectAll nodes are used SELECT_ALL_ENABLED = False +# Add profiling information to metadata +PROFILE = True + @dataclass class FlinkRegisterKeyNode(Node): """A node that will register a key with the SelectAll operator. 
@@ -56,6 +59,7 @@ def open(self, runtime_context: RuntimeContext): self.state: ValueState = runtime_context.get_state(descriptor) def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): + event = profile_event(event, "STATEFUL OP INNER ENTRY") # should be handled by filters on this FlinkOperator assert(isinstance(event.target, OpNode)) @@ -123,6 +127,7 @@ def __init__(self, operator: StatelessOperator) -> None: def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): + event = profile_event(event, "STATELESS OP INNER ENTRY") # should be handled by filters on this FlinkOperator assert(isinstance(event.target, StatelessOpNode)) @@ -206,6 +211,8 @@ def open(self, runtime_context: RuntimeContext): self.collection = runtime_context.get_state(descriptor) def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): + event = profile_event(event, "COLLECT OP INNER ENTRY") + collection: list[Result] = self.collection.value() logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Processing: {event}") @@ -307,6 +314,15 @@ def timestamp_event(e: Event) -> Event: pass return e +def profile_event(e: Event, ts_name: str) -> Event: + if not PROFILE: + return e + t1 = time.time() + if "prof" not in e.metadata: + e.metadata["prof"] = [] + e.metadata["prof"].append((ts_name, t1)) + return e + def timestamp_result(e: EventResult) -> EventResult: t1 = time.time() e.metadata["out_t"] = t1 @@ -370,10 +386,15 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para config.set_string("rest.port", str(self.ui_port)) config.set_integer("python.fn-execution.bundle.time", bundle_time) config.set_integer("python.fn-execution.bundle.size", bundle_size) + + config.set_string("python.execution-mode", "thread") + config.set_boolean("python.metric.enabled", False) # optimize for low latency # config.set_integer("taskmanager.memory.managed.size", 0) - config.set_integer("execution.buffer-timeout", 5) + config.set_string("execution.batch-shuffle-mode", "ALL_EXCHANGES_PIPELINED") + # config.set_integer("execution.buffer-timeout.interval", 0) + config.set_string("execution.buffer-timeout", "0 ms") kafka_jar = os.path.join(os.path.abspath(os.path.dirname(__file__)), @@ -453,7 +474,7 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para ) .map(lambda x: deserialize_and_timestamp(x)) .name("DESERIALIZE internal") - ) + ).map(lambda e: profile_event(e, "DESERIALIZE DONE")) # Events with a `SelectAllNode` will first be processed by the select # all operator, which will send out multiple other Events that can @@ -488,11 +509,14 @@ def add_operator(self, op: StatefulOperator): flink_op = FlinkOperator(op) op_stream = ( - self.stateful_op_stream.filter(lambda e: isinstance(e.target, OpNode) and e.target.entity == flink_op.operator.entity) + self.stateful_op_stream + .map(lambda e: profile_event(e, "STATEFUL OP FILTER: " + flink_op.operator.entity.__name__)) + .filter(lambda e: isinstance(e.target, OpNode) and e.target.entity == flink_op.operator.entity) + .map(lambda e: profile_event(e, "STATEFUL OP ENTRY: " + flink_op.operator.entity.__name__)) .key_by(lambda e: e.variable_map[e.target.read_key_from]) .process(flink_op) .name("STATEFUL OP: " + flink_op.operator.entity.__name__) - ) + ).map(lambda e: profile_event(e, "STATEFUL OP EXIT: " + flink_op.operator.entity.__name__)) self.stateful_op_streams.append(op_stream) def add_stateless_operator(self, op: StatelessOperator): @@ -501,10 +525,13 @@ def 
add_stateless_operator(self, op: StatelessOperator): op_stream = ( self.stateless_op_stream + .map(lambda e: profile_event(e, "STATELESS OP FILTER: " + flink_op.operator.dataflow.name)) .filter(lambda e: isinstance(e.target, StatelessOpNode) and e.target.operator.dataflow.name == flink_op.operator.dataflow.name) + .map(lambda e: profile_event(e, "STATELESS OP ENTRY: " + flink_op.operator.dataflow.name)) .process(flink_op) .name("STATELESS DATAFLOW: " + flink_op.operator.dataflow.name) - ) + ).map(lambda e: profile_event(e, "STATELESS OP EXIT: " + flink_op.operator.dataflow.name)) + self.stateless_op_streams.append(op_stream) def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="kafka") -> Union[CloseableIterator, None]: @@ -520,11 +547,11 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka if len(self.stateful_op_streams) >= 1: s1 = self.stateful_op_streams[0] rest = self.stateful_op_streams[1:] - operator_streams = s1.union(*rest, *self.stateless_op_streams) + operator_streams = s1.union(*rest, *self.stateless_op_streams).map(lambda e: profile_event(e, "OP STREAM UNION")) elif len(self.stateless_op_streams) >= 1: s1 = self.stateless_op_streams[0] rest = self.stateless_op_streams[1:] - operator_streams = s1.union(*rest, *self.stateful_op_streams) + operator_streams = s1.union(*rest, *self.stateful_op_streams).map(lambda e: profile_event(e, "OP STREAM UNION")) else: raise RuntimeError("No operators found, were they added to the flink runtime with .add_*_operator()") @@ -537,12 +564,14 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka """Stream that ingests events with an `cascade.dataflow.dataflow.CollectNode` target""" # union with EventResults or Events that don't have a CollectNode target - ds = merge_op_stream.union(operator_streams.filter(lambda e: not (isinstance(e, Event) and isinstance(e.target, CollectNode)))) + ds = merge_op_stream.union(operator_streams.filter(lambda e: not (isinstance(e, Event) and isinstance(e.target, CollectNode)))).map(lambda e: profile_event(e, "MERGE UNION")) + # Output the stream results = ( ds .filter(lambda e: isinstance(e, EventResult)) + .map(lambda e: profile_event(e, "EXTERNAL SINK")) .map(lambda e: timestamp_result(e)) ) if output == "collect": @@ -554,7 +583,14 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka else: raise ValueError(f"Invalid output: {output}") - ds_internal = ds.filter(lambda e: isinstance(e, Event)).map(lambda e: timestamp_event(e)).sink_to(self.kafka_internal_sink).name("INTERNAL KAFKA SINK") + ds_internal = ( + ds + .filter(lambda e: isinstance(e, Event)) + .map(lambda e: profile_event(e, "INTERNAL SINK")) + .map(lambda e: timestamp_event(e)) + .sink_to(self.kafka_internal_sink) + .name("INTERNAL KAFKA SINK") + ) if run_async: logger.debug("FlinkRuntime starting (async)") From 76f01d7e225f1609d86ab0a75714dd314d9aea05 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 11 Mar 2025 14:51:00 +0100 Subject: [PATCH 03/37] GIL workaround exp --- run_experiments_gil_workaround.py | 126 ++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100755 run_experiments_gil_workaround.py diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py new file mode 100755 index 0000000..a890c80 --- /dev/null +++ b/run_experiments_gil_workaround.py @@ -0,0 +1,126 @@ +import os +import subprocess +import time + +args = { + 
"messages_per_burst": 10, + "sleeps_per_burst": 10, + "sleep_time": 0.09, + "seconds_per_burst": 1, + "bursts": 100 +} + +mps_1 = { + **args, + "messages_per_burst": 1, + "sleeps_per_burst": 1, + "sleep_time": 0.9, +} + +mps_20 = { + **args, + "messages_per_burst": 20, + "sleeps_per_burst": 20, + "sleep_time": 0.09/2, +} + +mps_30 = { + **args, + "messages_per_burst": 30, + "sleeps_per_burst": 30, + "sleep_time": 0.09/3, +} + +mps_50 = { + **args, + "messages_per_burst": 50, + "sleeps_per_burst": 50, + "sleep_time": 0.09/5, +} + +mps_100 = { + **args, + "messages_per_burst": 100, + "sleeps_per_burst": 100, + "sleep_time": 0.09/10, +} + +mps_500 = { + **args, + "messages_per_burst": 500, + "sleeps_per_burst": 500, + "sleep_time": 0.09/50, +} + + +# Define experiment parameters as a list of dictionaries +experiments = [ + # {"parallelism": 16, "benchmark_args": {**args}}, + # {"parallelism": 8, "benchmark_args": {**args}}, + # {"parallelism": 4, "benchmark_args": {**args}}, + # {"parallelism": 2, "benchmark_args": {**args}}, + # {"parallelism": 1, "benchmark_args": {**args}}, + + # {"parallelism": 16, "benchmark_args": {**mps_20}}, + # {"parallelism": 8, "benchmark_args": {**mps_20}}, + # {"parallelism": 4, "benchmark_args": {**mps_20}}, + # {"parallelism": 2, "benchmark_args": {**mps_20}}, + # {"parallelism": 1, "benchmark_args": {**mps_20}}, + + {"parallelism": 16, "benchmark_args": {**mps_500}}, + # {"parallelism": 32, "benchmark_args": {**mps_500}}, + # {"parallelism": 8, "benchmark_args": {**mps_50}}, + # {"parallelism": 4, "benchmark_args": {**mps_50}}, + # {"parallelism": 2, "benchmark_args": {**mps_50}}, + # {"parallelism": 1, "benchmark_args": {**mps_50}}, +] + + + + +print("Tearing down docker containers") +subprocess.run(["docker", "compose", "down"], check=True) + +# for e in ["pipelined", "parallel", "baseline"]: +for e in ["parallel"]: + for exp in experiments: + print(f"Starting experiment {exp}") + + # Start docker compose + subprocess.run(["docker", "compose", "up", "-d", "--scale", f"taskmanager={exp['parallelism']}"], check=True, env={ + "TASK_SLOTS": "1" + }) + + time.sleep(10) + + # Run Flink job + + flink_cmd = [ + "flink", "run", "--pyFiles", "/home/lvanmol/cascade/src,/home/lvanmol/cascade", + "--pyModule", "deathstar_movie_review.demo", "-d", "-p", str(exp['parallelism']) + ] + env = os.environ + env["EXPERIMENT"] = e + subprocess.run(flink_cmd, check=True, env=env) + + # Start benchmark + filename = f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['messages_per_burst']}.pkl" + benchmark_cmd = [ + "python", "-u", "-m", "deathstar_movie_review.start_benchmark", "--output", filename, "--experiment", e + ] + + for arg, val in exp['benchmark_args'].items(): + benchmark_cmd.append(f"--{arg}") + benchmark_cmd.append(str(val)) + subprocess.run(benchmark_cmd, check=True) + + # Sleep for experiment duration + # print(f"Sleeping for {exp['sleep']} seconds...") + # time.sleep(exp['sleep']) + + # Stop docker compose + subprocess.run(["docker", "compose", "down"], check=True) + + print(f"Experiment completed.") + +print("All experiments completed.") From f9b27a0503e99e7d9a8fe778253cd7272d3e38a8 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 18 Mar 2025 10:47:24 +0100 Subject: [PATCH 04/37] Add monitoring --- deathstar_hotel_reservation/demo.py | 2 +- deathstar_movie_review/demo.py | 2 +- deathstar_movie_review/start_benchmark.py | 33 ++++++++------ docker-compose.yml | 52 +++++++++++++++++++++-- prometheus.yml | 13 
++++++ run_experiments.py | 52 ++++++++++------------- run_experiments_gil_workaround.py | 27 ++++++++++-- src/cascade/runtime/flink_runtime.py | 49 +++++++++++++-------- 8 files changed, 159 insertions(+), 71 deletions(-) create mode 100644 prometheus.yml diff --git a/deathstar_hotel_reservation/demo.py b/deathstar_hotel_reservation/demo.py index b54d643..63a6024 100644 --- a/deathstar_hotel_reservation/demo.py +++ b/deathstar_hotel_reservation/demo.py @@ -268,7 +268,7 @@ def user_login_workload_generator(): def benchmark_runner(proc_num) -> dict[int, dict]: print(f'Generator: {proc_num} starting') client = FlinkClientSync("deathstar", "ds-out", "localhost:9092", True) - deathstar_generator = user_login_workload_generator() + deathstar_generator = deathstar_workload_generator() start = timer() for _ in range(bursts): diff --git a/deathstar_movie_review/demo.py b/deathstar_movie_review/demo.py index 4b05261..893facf 100644 --- a/deathstar_movie_review/demo.py +++ b/deathstar_movie_review/demo.py @@ -36,7 +36,7 @@ def create_topics(*required_topics): print(f"Creating missing topics: {missing_topics}") # Define new topics (default: 1 partition, replication factor 1) - new_topics = [NewTopic(topic, num_partitions=1, replication_factor=1) for topic in missing_topics] + new_topics = [NewTopic(topic, num_partitions=32, replication_factor=1) for topic in missing_topics] # Create topics futures = admin_client.create_topics(new_topics) diff --git a/deathstar_movie_review/start_benchmark.py b/deathstar_movie_review/start_benchmark.py index 088bce9..e36e561 100644 --- a/deathstar_movie_review/start_benchmark.py +++ b/deathstar_movie_review/start_benchmark.py @@ -1,4 +1,5 @@ import hashlib +from multiprocessing import Pool import time from typing import Literal import uuid @@ -101,7 +102,8 @@ def deathstar_workload_generator(op): c += 1 -def benchmark_runner(proc_num, op, messages_per_burst, sleeps_per_burst, sleep_time, seconds_per_burst, bursts) -> dict[int, dict]: +def benchmark_runner(args) -> dict[int, dict]: + proc_num, op, messages_per_burst, sleeps_per_burst, sleep_time, seconds_per_burst, bursts = args print(f'Generator: {proc_num} starting') client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) deathstar_generator = deathstar_workload_generator(op) @@ -194,6 +196,7 @@ def main(): parser.add_argument("--seconds_per_burst", type=int, default=1, help="Seconds per burst") parser.add_argument("--bursts", type=int, default=100, help="Number of bursts") parser.add_argument("--experiment", type=str, default="baseline", help="Experiment type") + parser.add_argument("--no_init", action="store_true", help="Don't populate") args = parser.parse_args() EXPERIMENT = args.experiment @@ -214,22 +217,26 @@ def main(): init_client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) - - print("Populating...") - populate_user(init_client) - populate_movie(init_client) - init_client.producer.flush() - wait_for_futures(init_client) - print("Done.") - time.sleep(1) + + if not args.no_init: + print("Populating...") + populate_user(init_client) + populate_movie(init_client) + init_client.producer.flush() + wait_for_futures(init_client) + print("Done.") + time.sleep(1) print("Starting benchmark") - # with Pool(threads) as p: - # results = p.map(benchmark_runner, range(threads)) - # results = {k: v for d in results for k, v in d.items()} - results = benchmark_runner(0, frontend_op, args.messages_per_burst, args.sleeps_per_burst, args.sleep_time, args.seconds_per_burst, args.bursts) + threads = 1 + func_args = [(t, frontend_op, 
args.messages_per_burst, args.sleeps_per_burst, args.sleep_time, args.seconds_per_burst, args.bursts) for t in range(threads)] + with Pool(threads) as p: + results = p.map(benchmark_runner, func_args) + + results = {k: v for d in results for k, v in d.items()} + # results = benchmark_runner(0, frontend_op, args.messages_per_burst, args.sleeps_per_burst, args.sleep_time, args.seconds_per_burst, args.bursts) print("last result:") print(list(results.values())[-1]) diff --git a/docker-compose.yml b/docker-compose.yml index 22b5bb8..94e8e62 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,13 +20,19 @@ services: KAFKA_PROCESS_ROLES: broker,controller KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER KAFKA_CONTROLLER_QUORUM_VOTERS: 1@localhost:9091 - - # Listener to use for broker-to-broker communication KAFKA_INTER_BROKER_LISTENER_NAME: DOCKER # Required for a single node cluster KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + # Low Latency Tuning + KAFKA_NUM_NETWORK_THREADS: 8 + KAFKA_NUM_IO_THREADS: 16 + KAFKA_LOG_FLUSH_INTERVAL_MESSAGES: 1000 + KAFKA_LOG_FLUSH_INTERVAL_MS: 1000 + KAFKA_SOCKET_SEND_BUFFER_BYTES: 1024000 + KAFKA_SOCKET_RECEIVE_BUFFER_BYTES: 102400 + # Change timestamp type for benchmark measurements KAFKA_LOG_MESSAGE_TIMESTAMP_TYPE: LogAppendTime @@ -49,16 +55,23 @@ services: dockerfile: Dockerfile.pyflink ports: - "8081:8081" + expose: + - "9250" # Metrics port command: jobmanager environment: - | FLINK_PROPERTIES= - jobmanager.rpc.address: jobmanager + jobmanager.rpc.address: jobmanager + metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter + metrics.reporter.prom.port: 9250 + metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory taskmanager: build: context: . 
dockerfile: Dockerfile.pyflink + expose: + - "9250" # Metrics port depends_on: - jobmanager command: taskmanager @@ -67,4 +80,35 @@ services: - | FLINK_PROPERTIES= jobmanager.rpc.address: jobmanager - taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} \ No newline at end of file + taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} + metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter + metrics.reporter.prom.port: 9250 + metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory + + + # Monitoring stack + prometheus: + image: prom/prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + depends_on: + - jobmanager + - taskmanager + + grafana: + image: grafana/grafana + ports: + - "3000:3000" + volumes: + - grafana-storage:/var/lib/grafana + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + depends_on: + - prometheus + +volumes: + grafana-storage: \ No newline at end of file diff --git a/prometheus.yml b/prometheus.yml new file mode 100644 index 0000000..6503113 --- /dev/null +++ b/prometheus.yml @@ -0,0 +1,13 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'flink-jobmanager' + static_configs: + - targets: ['jobmanager:9250'] + + - job_name: 'flink-taskmanagers' + dns_sd_configs: + - names: ['taskmanager'] + type: A + port: 9250 \ No newline at end of file diff --git a/run_experiments.py b/run_experiments.py index 3c1137c..58934c0 100755 --- a/run_experiments.py +++ b/run_experiments.py @@ -5,7 +5,7 @@ args = { "messages_per_burst": 10, "sleeps_per_burst": 10, - "sleep_time": 0.08, + "sleep_time": 0.09, "seconds_per_burst": 1, "bursts": 100 } @@ -14,58 +14,50 @@ **args, "messages_per_burst": 1, "sleeps_per_burst": 1, - "sleep_time": 0.8, + "sleep_time": 0.9, } mps_20 = { **args, "messages_per_burst": 20, "sleeps_per_burst": 20, - "sleep_time": 0.08/2, + "sleep_time": 0.09/2, +} + +mps_30 = { + **args, + "messages_per_burst": 30, + "sleeps_per_burst": 30, + "sleep_time": 0.09/3, } mps_50 = { **args, "messages_per_burst": 50, "sleeps_per_burst": 50, - "sleep_time": 0.08/5, + "sleep_time": 0.09/5, } + # Define experiment parameters as a list of dictionaries experiments = [ - {"parallelism": 16, "benchmark_args": {**mps_1}}, - {"parallelism": 16, "benchmark_args": {**args}}, - - {"parallelism": 8, "benchmark_args": {**mps_1}}, - {"parallelism": 8, "benchmark_args": {**args}}, - - {"parallelism": 4, "benchmark_args": {**mps_1}}, - {"parallelism": 4, "benchmark_args": {**args}}, - - {"parallelism": 2, "benchmark_args": {**mps_1}}, - {"parallelism": 2, "benchmark_args": {**args}}, + # {"parallelism": 16, "benchmark_args": {**args}}, + # {"parallelism": 8, "benchmark_args": {**args}}, + # {"parallelism": 4, "benchmark_args": {**args}}, + # {"parallelism": 2, "benchmark_args": {**args}}, + # {"parallelism": 1, "benchmark_args": {**args}}, - {"parallelism": 1, "benchmark_args": {**mps_1}}, - {"parallelism": 1, "benchmark_args": {**args}}, # {"parallelism": 16, "benchmark_args": {**mps_20}}, - # {"parallelism": 16, "benchmark_args": {**mps_50}}, - - # {"parallelism": 8, "benchmark_args": {**args}}, # {"parallelism": 8, "benchmark_args": {**mps_20}}, - # {"parallelism": 4, "benchmark_args": {**mps_20}}, - # {"parallelism": 4, "benchmark_args": {**args}}, - - # {"parallelism": 2, "benchmark_args": {**args}}, # {"parallelism": 2, "benchmark_args": {**mps_20}}, - - # {"parallelism": 1, "benchmark_args": {**args}}, 
# {"parallelism": 1, "benchmark_args": {**mps_20}}, - # {"parallelism": 8, "benchmark_args": {**mps_50}}, - # {"parallelism": 4, "benchmark_args": {**mps_50}}, - # {"parallelism": 2, "benchmark_args": {**mps_50}}, - # {"parallelism": 1, "benchmark_args": {**mps_50}}, + {"parallelism": 16, "benchmark_args": {**mps_50}}, + {"parallelism": 8, "benchmark_args": {**mps_50}}, + {"parallelism": 4, "benchmark_args": {**mps_50}}, + {"parallelism": 2, "benchmark_args": {**mps_50}}, + {"parallelism": 1, "benchmark_args": {**mps_50}}, ] diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py index a890c80..cda506a 100755 --- a/run_experiments_gil_workaround.py +++ b/run_experiments_gil_workaround.py @@ -52,6 +52,14 @@ "sleep_time": 0.09/50, } +def mps(num): + return { + **args, + "messages_per_burst": num, + "sleeps_per_burst": num, + "sleep_time": 0.9/num + } + # Define experiment parameters as a list of dictionaries experiments = [ @@ -66,8 +74,19 @@ # {"parallelism": 4, "benchmark_args": {**mps_20}}, # {"parallelism": 2, "benchmark_args": {**mps_20}}, # {"parallelism": 1, "benchmark_args": {**mps_20}}, - - {"parallelism": 16, "benchmark_args": {**mps_500}}, + {"parallelism": 4, "benchmark_args": {**mps(20)}}, + {"parallelism": 4, "benchmark_args": {**mps(40)}}, + {"parallelism": 4, "benchmark_args": {**mps(60)}}, + {"parallelism": 4, "benchmark_args": {**mps(80)}}, + {"parallelism": 4, "benchmark_args": {**mps(100)}}, + # {"parallelism": 4, "benchmark_args": {**mps(300)}}, + + # {"parallelism": 24, "benchmark_args": {**mps(200)}}, + # {"parallelism": 24, "benchmark_args": {**mps(400)}}, + # {"parallelism": 24, "benchmark_args": {**mps(600)}}, + # {"parallelism": 24, "benchmark_args": {**mps(800)}}, + # {"parallelism": 24, "benchmark_args": {**mps(1000)}}, + # {"parallelism": 16, "benchmark_args": {**mps(100)}}, # {"parallelism": 32, "benchmark_args": {**mps_500}}, # {"parallelism": 8, "benchmark_args": {**mps_50}}, # {"parallelism": 4, "benchmark_args": {**mps_50}}, @@ -81,8 +100,8 @@ print("Tearing down docker containers") subprocess.run(["docker", "compose", "down"], check=True) -# for e in ["pipelined", "parallel", "baseline"]: -for e in ["parallel"]: +for e in ["pipelined", "parallel", "baseline"]: +# for e in ["parallel"]: for exp in experiments: print(f"Starting experiment {exp}") diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index be1925b..c0e66cf 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -28,7 +28,7 @@ SELECT_ALL_ENABLED = False # Add profiling information to metadata -PROFILE = True +PROFILE = False @dataclass class FlinkRegisterKeyNode(Node): @@ -388,12 +388,15 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para config.set_integer("python.fn-execution.bundle.size", bundle_size) config.set_string("python.execution-mode", "thread") - config.set_boolean("python.metric.enabled", False) + + # METRICS + config.set_boolean("python.metric.enabled", True) + config.set_string("metrics.latency.interval", "500 ms") + config.set_boolean("state.latency-track.keyed-state-enabled", True) + config.set_boolean("taskmanager.network.detailed-metrics", True) # optimize for low latency - # config.set_integer("taskmanager.memory.managed.size", 0) config.set_string("execution.batch-shuffle-mode", "ALL_EXCHANGES_PIPELINED") - # config.set_integer("execution.buffer-timeout.interval", 0) config.set_string("execution.buffer-timeout", "0 ms") @@ -408,7 +411,8 @@ 
def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para self.env = StreamExecutionEnvironment.get_execution_environment(config) if parallelism: self.env.set_parallelism(parallelism) - logger.debug(f"FlinkRuntime: parellelism {self.env.get_parallelism()}") + parallelism = self.env.get_parallelism() + logger.debug(f"FlinkRuntime: parellelism {parallelism}") deserialization_schema = ByteSerializer() @@ -429,11 +433,16 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para .set_group_id("test_group_1") .set_starting_offsets(KafkaOffsetsInitializer.earliest()) .set_value_only_deserializer(deserialization_schema) + .set_property("fetch.min.bytes", "1") + .set_property("max.partition.fetch.bytes", "1048576") + .set_property("enable.auto.commit", "false") .build() ) self.kafka_internal_sink = ( KafkaSink.builder() .set_bootstrap_servers(kafka_broker) + .set_property("linger.ms", "0") + .set_property("acks", "1") .set_record_serializer( KafkaRecordSerializationSchema.builder() .set_topic(self.internal_topic) @@ -464,17 +473,18 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para "Kafka External Source" ) .map(lambda x: deserialize_and_timestamp(x)) + .set_parallelism(parallelism=max(parallelism//4, 1)) .name("DESERIALIZE external") # .filter(lambda e: isinstance(e, Event)) # Enforced by `send` type safety ).union( self.env.from_source( kafka_internal_source, WatermarkStrategy.no_watermarks(), - "Kafka External Source" + "Kafka Internal Source" ) .map(lambda x: deserialize_and_timestamp(x)) .name("DESERIALIZE internal") - ).map(lambda e: profile_event(e, "DESERIALIZE DONE")) + )#.map(lambda e: profile_event(e, "DESERIALIZE DONE")) # Events with a `SelectAllNode` will first be processed by the select # all operator, which will send out multiple other Events that can @@ -494,6 +504,7 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para event_stream = select_all_stream.union(not_select_all_stream) + # event_stream = event_stream.disable_chaining() self.stateful_op_stream = event_stream self.stateless_op_stream = event_stream @@ -510,13 +521,14 @@ def add_operator(self, op: StatefulOperator): op_stream = ( self.stateful_op_stream - .map(lambda e: profile_event(e, "STATEFUL OP FILTER: " + flink_op.operator.entity.__name__)) + # .map(lambda e: profile_event(e, "STATEFUL OP FILTER: " + flink_op.operator.entity.__name__)) .filter(lambda e: isinstance(e.target, OpNode) and e.target.entity == flink_op.operator.entity) - .map(lambda e: profile_event(e, "STATEFUL OP ENTRY: " + flink_op.operator.entity.__name__)) + # .disable_chaining() + # .map(lambda e: profile_event(e, "STATEFUL OP ENTRY: " + flink_op.operator.entity.__name__)) .key_by(lambda e: e.variable_map[e.target.read_key_from]) .process(flink_op) .name("STATEFUL OP: " + flink_op.operator.entity.__name__) - ).map(lambda e: profile_event(e, "STATEFUL OP EXIT: " + flink_op.operator.entity.__name__)) + )#.map(lambda e: profile_event(e, "STATEFUL OP EXIT: " + flink_op.operator.entity.__name__)) self.stateful_op_streams.append(op_stream) def add_stateless_operator(self, op: StatelessOperator): @@ -525,12 +537,13 @@ def add_stateless_operator(self, op: StatelessOperator): op_stream = ( self.stateless_op_stream - .map(lambda e: profile_event(e, "STATELESS OP FILTER: " + flink_op.operator.dataflow.name)) + # .map(lambda e: profile_event(e, "STATELESS OP FILTER: " + flink_op.operator.dataflow.name)) .filter(lambda e: isinstance(e.target, 
StatelessOpNode) and e.target.operator.dataflow.name == flink_op.operator.dataflow.name) - .map(lambda e: profile_event(e, "STATELESS OP ENTRY: " + flink_op.operator.dataflow.name)) + # .disable_chaining() + # .map(lambda e: profile_event(e, "STATELESS OP ENTRY: " + flink_op.operator.dataflow.name)) .process(flink_op) .name("STATELESS DATAFLOW: " + flink_op.operator.dataflow.name) - ).map(lambda e: profile_event(e, "STATELESS OP EXIT: " + flink_op.operator.dataflow.name)) + )#.map(lambda e: profile_event(e, "STATELESS OP EXIT: " + flink_op.operator.dataflow.name)) self.stateless_op_streams.append(op_stream) @@ -547,11 +560,11 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka if len(self.stateful_op_streams) >= 1: s1 = self.stateful_op_streams[0] rest = self.stateful_op_streams[1:] - operator_streams = s1.union(*rest, *self.stateless_op_streams).map(lambda e: profile_event(e, "OP STREAM UNION")) + operator_streams = s1.union(*rest, *self.stateless_op_streams)#.map(lambda e: profile_event(e, "OP STREAM UNION")) elif len(self.stateless_op_streams) >= 1: s1 = self.stateless_op_streams[0] rest = self.stateless_op_streams[1:] - operator_streams = s1.union(*rest, *self.stateful_op_streams).map(lambda e: profile_event(e, "OP STREAM UNION")) + operator_streams = s1.union(*rest, *self.stateful_op_streams)#.map(lambda e: profile_event(e, "OP STREAM UNION")) else: raise RuntimeError("No operators found, were they added to the flink runtime with .add_*_operator()") @@ -564,14 +577,14 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka """Stream that ingests events with an `cascade.dataflow.dataflow.CollectNode` target""" # union with EventResults or Events that don't have a CollectNode target - ds = merge_op_stream.union(operator_streams.filter(lambda e: not (isinstance(e, Event) and isinstance(e.target, CollectNode)))).map(lambda e: profile_event(e, "MERGE UNION")) + ds = merge_op_stream.union(operator_streams.filter(lambda e: not (isinstance(e, Event) and isinstance(e.target, CollectNode))))#.map(lambda e: profile_event(e, "MERGE UNION")) # Output the stream results = ( ds .filter(lambda e: isinstance(e, EventResult)) - .map(lambda e: profile_event(e, "EXTERNAL SINK")) + # .map(lambda e: profile_event(e, "EXTERNAL SINK")) .map(lambda e: timestamp_result(e)) ) if output == "collect": @@ -586,7 +599,7 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka ds_internal = ( ds .filter(lambda e: isinstance(e, Event)) - .map(lambda e: profile_event(e, "INTERNAL SINK")) + # .map(lambda e: profile_event(e, "INTERNAL SINK")) .map(lambda e: timestamp_event(e)) .sink_to(self.kafka_internal_sink) .name("INTERNAL KAFKA SINK") From 46a950c8d1dba820ae60e6ed5e29f442e84ef8bb Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 18 Mar 2025 12:05:50 +0100 Subject: [PATCH 05/37] test side outputs --- .../test_movie_review_demo.py | 5 ++ src/cascade/dataflow/dataflow.py | 4 +- src/cascade/dataflow/operator.py | 6 ++ src/cascade/runtime/flink_runtime.py | 77 +++++++++++++++---- 4 files changed, 74 insertions(+), 18 deletions(-) diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index 27cc6f7..36892ec 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -1,3 +1,8 @@ +import sys +import os + 
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src")))
+
 from cascade.dataflow.dataflow import Event, InitClass, InvokeMethod, OpNode
 from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination
 from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime
diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py
index bb5704b..5924612 100644
--- a/src/cascade/dataflow/dataflow.py
+++ b/src/cascade/dataflow/dataflow.py
@@ -9,7 +9,9 @@
 
 
 class Operator(ABC):
-    pass
+    @abstractmethod
+    def name(self) -> str:
+        pass
 
 @dataclass
 class InitClass:
diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py
index 56d3e45..091f3cf 100644
--- a/src/cascade/dataflow/operator.py
+++ b/src/cascade/dataflow/operator.py
@@ -107,6 +107,9 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any
         The state `T` is passed along to the function, and may be modified.
         """
         return self._methods[method.method_name](variable_map=variable_map, state=state)
+    
+    def name(self):
+        return self.entity.__name__
 
 
 class StatelessMethodCall(Protocol):
@@ -130,4 +133,7 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any
         The state `T` is passed along to the function, and may be modified.
         """
         return self._methods[method.method_name](variable_map=variable_map)
+    
+    def name(self) -> str:
+        return self.dataflow.name
 
diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py
index c0e66cf..9a9f523 100644
--- a/src/cascade/runtime/flink_runtime.py
+++ b/src/cascade/runtime/flink_runtime.py
@@ -11,6 +11,7 @@
 from pyflink.datastream.functions import KeyedProcessFunction, RuntimeContext, ValueState, ValueStateDescriptor
 from pyflink.datastream.connectors.kafka import KafkaOffsetsInitializer, KafkaRecordSerializationSchema, KafkaSource, KafkaSink
 from pyflink.datastream import ProcessFunction, StreamExecutionEnvironment
+from pyflink.datastream.output_tag import OutputTag
 import pickle
 from cascade.dataflow.dataflow import CollectNode, CollectTarget, Event, EventResult, Filter, InitClass, InvokeMethod, Node, OpNode, SelectAllNode, StatelessOpNode
 from cascade.dataflow.operator import StatefulOperator, StatelessOperator
@@ -28,7 +29,7 @@
 SELECT_ALL_ENABLED = False
 
 # Add profiling information to metadata
-PROFILE = False
+PROFILE = True
 
 @dataclass
 class FlinkRegisterKeyNode(Node):
@@ -46,6 +47,28 @@ def propogate(self, event: Event, targets: list[Node], result: Any, **kwargs) ->
         """A key registration event does not propogate."""
         return []
 
+class FanOutOperator(ProcessFunction):
+    """Fans events out to per-operator side outputs based on their target node."""
+    # def __init__(self, stateless_ops: dict[str, OutputTag], stateful_ops: dict[str, OutputTag]) -> None:
+    #     self.stateless_ops = stateless_ops
+    #     self.stateful_ops = stateful_ops
+
+    def process_element(self, event: Event, ctx: KeyedProcessFunction.Context):
+        event = profile_event(event, "FanOut")
+
+        if isinstance(event.target, StatelessOpNode):
+            tag = OutputTag(event.target.operator.name())
+            # tag is set; the shared yield below emits the event exactly once
+        elif isinstance(event.target, OpNode):
+            tag = OutputTag(event.target.entity.__name__)
+        else:
+            logger.error(f"FanOut: Wrong target: {event}")
+            return
+        
+        logger.debug(f"Fanout: {tag.tag_id}")
+        yield tag, event
+
+
 class FlinkOperator(KeyedProcessFunction):
     """Wraps an `cascade.dataflow.datflow.StatefulOperator` in a KeyedProcessFunction so that it can run in Flink. 
""" @@ -504,9 +527,10 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para event_stream = select_all_stream.union(not_select_all_stream) - # event_stream = event_stream.disable_chaining() - self.stateful_op_stream = event_stream - self.stateless_op_stream = event_stream + # # event_stream = event_stream.disable_chaining() + # self.stateful_op_stream = event_stream + # self.stateless_op_stream = event_stream + self.event_stream = event_stream.process(FanOutOperator()) self.stateless_op_streams = [] @@ -519,31 +543,50 @@ def add_operator(self, op: StatefulOperator): """Add a `FlinkOperator` to the Flink datastream.""" flink_op = FlinkOperator(op) + tag = OutputTag(op.name()) + op_stream = ( - self.stateful_op_stream - # .map(lambda e: profile_event(e, "STATEFUL OP FILTER: " + flink_op.operator.entity.__name__)) - .filter(lambda e: isinstance(e.target, OpNode) and e.target.entity == flink_op.operator.entity) - # .disable_chaining() - # .map(lambda e: profile_event(e, "STATEFUL OP ENTRY: " + flink_op.operator.entity.__name__)) + self.event_stream + .get_side_output(tag) .key_by(lambda e: e.variable_map[e.target.read_key_from]) .process(flink_op) .name("STATEFUL OP: " + flink_op.operator.entity.__name__) - )#.map(lambda e: profile_event(e, "STATEFUL OP EXIT: " + flink_op.operator.entity.__name__)) + ).map(lambda e: profile_event(e, "STATEFUL OP EXIT: " + flink_op.operator.entity.__name__)) + # self.stateful_op_tags.append(tag) + + # op_stream = ( + # self.stateful_op_stream + # # .map(lambda e: profile_event(e, "STATEFUL OP FILTER: " + flink_op.operator.entity.__name__)) + # .filter(lambda e: isinstance(e.target, OpNode) and e.target.entity == flink_op.operator.entity) + # # .disable_chaining() + # # .map(lambda e: profile_event(e, "STATEFUL OP ENTRY: " + flink_op.operator.entity.__name__)) + # .key_by(lambda e: e.variable_map[e.target.read_key_from]) + # .process(flink_op) + # .name("STATEFUL OP: " + flink_op.operator.entity.__name__) + # )#.map(lambda e: profile_event(e, "STATEFUL OP EXIT: " + flink_op.operator.entity.__name__)) self.stateful_op_streams.append(op_stream) def add_stateless_operator(self, op: StatelessOperator): """Add a `FlinkStatelessOperator` to the Flink datastream.""" flink_op = FlinkStatelessOperator(op) + tag = OutputTag(op.name()) op_stream = ( - self.stateless_op_stream - # .map(lambda e: profile_event(e, "STATELESS OP FILTER: " + flink_op.operator.dataflow.name)) - .filter(lambda e: isinstance(e.target, StatelessOpNode) and e.target.operator.dataflow.name == flink_op.operator.dataflow.name) - # .disable_chaining() - # .map(lambda e: profile_event(e, "STATELESS OP ENTRY: " + flink_op.operator.dataflow.name)) + self.event_stream + .get_side_output(tag) .process(flink_op) .name("STATELESS DATAFLOW: " + flink_op.operator.dataflow.name) - )#.map(lambda e: profile_event(e, "STATELESS OP EXIT: " + flink_op.operator.dataflow.name)) + ).map(lambda e: profile_event(e, "STATELESS OP EXIT: " + flink_op.operator.dataflow.name)) + # self.stateless_op_tags.append(tag) + # op_stream = ( + # self.stateless_op_stream + # # .map(lambda e: profile_event(e, "STATELESS OP FILTER: " + flink_op.operator.dataflow.name)) + # .filter(lambda e: isinstance(e.target, StatelessOpNode) and e.target.operator.dataflow.name == flink_op.operator.dataflow.name) + # # .disable_chaining() + # # .map(lambda e: profile_event(e, "STATELESS OP ENTRY: " + flink_op.operator.dataflow.name)) + # .process(flink_op) + # .name("STATELESS DATAFLOW: " + flink_op.operator.dataflow.name) + 
# )#.map(lambda e: profile_event(e, "STATELESS OP EXIT: " + flink_op.operator.dataflow.name)) self.stateless_op_streams.append(op_stream) @@ -577,7 +620,7 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka """Stream that ingests events with an `cascade.dataflow.dataflow.CollectNode` target""" # union with EventResults or Events that don't have a CollectNode target - ds = merge_op_stream.union(operator_streams.filter(lambda e: not (isinstance(e, Event) and isinstance(e.target, CollectNode))))#.map(lambda e: profile_event(e, "MERGE UNION")) + ds = merge_op_stream.union(operator_streams.filter(lambda e: not (isinstance(e, Event) and isinstance(e.target, CollectNode)))).map(lambda e: profile_event(e, "MERGE UNION")) # Output the stream From 676cd71891a1e9a52e2b60920a2965a11ff474df Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 20 Mar 2025 15:05:34 +0100 Subject: [PATCH 06/37] Add monitoring switch --- docker-compose.monitoring.yml | 114 +++++++++++++++++++++++++++ docker-compose.yml | 40 +--------- run_experiments_gil_workaround.py | 10 +-- src/cascade/runtime/flink_runtime.py | 14 ++-- 4 files changed, 129 insertions(+), 49 deletions(-) create mode 100644 docker-compose.monitoring.yml diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml new file mode 100644 index 0000000..94e8e62 --- /dev/null +++ b/docker-compose.monitoring.yml @@ -0,0 +1,114 @@ +version: '3.1' + +# https://docs.docker.com/guides/kafka/ + +services: + kafka: + image: apache/kafka-native + ports: + - "9092:9092" # for HOST connections + expose: + - "9093" # for DOCKER connections + environment: + # Configure listeners for both docker and host communication + KAFKA_LISTENERS: CONTROLLER://localhost:9091,HOST://0.0.0.0:9092,DOCKER://0.0.0.0:9093 + KAFKA_ADVERTISED_LISTENERS: HOST://localhost:9092,DOCKER://kafka:9093 + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,DOCKER:PLAINTEXT,HOST:PLAINTEXT + + # Settings required for KRaft mode + KAFKA_NODE_ID: 1 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@localhost:9091 + KAFKA_INTER_BROKER_LISTENER_NAME: DOCKER + + # Required for a single node cluster + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + + # Low Latency Tuning + KAFKA_NUM_NETWORK_THREADS: 8 + KAFKA_NUM_IO_THREADS: 16 + KAFKA_LOG_FLUSH_INTERVAL_MESSAGES: 1000 + KAFKA_LOG_FLUSH_INTERVAL_MS: 1000 + KAFKA_SOCKET_SEND_BUFFER_BYTES: 1024000 + KAFKA_SOCKET_RECEIVE_BUFFER_BYTES: 102400 + + # Change timestamp type for benchmark measurements + KAFKA_LOG_MESSAGE_TIMESTAMP_TYPE: LogAppendTime + + kafka-ui: + image: ghcr.io/kafbat/kafka-ui:latest + ports: + - 8080:8080 + environment: + DYNAMIC_CONFIG_ENABLED: "true" + KAFKA_CLUSTERS_0_NAME: local + KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:9093 + depends_on: + - kafka + + # https://nightlies.apache.org/flink/flink-docs-release-1.20/docs/deployment/resource-providers/standalone/docker/#flink-with-docker-compose + + jobmanager: + build: + context: . 
+ dockerfile: Dockerfile.pyflink + ports: + - "8081:8081" + expose: + - "9250" # Metrics port + command: jobmanager + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter + metrics.reporter.prom.port: 9250 + metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory + + taskmanager: + build: + context: . + dockerfile: Dockerfile.pyflink + expose: + - "9250" # Metrics port + depends_on: + - jobmanager + command: taskmanager + scale: 1 + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} + metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter + metrics.reporter.prom.port: 9250 + metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory + + + # Monitoring stack + prometheus: + image: prom/prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + depends_on: + - jobmanager + - taskmanager + + grafana: + image: grafana/grafana + ports: + - "3000:3000" + volumes: + - grafana-storage:/var/lib/grafana + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + depends_on: + - prometheus + +volumes: + grafana-storage: \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 94e8e62..e77df7c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -55,23 +55,16 @@ services: dockerfile: Dockerfile.pyflink ports: - "8081:8081" - expose: - - "9250" # Metrics port command: jobmanager environment: - | FLINK_PROPERTIES= jobmanager.rpc.address: jobmanager - metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter - metrics.reporter.prom.port: 9250 - metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory taskmanager: build: context: . 
dockerfile: Dockerfile.pyflink - expose: - - "9250" # Metrics port depends_on: - jobmanager command: taskmanager @@ -80,35 +73,4 @@ services: - | FLINK_PROPERTIES= jobmanager.rpc.address: jobmanager - taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} - metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter - metrics.reporter.prom.port: 9250 - metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory - - - # Monitoring stack - prometheus: - image: prom/prometheus - ports: - - "9090:9090" - volumes: - - ./prometheus.yml:/etc/prometheus/prometheus.yml - command: - - '--config.file=/etc/prometheus/prometheus.yml' - depends_on: - - jobmanager - - taskmanager - - grafana: - image: grafana/grafana - ports: - - "3000:3000" - volumes: - - grafana-storage:/var/lib/grafana - environment: - - GF_SECURITY_ADMIN_PASSWORD=admin - depends_on: - - prometheus - -volumes: - grafana-storage: \ No newline at end of file + taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} \ No newline at end of file diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py index cda506a..46d0cc0 100755 --- a/run_experiments_gil_workaround.py +++ b/run_experiments_gil_workaround.py @@ -81,11 +81,11 @@ def mps(num): {"parallelism": 4, "benchmark_args": {**mps(100)}}, # {"parallelism": 4, "benchmark_args": {**mps(300)}}, - # {"parallelism": 24, "benchmark_args": {**mps(200)}}, - # {"parallelism": 24, "benchmark_args": {**mps(400)}}, - # {"parallelism": 24, "benchmark_args": {**mps(600)}}, - # {"parallelism": 24, "benchmark_args": {**mps(800)}}, - # {"parallelism": 24, "benchmark_args": {**mps(1000)}}, + {"parallelism": 24, "benchmark_args": {**mps(200)}}, + {"parallelism": 24, "benchmark_args": {**mps(400)}}, + {"parallelism": 24, "benchmark_args": {**mps(600)}}, + {"parallelism": 24, "benchmark_args": {**mps(800)}}, + {"parallelism": 24, "benchmark_args": {**mps(1000)}}, # {"parallelism": 16, "benchmark_args": {**mps(100)}}, # {"parallelism": 32, "benchmark_args": {**mps_500}}, # {"parallelism": 8, "benchmark_args": {**mps_50}}, diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index c0e66cf..4838667 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -18,7 +18,7 @@ import logging logger = logging.getLogger(__name__) -logger.setLevel(1) +logger.setLevel("WARNING") console_handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') console_handler.setFormatter(formatter) @@ -30,6 +30,9 @@ # Add profiling information to metadata PROFILE = False +# Enable latency metrics +METRICS = False + @dataclass class FlinkRegisterKeyNode(Node): """A node that will register a key with the SelectAll operator. 
@@ -390,10 +393,11 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para config.set_string("python.execution-mode", "thread") # METRICS - config.set_boolean("python.metric.enabled", True) - config.set_string("metrics.latency.interval", "500 ms") - config.set_boolean("state.latency-track.keyed-state-enabled", True) - config.set_boolean("taskmanager.network.detailed-metrics", True) + if METRICS: + config.set_boolean("python.metric.enabled", True) + config.set_string("metrics.latency.interval", "500 ms") + config.set_boolean("state.latency-track.keyed-state-enabled", True) + config.set_boolean("taskmanager.network.detailed-metrics", True) # optimize for low latency config.set_string("execution.batch-shuffle-mode", "ALL_EXCHANGES_PIPELINED") From ace6b520597ba22cd6540d88668db9f54ad137ea Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Wed, 26 Mar 2025 15:55:04 +0100 Subject: [PATCH 07/37] Finalize experiments --- deathstar_movie_review/entities/frontend.py | 67 ++---------- deathstar_movie_review/entities/text.py | 29 +++++ deathstar_movie_review/entities/unique_id.py | 32 ++++++ deathstar_movie_review/start_benchmark.py | 35 +++--- docker-compose.monitoring.yml | 4 +- run_experiments_gil_workaround.py | 106 +++++-------------- src/cascade/dataflow/dataflow.py | 9 +- src/cascade/runtime/flink_runtime.py | 106 +++++++++---------- 8 files changed, 171 insertions(+), 217 deletions(-) create mode 100644 deathstar_movie_review/entities/text.py create mode 100644 deathstar_movie_review/entities/unique_id.py diff --git a/deathstar_movie_review/entities/frontend.py b/deathstar_movie_review/entities/frontend.py index da1976f..fce328f 100644 --- a/deathstar_movie_review/entities/frontend.py +++ b/deathstar_movie_review/entities/frontend.py @@ -1,82 +1,29 @@ -import os -from typing import Any, Literal -import uuid +from typing import Any from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, Edge, InvokeMethod, OpNode, StatelessOpNode from cascade.dataflow.operator import StatelessOperator -from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination from deathstar_movie_review.entities.compose_review import ComposeReview from deathstar_movie_review.entities.movie import MovieId +from deathstar_movie_review.entities.unique_id import UniqueId, unique_id_op from deathstar_movie_review.entities.user import User +from deathstar_movie_review.entities.text import Text, text_op -# unique_id is stateless -class UniqueId(): - @staticmethod - def upload_unique_id_2(review: ComposeReview): - review_id = uuid.uuid1().int >> 64 - review.upload_unique_id(review_id) - -# text is stateless -class Text(): - @staticmethod - def upload_text_2(review: ComposeReview, text: str): - review.upload_text(text) - CHAR_LIMIT = 50 # frontend is made stateless class Frontend(): @staticmethod def compose(review: ComposeReview, user: User, title: MovieId, rating: int, text: str): - - # dead node elimination will remove "returning back" to the original function - # - # cascade could theoritically allow for more advanced analysis, - # that would enable all these to run in parallel. However, this is only - # possible because - # 1. the individual functions don't depend on each other - # 2. the ordering of side-effects does not matter UniqueId.upload_unique_id_2(review) user.upload_user(review) title.upload_movie(review, rating) - - text = text[:CHAR_LIMIT] # an operation like this could be reorderd for better efficiency! 
+ # text = text[:CHAR_LIMIT] # an operation like this could be reorderd for better efficiency! Text.upload_text_2(review, text) -###### COMPILED FUNCTIONS ###### - -### UPLOAD UNIQUE ### - -def upload_unique_compiled_0(variable_map: dict[str, Any]): - variable_map["review_id"] = uuid.uuid1().int >> 64 - -unique_id_op = StatelessOperator( - { - "upload_unique": upload_unique_compiled_0, - }, - None -) - -df = DataFlow("upload_unique_id") -n0 = StatelessOpNode(unique_id_op, InvokeMethod("upload_unique")) -n1 = OpNode(ComposeReview, InvokeMethod("upload_unique_id"), read_key_from="review") -df.entry = n0 -unique_id_op.dataflow = df -### TEXT ### -text_op = StatelessOperator( - {}, - None -) - -df = DataFlow("upload_text") -n0 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review") -df.entry = n0 -text_op.dataflow = df - -### FRONTEND ### +###### COMPILED FUNCTIONS ###### def compose_compiled_0(variable_map: dict[str, Any]): pass @@ -115,6 +62,7 @@ def frontend_df_serial(): n6 = StatelessOpNode(frontend_op, InvokeMethod("empty")) # Upload Text DF + n7a = StatelessOpNode(text_op, InvokeMethod("upload_text_2")) n7 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review") n8 = StatelessOpNode(frontend_op, InvokeMethod("empty")) @@ -133,7 +81,8 @@ def frontend_df_serial(): df.add_edge(Edge(n5_b, n6)) df.add_edge(Edge(n5_c, n6)) - df.add_edge(Edge(n6, n7)) + df.add_edge(Edge(n6, n7a)) + df.add_edge(Edge(n7a, n7)) df.add_edge(Edge(n7, n8)) df.entry = n0 diff --git a/deathstar_movie_review/entities/text.py b/deathstar_movie_review/entities/text.py new file mode 100644 index 0000000..2bf2e69 --- /dev/null +++ b/deathstar_movie_review/entities/text.py @@ -0,0 +1,29 @@ +from typing import Any +from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode, StatelessOpNode +from cascade.dataflow.operator import StatelessOperator +from deathstar_movie_review.entities.compose_review import ComposeReview + +class Text(): + @staticmethod + def upload_text_2(review: ComposeReview, text: str): + review.upload_text(text) + + +###### COMPILED FUNCTIONS ###### + +def upload_text_2_compiled_0(variable_map: dict[str, Any]): + pass + +text_op = StatelessOperator( + { + "upload_text_2": upload_text_2_compiled_0 + }, + None +) + +df = DataFlow("upload_text") +n0 = StatelessOpNode(text_op, InvokeMethod("upload_text_2")) +n1 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review") +df.add_edge(Edge(n0, n1)) +df.entry = n0 +text_op.dataflow = df \ No newline at end of file diff --git a/deathstar_movie_review/entities/unique_id.py b/deathstar_movie_review/entities/unique_id.py new file mode 100644 index 0000000..1ca9bfe --- /dev/null +++ b/deathstar_movie_review/entities/unique_id.py @@ -0,0 +1,32 @@ +from typing import Any +import uuid +from cascade.dataflow.dataflow import DataFlow, InvokeMethod, OpNode, StatelessOpNode +from cascade.dataflow.operator import StatelessOperator +from deathstar_movie_review.entities.compose_review import ComposeReview + + +class UniqueId(): + @staticmethod + def upload_unique_id_2(review: ComposeReview): + review_id = uuid.uuid1().int >> 64 + review.upload_unique_id(review_id) + + + +###### COMPILED FUNCTIONS ###### + +def upload_unique_compiled_0(variable_map: dict[str, Any]): + variable_map["review_id"] = uuid.uuid1().int >> 64 + +unique_id_op = StatelessOperator( + { + "upload_unique": upload_unique_compiled_0, + }, + None +) + +df = DataFlow("upload_unique_id") +n0 = StatelessOpNode(unique_id_op, 
InvokeMethod("upload_unique")) +n1 = OpNode(ComposeReview, InvokeMethod("upload_unique_id"), read_key_from="review") +df.entry = n0 +unique_id_op.dataflow = df diff --git a/deathstar_movie_review/start_benchmark.py b/deathstar_movie_review/start_benchmark.py index e36e561..cdff9b8 100644 --- a/deathstar_movie_review/start_benchmark.py +++ b/deathstar_movie_review/start_benchmark.py @@ -103,7 +103,7 @@ def deathstar_workload_generator(op): def benchmark_runner(args) -> dict[int, dict]: - proc_num, op, messages_per_burst, sleeps_per_burst, sleep_time, seconds_per_burst, bursts = args + proc_num, op, requests_per_second, sleep_time, bursts = args print(f'Generator: {proc_num} starting') client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) deathstar_generator = deathstar_workload_generator(op) @@ -113,11 +113,11 @@ def benchmark_runner(args) -> dict[int, dict]: sec_start = timer() # send burst of messages - for i in range(messages_per_burst): + for i in range(requests_per_second): # sleep sometimes between messages - if i % (messages_per_burst // sleeps_per_burst) == 0: - time.sleep(sleep_time) + # if i % (messages_per_burst // sleeps_per_burst) == 0: + time.sleep(sleep_time) event = next(deathstar_generator) client.send(event) @@ -126,13 +126,16 @@ def benchmark_runner(args) -> dict[int, dict]: # wait out the second lps = sec_end - sec_start - if lps < seconds_per_burst: + if lps < 1: time.sleep(1 - lps) sec_end2 = timer() print(f'Latency per burst: {sec_end2 - sec_start} ({b+1}/{bursts})') end = timer() - print(f'Average latency per burst: {(end - start) / bursts} ({seconds_per_burst})') + avg_send_latency = (end - start) / bursts + print(f'Average send latency per burst for generator {proc_num} was: {avg_send_latency}') + if avg_send_latency > 1.1: + print(f'This is higher than expected (1). 
Maybe increase the number of threads?') futures = wait_for_futures(client) client.close() return futures @@ -190,19 +193,21 @@ def write_dict_to_pkl(futures_dict, filename): def main(): parser = argparse.ArgumentParser(description="Run the benchmark and save results.") parser.add_argument("-o", "--output", type=str, default="benchmark_results.pkl", help="Output file name for the results") - parser.add_argument("--messages_per_burst", type=int, default=10, help="Number of messages per burst") - parser.add_argument("--sleeps_per_burst", type=int, default=10, help="Number of sleep cycles per burst") - parser.add_argument("--sleep_time", type=float, default=0.08, help="Sleep time between messages") - parser.add_argument("--seconds_per_burst", type=int, default=1, help="Seconds per burst") - parser.add_argument("--bursts", type=int, default=100, help="Number of bursts") + parser.add_argument("--requests_per_second", type=int, default=10, help="Number of messages per burst") + parser.add_argument("--seconds", type=int, default=100, help="Number of seconds to benchmark for") + parser.add_argument("--threads", type=int, default=1, help="Number of concurrent threads") parser.add_argument("--experiment", type=str, default="baseline", help="Experiment type") parser.add_argument("--no_init", action="store_true", help="Don't populate") args = parser.parse_args() + rps_per_thread = int(args.requests_per_second / args.threads) + sleep_time = 0.95 / rps_per_thread + EXPERIMENT = args.experiment print(f"Experiment [{EXPERIMENT}]") print(f"Starting with args:\n{args}") + print(f"Actual requests per second is {int(rps_per_thread * args.threads)} (due to rounding)") if EXPERIMENT == "baseline": @@ -229,14 +234,11 @@ def main(): print("Starting benchmark") - - threads = 1 - func_args = [(t, frontend_op, args.messages_per_burst, args.sleeps_per_burst, args.sleep_time, args.seconds_per_burst, args.bursts) for t in range(threads)] - with Pool(threads) as p: + func_args = [(t, frontend_op, rps_per_thread, sleep_time, args.seconds) for t in range(args.threads)] + with Pool(args.threads) as p: results = p.map(benchmark_runner, func_args) results = {k: v for d in results for k, v in d.items()} - # results = benchmark_runner(0, frontend_op, args.messages_per_burst, args.sleeps_per_burst, args.sleep_time, args.seconds_per_burst, args.bursts) print("last result:") print(list(results.values())[-1]) @@ -244,7 +246,6 @@ def main(): r = 0 for result in results.values(): if result["ret"] is not None: - # print(result) r += 1 print(f"{r}/{t} results recieved.") diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml index 94e8e62..3566235 100644 --- a/docker-compose.monitoring.yml +++ b/docker-compose.monitoring.yml @@ -26,8 +26,8 @@ services: KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 # Low Latency Tuning - KAFKA_NUM_NETWORK_THREADS: 8 - KAFKA_NUM_IO_THREADS: 16 + KAFKA_NUM_NETWORK_THREADS: 16 + KAFKA_NUM_IO_THREADS: 32 KAFKA_LOG_FLUSH_INTERVAL_MESSAGES: 1000 KAFKA_LOG_FLUSH_INTERVAL_MS: 1000 KAFKA_SOCKET_SEND_BUFFER_BYTES: 1024000 diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py index 46d0cc0..5db7811 100755 --- a/run_experiments_gil_workaround.py +++ b/run_experiments_gil_workaround.py @@ -7,98 +7,42 @@ "sleeps_per_burst": 10, "sleep_time": 0.09, "seconds_per_burst": 1, - "bursts": 100 + "seconds": 100 } -mps_1 = { - **args, - "messages_per_burst": 1, - "sleeps_per_burst": 1, - "sleep_time": 0.9, -} - -mps_20 = { - **args, - "messages_per_burst": 20, - "sleeps_per_burst": 
20, - "sleep_time": 0.09/2, -} - -mps_30 = { - **args, - "messages_per_burst": 30, - "sleeps_per_burst": 30, - "sleep_time": 0.09/3, -} - -mps_50 = { - **args, - "messages_per_burst": 50, - "sleeps_per_burst": 50, - "sleep_time": 0.09/5, -} - -mps_100 = { - **args, - "messages_per_burst": 100, - "sleeps_per_burst": 100, - "sleep_time": 0.09/10, -} - -mps_500 = { - **args, - "messages_per_burst": 500, - "sleeps_per_burst": 500, - "sleep_time": 0.09/50, -} - -def mps(num): +def mps(num, producer_threads=1): return { - **args, - "messages_per_burst": num, - "sleeps_per_burst": num, - "sleep_time": 0.9/num + "threads": producer_threads, + "requests_per_second": num, + "seconds": 100, } # Define experiment parameters as a list of dictionaries experiments = [ - # {"parallelism": 16, "benchmark_args": {**args}}, - # {"parallelism": 8, "benchmark_args": {**args}}, - # {"parallelism": 4, "benchmark_args": {**args}}, - # {"parallelism": 2, "benchmark_args": {**args}}, - # {"parallelism": 1, "benchmark_args": {**args}}, - - # {"parallelism": 16, "benchmark_args": {**mps_20}}, - # {"parallelism": 8, "benchmark_args": {**mps_20}}, - # {"parallelism": 4, "benchmark_args": {**mps_20}}, - # {"parallelism": 2, "benchmark_args": {**mps_20}}, - # {"parallelism": 1, "benchmark_args": {**mps_20}}, - {"parallelism": 4, "benchmark_args": {**mps(20)}}, - {"parallelism": 4, "benchmark_args": {**mps(40)}}, - {"parallelism": 4, "benchmark_args": {**mps(60)}}, - {"parallelism": 4, "benchmark_args": {**mps(80)}}, - {"parallelism": 4, "benchmark_args": {**mps(100)}}, - # {"parallelism": 4, "benchmark_args": {**mps(300)}}, - - {"parallelism": 24, "benchmark_args": {**mps(200)}}, - {"parallelism": 24, "benchmark_args": {**mps(400)}}, - {"parallelism": 24, "benchmark_args": {**mps(600)}}, - {"parallelism": 24, "benchmark_args": {**mps(800)}}, - {"parallelism": 24, "benchmark_args": {**mps(1000)}}, - # {"parallelism": 16, "benchmark_args": {**mps(100)}}, - # {"parallelism": 32, "benchmark_args": {**mps_500}}, - # {"parallelism": 8, "benchmark_args": {**mps_50}}, - # {"parallelism": 4, "benchmark_args": {**mps_50}}, - # {"parallelism": 2, "benchmark_args": {**mps_50}}, - # {"parallelism": 1, "benchmark_args": {**mps_50}}, + # {"parallelism": 4, "benchmark_args": {**mps(20)}}, + # {"parallelism": 4, "benchmark_args": {**mps(40)}}, + # {"parallelism": 4, "benchmark_args": {**mps(60)}}, + # {"parallelism": 4, "benchmark_args": {**mps(80)}}, + # {"parallelism": 4, "benchmark_args": {**mps(100)}}, + + # {"parallelism": 24, "benchmark_args": {**mps(200)}}, + # {"parallelism": 24, "benchmark_args": {**mps(400)}}, + # {"parallelism": 24, "benchmark_args": {**mps(600)}}, + # {"parallelism": 24, "benchmark_args": {**mps(800)}}, + # {"parallelism": 24, "benchmark_args": {**mps(200, producer_threads=10)}}, + # {"parallelism": 24, "benchmark_args": {**mps(400, producer_threads=10)}}, + # {"parallelism": 24, "benchmark_args": {**mps(600, producer_threads=20)}}, + {"parallelism": 24, "benchmark_args": {**mps(1000, producer_threads=20)}}, + # {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=40)}}, + # {"parallelism": 24, "benchmark_args": {**mps(1000, threads=20)}}, ] print("Tearing down docker containers") -subprocess.run(["docker", "compose", "down"], check=True) +subprocess.run(["docker", "compose", "down"], check=False) for e in ["pipelined", "parallel", "baseline"]: # for e in ["parallel"]: @@ -106,7 +50,7 @@ def mps(num): print(f"Starting experiment {exp}") # Start docker compose - subprocess.run(["docker", 
"compose", "up", "-d", "--scale", f"taskmanager={exp['parallelism']}"], check=True, env={ + subprocess.run(["docker", "compose", "up", "-d", "--scale", f"taskmanager={exp['parallelism']}", "--force-recreate"], check=True, env={ "TASK_SLOTS": "1" }) @@ -123,7 +67,7 @@ def mps(num): subprocess.run(flink_cmd, check=True, env=env) # Start benchmark - filename = f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['messages_per_burst']}.pkl" + filename = f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['requests_per_second']}.pkl" benchmark_cmd = [ "python", "-u", "-m", "deathstar_movie_review.start_benchmark", "--output", filename, "--experiment", e ] @@ -138,7 +82,7 @@ def mps(num): # time.sleep(exp['sleep']) # Stop docker compose - subprocess.run(["docker", "compose", "down"], check=True) + subprocess.run(["docker", "compose", "down"], check=False) print(f"Experiment completed.") diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 5924612..56b494a 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from typing import Any, Callable, List, Optional, Type, Union from typing import TYPE_CHECKING +import uuid if TYPE_CHECKING: # Prevent circular imports @@ -344,6 +345,7 @@ def to_dot(self) -> str: def generate_event(self, variable_map: dict[str, Any]) -> Union['Event', list['Event']]: if isinstance(self.entry, list): assert len(self.entry) != 0 + # give all the events the same id first_event = Event(self.entry[0], variable_map, self) id = first_event._id return [first_event] + [Event(entry, variable_map, self, _id=id) for entry in self.entry[1:]] @@ -387,7 +389,7 @@ class Event(): collect_target: Optional[CollectTarget] = field(default=None) """Tells each mergenode (key) how many events to merge on""" - _id_counter: int = field(init=False, default=0, repr=False) + # _id_counter: int = field(init=False, default=0, repr=False) metadata: dict = field(default_factory=metadata_dict) """Event metadata containing, for example, timestamps for benchmarking""" @@ -395,8 +397,9 @@ class Event(): def __post_init__(self): if self._id is None: # Assign a unique ID from the class-level counter - self._id = Event._id_counter - Event._id_counter += 1 + self._id = uuid.uuid4().int + # self._id = Event._id_counter + # Event._id_counter += 1 def propogate(self, result, select_all_keys: Optional[list[str]]=None) -> Union['EventResult', list['Event']]: """Propogate this event through the Dataflow.""" diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index c9ae1e7..ef22ea9 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -19,7 +19,7 @@ import logging logger = logging.getLogger(__name__) -logger.setLevel("WARNING") +logger.setLevel("INFO") console_handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') console_handler.setFormatter(formatter) @@ -29,7 +29,7 @@ SELECT_ALL_ENABLED = False # Add profiling information to metadata -PROFILE = True +PROFILE = False # Enable latency metrics METRICS = False @@ -52,18 +52,21 @@ def propogate(self, event: Event, targets: list[Node], result: Any, **kwargs) -> class FanOutOperator(ProcessFunction): """""" - # def __init__(self, stateless_ops: dict[str, OutputTag], stateful_ops: dict[str, OutputTag]) -> None: - # self.stateless_ops = stateless_ops - # self.stateful_ops = stateful_ops + def __init__(self, 
stateful_ops: dict[str, OutputTag], stateless_ops: dict[str, OutputTag]) -> None: + self.stateful_ops = stateful_ops + self.stateless_ops = stateless_ops def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): event = profile_event(event, "FanOut") + logger.debug("FanOut Enter") + if isinstance(event.target, StatelessOpNode): - tag = OutputTag(event.target.operator.name()) - yield tag, event + logger.debug(event.target.operator.name()) + tag = self.stateless_ops[event.target.operator.name()] elif isinstance(event.target, OpNode): - tag = OutputTag(event.target.entity.__name__) + logger.debug(event.target.entity.__name__) + tag = self.stateful_ops[event.target.entity.__name__] else: logger.error(f"FanOut: Wrong target: {event}") return @@ -534,11 +537,11 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para # # event_stream = event_stream.disable_chaining() # self.stateful_op_stream = event_stream # self.stateless_op_stream = event_stream - self.event_stream = event_stream.process(FanOutOperator()) + self.event_stream = event_stream - self.stateless_op_streams = [] - self.stateful_op_streams = [] + self.stateless_operators: list[FlinkStatelessOperator] = [] + self.stateful_operators: list[FlinkOperator] = [] """List of stateful operator streams, which gets appended at `add_operator`.""" logger.debug("FlinkRuntime initialized") @@ -547,52 +550,14 @@ def add_operator(self, op: StatefulOperator): """Add a `FlinkOperator` to the Flink datastream.""" flink_op = FlinkOperator(op) - tag = OutputTag(op.name()) - - op_stream = ( - self.event_stream - .get_side_output(tag) - .key_by(lambda e: e.variable_map[e.target.read_key_from]) - .process(flink_op) - .name("STATEFUL OP: " + flink_op.operator.entity.__name__) - ).map(lambda e: profile_event(e, "STATEFUL OP EXIT: " + flink_op.operator.entity.__name__)) - # self.stateful_op_tags.append(tag) - - # op_stream = ( - # self.stateful_op_stream - # # .map(lambda e: profile_event(e, "STATEFUL OP FILTER: " + flink_op.operator.entity.__name__)) - # .filter(lambda e: isinstance(e.target, OpNode) and e.target.entity == flink_op.operator.entity) - # # .disable_chaining() - # # .map(lambda e: profile_event(e, "STATEFUL OP ENTRY: " + flink_op.operator.entity.__name__)) - # .key_by(lambda e: e.variable_map[e.target.read_key_from]) - # .process(flink_op) - # .name("STATEFUL OP: " + flink_op.operator.entity.__name__) - # )#.map(lambda e: profile_event(e, "STATEFUL OP EXIT: " + flink_op.operator.entity.__name__)) - self.stateful_op_streams.append(op_stream) + self.stateful_operators.append(flink_op) def add_stateless_operator(self, op: StatelessOperator): """Add a `FlinkStatelessOperator` to the Flink datastream.""" flink_op = FlinkStatelessOperator(op) - tag = OutputTag(op.name()) - op_stream = ( - self.event_stream - .get_side_output(tag) - .process(flink_op) - .name("STATELESS DATAFLOW: " + flink_op.operator.dataflow.name) - ).map(lambda e: profile_event(e, "STATELESS OP EXIT: " + flink_op.operator.dataflow.name)) - # self.stateless_op_tags.append(tag) - # op_stream = ( - # self.stateless_op_stream - # # .map(lambda e: profile_event(e, "STATELESS OP FILTER: " + flink_op.operator.dataflow.name)) - # .filter(lambda e: isinstance(e.target, StatelessOpNode) and e.target.operator.dataflow.name == flink_op.operator.dataflow.name) - # # .disable_chaining() - # # .map(lambda e: profile_event(e, "STATELESS OP ENTRY: " + flink_op.operator.dataflow.name)) - # .process(flink_op) - # .name("STATELESS DATAFLOW: " + 
flink_op.operator.dataflow.name) - # )#.map(lambda e: profile_event(e, "STATELESS OP EXIT: " + flink_op.operator.dataflow.name)) - - self.stateless_op_streams.append(op_stream) + self.stateless_operators.append(flink_op) + def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="kafka") -> Union[CloseableIterator, None]: """Start ingesting and processing messages from the Kafka source. @@ -601,7 +566,38 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka `cascade.dataflow.dataflow.EventResult`s.""" assert self.env is not None, "FlinkRuntime must first be initialised with `init()`." - logger.debug("FlinkRuntime merging operator streams...") + logger.info("FlinkRuntime merging operator streams...") + + # create the fanout operator + stateful_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateful_operators} + stateless_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateless_operators} + logger.debug(f"{stateful_tags.items()}") + fanout = self.event_stream.process(FanOutOperator(stateful_tags, stateless_tags)).name("FANOUT OPERATOR").disable_chaining() + + # create the streams + self.stateful_op_streams = [] + for flink_op in self.stateful_operators: + tag = stateful_tags[flink_op.operator.name()] + op_stream = ( + fanout + .get_side_output(tag) + .key_by(lambda e: e.variable_map[e.target.read_key_from]) + .process(flink_op) + .name("STATEFUL OP: " + flink_op.operator.name()) + ) + self.stateful_op_streams.append(op_stream) + + + self.stateless_op_streams = [] + for flink_op in self.stateless_operators: + tag = stateless_tags[flink_op.operator.name()] + op_stream = ( + fanout + .get_side_output(tag) + .process(flink_op) + .name("STATELESS OP: " + flink_op.operator.name()) + ) + self.stateless_op_streams.append(op_stream) # Combine all the operator streams if len(self.stateful_op_streams) >= 1: @@ -653,11 +649,11 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka ) if run_async: - logger.debug("FlinkRuntime starting (async)") + logger.info("FlinkRuntime starting (async)") self.env.execute_async("Cascade: Flink Runtime") return ds_external # type: ignore (will be CloseableIterator provided the source is unbounded (i.e. 
Kafka)) else: - logger.debug("FlinkRuntime starting (sync)") + logger.info("FlinkRuntime starting (sync)") self.env.execute("Cascade: Flink Runtime") class FlinkClientSync: From c5f202a256d4cb9d0c2938933f4700d0809cc75d Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Mon, 31 Mar 2025 15:40:28 +0200 Subject: [PATCH 08/37] Rework Dataflow IR --- deathstar_hotel_reservation/test_demo.py | 198 ++++++------ deathstar_movie_review/entities/frontend.py | 7 +- deathstar_movie_review/entities/text.py | 7 +- deathstar_movie_review/entities/unique_id.py | 11 +- deathstar_movie_review/entities/user.py | 2 +- notebooks/dataflow_example.ipynb | 12 +- src/cascade/core.py | 30 +- src/cascade/dataflow/dataflow.py | 211 +++++++----- src/cascade/dataflow/operator.py | 56 +++- src/cascade/descriptors/class_descriptor.py | 6 + .../ast_visitors/extract_type_visitor.py | 6 +- .../dataflow_graph_builder.py | 4 +- .../frontend/generator/generate_dataflow.py | 27 +- .../generator/generate_split_functions.py | 140 ++++++-- .../frontend/generator/split_function.py | 114 +++++++ src/cascade/frontend/generator/unparser.py | 12 +- .../statement_level_dataflow_graph.py | 10 +- src/cascade/runtime/flink_runtime.py | 2 +- src/cascade/runtime/python_runtime.py | 110 +++++-- test_programs/expected/checkout_item.py | 16 +- test_programs/expected/checkout_two_items.py | 86 ----- .../expected/deathstar_recommendation.py | 112 ------- test_programs/expected/deathstar_search.py | 63 ---- test_programs/expected/deathstar_user.py | 57 ---- test_programs/target/checkout_two_items.py | 23 -- .../target/deathstar_recommendation.py | 20 -- test_programs/target/deathstar_search.py | 14 - test_programs/target/deathstar_user.py | 16 - .../test_dataflow_graph_builder.py | 23 +- .../dataflow_analysis/test_entities.py | 61 ++++ .../dataflow_analysis/test_split_functions.py | 72 +++++ .../flink-runtime/test_select_all.py | 302 +++++++++--------- tests/optimizations/test_parallelize.py | 232 ++++++++++++++ 33 files changed, 1229 insertions(+), 833 deletions(-) delete mode 100644 test_programs/expected/checkout_two_items.py delete mode 100644 test_programs/expected/deathstar_recommendation.py delete mode 100644 test_programs/expected/deathstar_search.py delete mode 100644 test_programs/expected/deathstar_user.py delete mode 100644 test_programs/target/checkout_two_items.py delete mode 100644 test_programs/target/deathstar_recommendation.py delete mode 100644 test_programs/target/deathstar_search.py delete mode 100644 test_programs/target/deathstar_user.py create mode 100644 tests/frontend/dataflow_analysis/test_entities.py create mode 100644 tests/frontend/dataflow_analysis/test_split_functions.py create mode 100644 tests/optimizations/test_parallelize.py diff --git a/deathstar_hotel_reservation/test_demo.py b/deathstar_hotel_reservation/test_demo.py index dea227f..05302ce 100644 --- a/deathstar_hotel_reservation/test_demo.py +++ b/deathstar_hotel_reservation/test_demo.py @@ -1,100 +1,100 @@ -import os -import sys - -# import cascade -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) - -from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime -from cascade.runtime.flink_runtime import FlinkClientSync, FlinkRuntime -from deathstar_hotel_reservation.demo import DeathstarDemo, recommend, reserve, search_hotel, user_login -import time -import pytest - -@pytest.mark.integration -def test_deathstar_demo(): - ds = DeathstarDemo() - 
ds.init_runtime(FlinkRuntime("deathstardemo-test", "dsd-out")) - ds.runtime.run(run_async=True) - print("Populating, press enter to go to the next step when done") - ds.populate() - - client = FlinkClientSync("deathstardemo-test", "dsd-out") - input() - print("testing user login") - event = user_login() - client.send(event) - - input() - print("testing reserve") - event = reserve() - client.send(event) - - input() - print("testing search") - event = search_hotel() - client.send(event) - - input() - print("testing recommend (distance)") - time.sleep(0.5) - event = recommend(req_param="distance") - client.send(event) - - input() - print("testing recommend (price)") - time.sleep(0.5) - event = recommend(req_param="price") - client.send(event) - - print(client._futures) - input() - print("done!") - print(client._futures) - -def test_deathstar_demo_python(): - ds = DeathstarDemo() - ds.init_runtime(PythonRuntime()) - ds.runtime.run() - print("Populating, press enter to go to the next step when done") - ds.populate() - - time.sleep(0.1) - - client = PythonClientSync(ds.runtime) - print("testing user login") - event = user_login() - result = client.send(event) - assert result == True - event = user_login(succesfull=False) - result = client.send(event) - assert result == False - - print("testing reserve") - event = reserve() - result = client.send(event) - assert result == True - - return - print("testing search") - event = search_hotel() - result = client.send(event) - print(result) - - print("testing recommend (distance)") - time.sleep(0.5) - event = recommend(req_param="distance") - result = client.send(event) - print(result) - - print("testing recommend (price)") - time.sleep(0.5) - event = recommend(req_param="price") - result = client.send(event) - print(result) - - print("done!") - - -if __name__ == "__main__": - test_deathstar_demo() \ No newline at end of file +# import os +# import sys + +# # import cascade +# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) + +# from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime +# from cascade.runtime.flink_runtime import FlinkClientSync, FlinkRuntime +# from deathstar_hotel_reservation.demo import DeathstarDemo, recommend, reserve, search_hotel, user_login +# import time +# import pytest + +# @pytest.mark.integration +# def test_deathstar_demo(): +# ds = DeathstarDemo() +# ds.init_runtime(FlinkRuntime("deathstardemo-test", "dsd-out")) +# ds.runtime.run(run_async=True) +# print("Populating, press enter to go to the next step when done") +# ds.populate() + +# client = FlinkClientSync("deathstardemo-test", "dsd-out") +# input() +# print("testing user login") +# event = user_login() +# client.send(event) + +# input() +# print("testing reserve") +# event = reserve() +# client.send(event) + +# input() +# print("testing search") +# event = search_hotel() +# client.send(event) + +# input() +# print("testing recommend (distance)") +# time.sleep(0.5) +# event = recommend(req_param="distance") +# client.send(event) + +# input() +# print("testing recommend (price)") +# time.sleep(0.5) +# event = recommend(req_param="price") +# client.send(event) + +# print(client._futures) +# input() +# print("done!") +# print(client._futures) + +# def test_deathstar_demo_python(): +# ds = DeathstarDemo() +# ds.init_runtime(PythonRuntime()) +# ds.runtime.run() +# print("Populating, press enter to go to the next step when done") +# ds.populate() + +# time.sleep(0.1) + +# client = PythonClientSync(ds.runtime) +# 
print("testing user login") +# event = user_login() +# result = client.send(event) +# assert result == True +# event = user_login(succesfull=False) +# result = client.send(event) +# assert result == False + +# print("testing reserve") +# event = reserve() +# result = client.send(event) +# assert result == True + +# return +# print("testing search") +# event = search_hotel() +# result = client.send(event) +# print(result) + +# print("testing recommend (distance)") +# time.sleep(0.5) +# event = recommend(req_param="distance") +# result = client.send(event) +# print(result) + +# print("testing recommend (price)") +# time.sleep(0.5) +# event = recommend(req_param="price") +# result = client.send(event) +# print(result) + +# print("done!") + + +# if __name__ == "__main__": +# test_deathstar_demo() \ No newline at end of file diff --git a/deathstar_movie_review/entities/frontend.py b/deathstar_movie_review/entities/frontend.py index fce328f..88a71cf 100644 --- a/deathstar_movie_review/entities/frontend.py +++ b/deathstar_movie_review/entities/frontend.py @@ -30,10 +30,11 @@ def compose_compiled_0(variable_map: dict[str, Any]): frontend_op = StatelessOperator( + Frontend, { "empty": compose_compiled_0, }, - None + {} ) def frontend_df_serial(): @@ -85,7 +86,7 @@ def frontend_df_serial(): df.add_edge(Edge(n7a, n7)) df.add_edge(Edge(n7, n8)) - df.entry = n0 + df.entry = [n0] return df def frontend_df_parallel(): @@ -135,5 +136,5 @@ def frontend_df_parallel(): df.entry = [n1_a, n3_a, n5_a, n7] return df -frontend_op.dataflow = frontend_df_parallel() +frontend_op.dataflows["compose"] = frontend_df_parallel() diff --git a/deathstar_movie_review/entities/text.py b/deathstar_movie_review/entities/text.py index 2bf2e69..ebccf44 100644 --- a/deathstar_movie_review/entities/text.py +++ b/deathstar_movie_review/entities/text.py @@ -15,15 +15,16 @@ def upload_text_2_compiled_0(variable_map: dict[str, Any]): pass text_op = StatelessOperator( + Text, { "upload_text_2": upload_text_2_compiled_0 }, - None + {} ) df = DataFlow("upload_text") n0 = StatelessOpNode(text_op, InvokeMethod("upload_text_2")) n1 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review") df.add_edge(Edge(n0, n1)) -df.entry = n0 -text_op.dataflow = df \ No newline at end of file +df.entry = [n0] +text_op.dataflows[df.name] = df \ No newline at end of file diff --git a/deathstar_movie_review/entities/unique_id.py b/deathstar_movie_review/entities/unique_id.py index 1ca9bfe..007db32 100644 --- a/deathstar_movie_review/entities/unique_id.py +++ b/deathstar_movie_review/entities/unique_id.py @@ -1,7 +1,7 @@ from typing import Any import uuid from cascade.dataflow.dataflow import DataFlow, InvokeMethod, OpNode, StatelessOpNode -from cascade.dataflow.operator import StatelessOperator +from cascade.dataflow.operator import Block, StatelessOperator from deathstar_movie_review.entities.compose_review import ComposeReview @@ -19,14 +19,15 @@ def upload_unique_compiled_0(variable_map: dict[str, Any]): variable_map["review_id"] = uuid.uuid1().int >> 64 unique_id_op = StatelessOperator( + UniqueId, { - "upload_unique": upload_unique_compiled_0, + "upload_unique": Block(name="upload_unique", function_call=upload_unique_compiled_0, var_map_writes=["review_id"], var_map_reads=[]), }, - None + {} ) df = DataFlow("upload_unique_id") n0 = StatelessOpNode(unique_id_op, InvokeMethod("upload_unique")) n1 = OpNode(ComposeReview, InvokeMethod("upload_unique_id"), read_key_from="review") -df.entry = n0 -unique_id_op.dataflow = df +df.entry = [n0] 
+unique_id_op.dataflows[df.name] = df diff --git a/deathstar_movie_review/entities/user.py b/deathstar_movie_review/entities/user.py index e883277..c73511c 100644 --- a/deathstar_movie_review/entities/user.py +++ b/deathstar_movie_review/entities/user.py @@ -1,7 +1,7 @@ from typing import Any from deathstar_movie_review.entities.compose_review import ComposeReview from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode -from cascade.dataflow.operator import StatefulOperator +from cascade.dataflow.operator import Block, StatefulOperator class User: diff --git a/notebooks/dataflow_example.ipynb b/notebooks/dataflow_example.ipynb index 099343e..90b3472 100644 --- a/notebooks/dataflow_example.ipynb +++ b/notebooks/dataflow_example.ipynb @@ -313,12 +313,12 @@ }, { "cell_type": "code", - "execution_count": 389, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from textwrap import indent\n", - "from cascade.frontend.generator.generate_split_functions import GenerateSplittFunctions\n", + "from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions\n", "from cascade.frontend.intermediate_representation import Block\n", "\n", "compiled_functions, df = GenerateSplittFunctions.generate_split_function_string(block_level_dataflow_graph)" @@ -485,7 +485,7 @@ }, { "cell_type": "code", - "execution_count": 456, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -504,7 +504,7 @@ } ], "source": [ - "split_functions = GenerateSplittFunctions.generate(dataflow_graph)\n", + "split_functions = GenerateSplitFunctions.generate(dataflow_graph)\n", "\n", "\n", "for i, split in enumerate(split_functions):\n", @@ -617,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 452, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -639,7 +639,7 @@ } ], "source": [ - "split_functions = GenerateSplittFunctions.generate(dataflow_graph)\n", + "split_functions = GenerateSplitFunctions.generate(dataflow_graph)\n", "\n", "\n", "for i, split in enumerate(split_functions):\n", diff --git a/src/cascade/core.py b/src/cascade/core.py index ae53c65..127a019 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -6,11 +6,12 @@ from klara.core.cfg import Cfg +from cascade.dataflow.operator import Block, StatefulOperator, StatelessOperator from cascade.wrappers import ClassWrapper from cascade.descriptors import ClassDescriptor, MethodDescriptor -from cascade.frontend.generator.generate_split_functions import GenerateSplittFunctions +from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions from cascade.frontend.generator.generate_dataflow import GenerateDataflow -from cascade.dataflow.dataflow import DataFlow +from cascade.dataflow.dataflow import DataFlow, Operator from cascade.frontend.intermediate_representation import StatementDataflowGraph from cascade.frontend.generator.build_compiled_method_string import BuildCompiledMethodsString from cascade.frontend.ast_visitors import ExtractTypeVisitor @@ -26,6 +27,7 @@ def setup_cfg(code: str) -> Cfg: registered_classes: list[ClassWrapper] = [] +operators: dict[str, Operator] = {} def cascade(cls, parse_file=True): if not isclass(cls): @@ -52,10 +54,32 @@ def cascade(cls, parse_file=True): registered_classes.append(class_wrapper) + +def build(method) -> tuple[DataFlow, list[Block]]: + # TODO: implement + pass + def init(): for cls in registered_classes: + op_name = cls.class_desc.class_name + + if cls.class_desc.is_stateless: + op = StatelessOperator(cls.cls, {}, 
{}) + else: + op = StatefulOperator(cls.cls, {}, {}) + + op: Operator = op + + # generate split functions for method in cls.class_desc.methods_dec: method.build_dataflow() + df, blocks = build(method) + op.dataflows[df.name] = df + for b in blocks: + op.methods[b.name] = b + + operators[op_name] = op + def get_entity_names() -> str: @@ -74,7 +98,7 @@ def get_compiled_methods() -> str: continue dataflow_graph: StatementDataflowGraph = method_desc.dataflow instance_type_map: dict[str, str] = ExtractTypeVisitor.extract(method_desc.method_node) - split_functions = GenerateSplittFunctions.generate(dataflow_graph, cls_desc.class_name, entities, instance_type_map) + split_functions = GenerateSplitFunctions.generate(dataflow_graph, cls_desc.class_name, entities, instance_type_map) df: DataFlow = GenerateDataflow.generate(split_functions, instance_type_map) class_compiled_methods: str = BuildCompiledMethodsString.build(split_functions) compiled_methods.append(class_compiled_methods) diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 56b494a..ed42f38 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -1,19 +1,29 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Any, Callable, List, Optional, Type, Union +from typing import Any, Callable, List, Mapping, Optional, Type, Union from typing import TYPE_CHECKING import uuid + if TYPE_CHECKING: # Prevent circular imports from cascade.dataflow.operator import StatelessOperator + from cascade.dataflow.operator import StatefulOperator + from cascade.dataflow.operator import Block class Operator(ABC): + dataflows: dict[str, 'DataFlow'] + methods: Mapping[str, 'Block'] + @abstractmethod def name(self) -> str: pass + def get_method_rw_set(self, method_name: str) -> tuple[list[str], list[str]]: + method = self.methods[method_name] + return method.var_map_reads, method.var_map_writes + @dataclass class InitClass: """A method type corresponding to an `__init__` call.""" @@ -27,10 +37,6 @@ class InvokeMethod: def __repr__(self) -> str: return f"{self.__class__.__name__}('{self.method_name}')" -@dataclass -class Filter: - """Filter by this function""" - filter_fn: Callable @dataclass class Node(ABC): @@ -57,8 +63,8 @@ class OpNode(Node): A `Dataflow` may reference the same entity multiple times. 
The `StatefulOperator` that this node belongs to is referenced by `entity`.""" - entity: Type - method_type: Union[InitClass, InvokeMethod, Filter] + operator: 'StatefulOperator' + method_type: Union[InitClass, InvokeMethod] read_key_from: str """Which variable to take as the key for this StatefulOperator""" @@ -143,51 +149,74 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['E return OpNode.propogate_opnode(self, event, targets, result) @dataclass -class DataflowNode(Node): +class DataflowRef: + operator_name: str + dataflow_name: str + + def get_dataflow(self) -> 'DataFlow': + operator: Operator = cascade.operators[operator_name] + return operator.dataflows[self.dataflow_name] + + def __repr__(self) -> str: + return f"{self.operator_name}.{self.dataflow_name}" + + def __hash__(self) -> int: + return hash(repr(self)) + + +@dataclass +class CallEntity(Node): """A node in a `DataFlow` corresponding to the call of another dataflow""" - dataflow: 'DataFlow' + dataflow: DataflowRef + """The dataflow to call.""" + variable_rename: dict[str, str] - + """A mapping of input variables (to the dataflow) to variables in the variable map""" + assign_result_to: Optional[str] = None """What variable to assign the result of this node to, if any.""" - is_conditional: bool = False - """Whether or not the boolean result of this node dictates the following path.""" - collect_target: Optional['CollectTarget'] = None - """Whether the result of this node should go to a CollectNode.""" + + key: Optional[str] = None + """The key, for calls to Stateful Entities""" def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['Event']: # remap the variable map of event into the new event - - # add the targets as some sort of dataflow "exit nodes" - return self.dataflow + new_var_map = {key: event.variable_map[value] for key, value in self.variable_rename.items()} + + df = self.dataflow.get_dataflow() + new_targets = df.entry + if not isinstance(new_targets, list): + new_targets = [new_targets] + + # targets: the list of targets to go to after this dataflow node + call = CallStackItem(event.dataflow, self.assign_result_to, event.variable_map, targets) + event.call_stack.append(call) + return [Event( + target, + new_var_map, + df, + _id=event._id, + metadata=event.metadata, + call_stack=event.call_stack) + + for target in new_targets] @dataclass -class SelectAllNode(Node): - """A node type that will yield all items of an entity filtered by - some function. - - Think of this as executing `SELECT * FROM cls`""" - cls: Type - collect_target: 'CollectNode' - assign_key_to: str - - def propogate(self, event: 'Event', targets: List[Node], result: Any, keys: list[str]): - targets = event.dataflow.get_neighbors(event.target) - assert len(targets) == 1 - n = len(keys) - collect_targets = [ - CollectTarget(self.collect_target, n, i) - for i in range(n) - ] +class CallLocal(Node): + method: Union[InvokeMethod, InitClass] + + def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: + # For simple calls, we only need to change the target. 
+ # Multiple targets results in multiple events return [Event( - targets[0], - event.variable_map | {self.assign_key_to: key}, - event.dataflow, - _id=event._id, - collect_target=ct, - metadata=event.metadata) - for ct, key in zip(collect_targets, keys)] + target, + event.variable_map, + event.dataflow, + _id=event._id, + metadata=event.metadata) + + for target in targets] @dataclass class CollectNode(Node): @@ -195,22 +224,23 @@ class CollectNode(Node): It will aggregate incoming edges and output them as a list to the outgoing edge. Their actual implementation is runtime-dependent.""" - assign_result_to: str + assign_result_to: str = "" """The variable name in the variable map that will contain the collected result.""" - read_results_from: str + read_results_from: str = "" """The variable name in the variable map that the individual items put their result in.""" def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: - collect_targets = [event.collect_target for i in range(len(targets))] + # collect_targets = [event.collect_target for i in range(len(targets))] return [Event( target, event.variable_map, event.dataflow, _id=event._id, - collect_target=ct, + call_stack=event.call_stack, + # collect_target=ct, metadata=event.metadata) - for target, ct in zip(targets, collect_targets)] + for target in targets] @dataclass class Edge(): @@ -243,11 +273,17 @@ class DataFlow: collect-- [item1_price, item2_price] -->user2; ``` """ - def __init__(self, name: str): + # TODO: op should not be optional + def __init__(self, name: str, op_name: str=None, args: list[str]=None): self.name: str = name self.adjacency_list: dict[int, list[int]] = {} self.nodes: dict[int, Node] = {} - self.entry: Union[Node, List[Node]] = None + self.entry: List[Node] = [] + self.op_name = op_name + self.args = args + + def get_operator(self) -> Operator: + return cascade.ops[self.op_name] def add_node(self, node: Node): """Add a node to the Dataflow graph if it doesn't already exist.""" @@ -280,10 +316,10 @@ def remove_node(self, node: Node): return # Node doesn't exist in the graph - if isinstance(node, OpNode) or isinstance(node, StatelessOpNode): - assert not node.is_conditional, "there's no clear way to remove a conditional node" - assert not node.assign_result_to, "can't delete node whose result is used" - assert not node.collect_target, "can't delete node which has a collect_target" + # if isinstance(node, OpNode) or isinstance(node, StatelessOpNode): + # assert not node.is_conditional, "there's no clear way to remove a conditional node" + # assert not node.assign_result_to, "can't delete node whose result is used" + # assert not node.collect_target, "can't delete node which has a collect_target" # Find parents (nodes that have edges pointing to this node) parents = [parent_id for parent_id, children in self.adjacency_list.items() if node.id in children] @@ -292,10 +328,10 @@ def remove_node(self, node: Node): children = self.adjacency_list[node.id] # Set df entry - if self.entry == node: + if len(self.entry) == 1 and self.entry[0] == node: print(children) assert len(children) == 1, "cannot remove entry node if it doesn't exactly one child" - self.entry = self.nodes[children[0]] + self.entry = [self.nodes[children[0]]] # Connect each parent to each child for parent_id in parents: @@ -315,7 +351,6 @@ def remove_node(self, node: Node): child_node = self.nodes[child_id] self.remove_edge(node, child_node) - # Remove the node from the adjacency list and nodes dictionary del 
self.adjacency_list[node.id] @@ -326,6 +361,11 @@ def get_neighbors(self, node: Node) -> List[Node]: """Get the outgoing neighbors of this `Node`""" return [self.nodes[id] for id in self.adjacency_list.get(node.id, [])] + def get_predecessors(self, node: Node) -> List[Node]: + """Get the predeccors of this node by following incoming edges""" + return [self.nodes[id] for id, adj in self.adjacency_list.items() if node.id in adj] + + def to_dot(self) -> str: """Output the DataFlow graph in DOT (Graphviz) format.""" lines = [f"digraph {self.name} {{"] @@ -351,6 +391,9 @@ def generate_event(self, variable_map: dict[str, Any]) -> Union['Event', list['E return [first_event] + [Event(entry, variable_map, self, _id=id) for entry in self.entry[1:]] else: return Event(self.entry, variable_map, self) + + def __repr__(self) -> str: + return f"{self.op.name()}.{self.name}" @dataclass class CollectTarget: @@ -368,6 +411,14 @@ def metadata_dict() -> dict: "flink_time": 0 } +@dataclass +class CallStackItem: + dataflow: DataFlow + assign_result_to: Optional[str] + var_map: dict[str, str] + """Variables are saved in the call stack""" + targets: Union[Node, List[Node]] + @dataclass class Event(): """An Event is an object that travels through the Dataflow graph.""" @@ -379,46 +430,62 @@ class Event(): """A mapping of variable identifiers to values. If `target` is an `OpNode` this map should include the variables needed for that method.""" - dataflow: Optional['DataFlow'] + dataflow: DataFlow """The Dataflow that this event is a part of. If None, it won't propogate. This might be remove in the future in favour of a routing operator.""" _id: int = field(default=None) # type: ignore (will get updated in __post_init__ if unset) """Unique ID for this event. Except in `propogate`, this `id` should not be set.""" - collect_target: Optional[CollectTarget] = field(default=None) - """Tells each mergenode (key) how many events to merge on""" + # collect_target: Optional[CollectTarget] = field(default=None) + # """Tells each mergenode (key) how many events to merge on""" - # _id_counter: int = field(init=False, default=0, repr=False) + call_stack: List[CallStackItem] = field(default_factory=list) + """Target used when dataflow is done, used for recursive dataflows.""" metadata: dict = field(default_factory=metadata_dict) """Event metadata containing, for example, timestamps for benchmarking""" def __post_init__(self): if self._id is None: - # Assign a unique ID from the class-level counter + # Assign a unique ID self._id = uuid.uuid4().int - # self._id = Event._id_counter - # Event._id_counter += 1 - def propogate(self, result, select_all_keys: Optional[list[str]]=None) -> Union['EventResult', list['Event']]: + def propogate(self, result: Any) -> Union['EventResult', list['Event']]: """Propogate this event through the Dataflow.""" - - if self.dataflow is None: - return EventResult(self._id, result, self.metadata) targets = self.dataflow.get_neighbors(self.target) if len(targets) == 0: - return EventResult(self._id, result, self.metadata) + if len(self.call_stack) > 0: + caller = self.call_stack.pop() + + new_df = caller.dataflow + new_targets = caller.targets + if not isinstance(new_targets, list): + new_targets = [new_targets] + var_map = caller.var_map + if (x := caller.assign_result_to): + var_map[x] = result + + return [Event( + target, + var_map, + new_df, + _id=self._id, + metadata=self.metadata, + ) + + for target in new_targets] + + + else: + return EventResult(self._id, result, self.metadata) else: current_node = 
self.target - if isinstance(current_node, SelectAllNode): - assert select_all_keys - return current_node.propogate(self, targets, result, select_all_keys) - else: - return current_node.propogate(self, targets, result) + + return current_node.propogate(self, targets, result) @dataclass class EventResult(): diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py index 091f3cf..6307834 100644 --- a/src/cascade/dataflow/operator.py +++ b/src/cascade/dataflow/operator.py @@ -1,5 +1,6 @@ -from abc import ABC -from typing import Any, Generic, Protocol, Type, TypeVar +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Generic, Mapping, Protocol, Type, TypeVar, Union from cascade.dataflow.dataflow import DataFlow, InvokeMethod, Operator T = TypeVar('T') @@ -25,6 +26,23 @@ def my_compiled_method(variable_map: dict[str, Any], state: T) -> Any def __call__(self, variable_map: dict[str, Any], state: T) -> Any: ... """@private""" +@dataclass +class Block(ABC): + var_map_writes: list[str] + var_map_reads: list[str] + name: str + function_call: Union[MethodCall, 'StatelessMethodCall'] + # TODO: remove "None" + raw_method_string: str = None + + def call(self, *args, **kwargs) -> Any: + return self.function_call(*args, **kwargs) + + +class StatelessMethodCall(Protocol): + def __call__(self, variable_map: dict[str, Any]) -> Any: ... + """@private""" + class StatefulOperator(Generic[T], Operator): """An abstraction for a user-defined python class. @@ -38,7 +56,8 @@ class StatefulOperator(Generic[T], Operator): methods, instead reading and modifying the underlying class `T` through a state variable, see `handle_invoke_method`. """ - def __init__(self, entity: Type[T], methods: dict[str, MethodCall[T]], dataflows: dict[str, DataFlow]): + # TODO: keyby should not be optional + def __init__(self, entity: Type[T], methods: dict[str, Block], dataflows: dict[str, DataFlow], keyby: str=""): """Create the StatefulOperator from a class and its compiled methods. Typically, a class could be comprised of split and non-split methods. Take the following example: @@ -88,9 +107,10 @@ def user_buy_item_1(variable_map: dict[str, Any], state: User): ``` """ # methods maps function name to a function. Ideally this is done once in the object - self._methods = methods + self.methods = methods self.entity = entity self.dataflows = dataflows + self.keyby = keyby """A mapping from method names to DataFlows""" @@ -98,7 +118,7 @@ def handle_init_class(self, *args, **kwargs) -> T: """Create an instance of the underlying class. Equivalent to `T.__init__(*args, **kwargs)`.""" return self.entity(*args, **kwargs) - def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any], state: T) -> dict[str, Any]: + def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any], state: T): """Invoke the method of the underlying class. The `cascade.dataflow.dataflow.InvokeMethod` object must contain a method identifier @@ -106,25 +126,25 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any The state `T` is passed along to the function, and may be modified. 
""" - return self._methods[method.method_name](variable_map=variable_map, state=state) + return self.methods[method.method_name].call(variable_map=variable_map, state=state) + def get_method_rw_set(self, method_name: str): + return super().get_method_rw_set(method_name) + def name(self): return self.entity.__name__ -class StatelessMethodCall(Protocol): - def __call__(self, variable_map: dict[str, Any]) -> Any: ... - """@private""" - class StatelessOperator(Operator): """A StatelessOperator refers to a stateless function and therefore only has one dataflow.""" - def __init__(self, methods: dict[str, StatelessMethodCall], dataflow: DataFlow): - self._methods = methods - self.dataflow = dataflow + def __init__(self, entity: Type, methods: dict[str, Block], dataflows: dict[str, DataFlow]): + self.entity = entity + self.methods = methods + self.dataflows = dataflows - def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any]) -> dict[str, Any]: + def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any]): """Invoke the method of the underlying class. The `cascade.dataflow.dataflow.InvokeMethod` object must contain a method identifier @@ -132,8 +152,12 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any The state `T` is passed along to the function, and may be modified. """ - return self._methods[method.method_name](variable_map=variable_map) + return self.methods[method.method_name].call(variable_map=variable_map) + + def get_method_rw_set(self, method_name: str): + return super().get_method_rw_set(method_name) def name(self) -> str: - return self.dataflow.name + return self.entity.__name__ + diff --git a/src/cascade/descriptors/class_descriptor.py b/src/cascade/descriptors/class_descriptor.py index 40271a4..4310f15 100644 --- a/src/cascade/descriptors/class_descriptor.py +++ b/src/cascade/descriptors/class_descriptor.py @@ -18,6 +18,12 @@ def __init__( self.module_node: nodes.Module = module_node self.class_node: nodes.ClassDef = class_node self.methods_dec: list[MethodDescriptor] = methods_dec + + self.is_stateless = True + for method in methods_dec: + if method.method_name == "__init__": + self.is_stateless = False + break def get_method_by_name(self, name: str): return next(m for m in self.methods_dec if m.method_name == name) diff --git a/src/cascade/frontend/ast_visitors/extract_type_visitor.py b/src/cascade/frontend/ast_visitors/extract_type_visitor.py index 3634910..3be142c 100644 --- a/src/cascade/frontend/ast_visitors/extract_type_visitor.py +++ b/src/cascade/frontend/ast_visitors/extract_type_visitor.py @@ -1,7 +1,7 @@ from klara.core.ssa_visitors import AstVisitor from klara.core.nodes import AnnAssign, Arg from klara.core import nodes - +from klara.core.node_classes import Name class ExtractTypeVisitor(AstVisitor): @@ -23,11 +23,11 @@ def visit_annassign(self, node: AnnAssign): def visit_arg(self, arg: Arg): annotation = arg.annotation var_type = type(annotation) + # TODO: Find a better way to get the SSA version from Arg + id: str = arg.arg + "_0" if var_type == nodes.Const: - id: str = arg.arg self.type_map[id] = annotation.value elif annotation != None: - id: str = arg.arg self.type_map[id] = str(annotation.id) def get_type_map(self) -> dict[str, str]: diff --git a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py b/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py index 51bd9dc..332d204 100644 --- a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py +++ 
b/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py @@ -35,8 +35,8 @@ def extract_statment_list(self): statements.append(statement) variable_getter = VariableGetter.get_variable(b) targets, values = variable_getter.targets, variable_getter.values - statement.targets = targets - statement.values = values + statement.targets = [t.__repr__() for t in targets] + statement.values = [v.__repr__() for v in values] contains_attribute, attribute = ContainsAttributeVisitor.check_return_attribute(b) if contains_attribute: if attribute.value.id != 'self': diff --git a/src/cascade/frontend/generator/generate_dataflow.py b/src/cascade/frontend/generator/generate_dataflow.py index 5bb1182..3bcd62d 100644 --- a/src/cascade/frontend/generator/generate_dataflow.py +++ b/src/cascade/frontend/generator/generate_dataflow.py @@ -1,16 +1,17 @@ +from cascade.dataflow.operator import Block from cascade.frontend.generator.split_function import SplitFunction -from cascade.dataflow.dataflow import DataFlow, OpNode, InvokeMethod, Edge +from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, OpNode, InvokeMethod, Edge class GenerateDataflow: """ Generates dataflow """ - def __init__(self, split_functions: list[SplitFunction], instance_type_map: dict[str, str]): + def __init__(self, split_functions: list[SplitFunction], instance_type_map: dict[str, str], method_name, op_name, args): #TODO: add buildcontext that contains class name and target method self.split_functions = split_functions - class_name = "class_name" # TODO: remove placeholder - self.df = DataFlow(class_name) + self.df = DataFlow(method_name, op_name, args) + self.blocks: list[Block] = [] self.instance_type_map = instance_type_map def generate_dataflow(self): @@ -22,14 +23,20 @@ def build_dataflow(self): """ nodes = [] for split in self.split_functions: - node = OpNode(split.class_name, InvokeMethod(split.method_name)) + node = CallLocal(InvokeMethod(split.method_name)) self.df.add_node(node) nodes.append([node]) if split.remote_calls: # TODO: instance_name -> correct entity (maybe using buildcontext/ instance type map) - next_nodes = [OpNode(self.instance_type_map[remote.instance_name], InvokeMethod(remote.attribute), assign_result_to=remote.target) - for remote in split.remote_calls] + next_nodes = [] + for remote in split.remote_calls: + df = DataflowRef(self.instance_type_map[remote.instance_name], remote.attribute) + args = df.get_dataflow.args + # TODO: proper variable renaming + vars = {arg: arg for arg in args} + call = CallEntity(df, vars, assign_result_to=remote.target) + next_nodes.append(call) nodes.append(next_nodes) self.df.entry = nodes[0][0] @@ -47,7 +54,7 @@ def extract_remote_method_calls(self): split.extract_remote_method_calls() @classmethod - def generate(cls, split_functions: list[SplitFunction], instance_type_map: dict[str, str]) -> DataFlow: - c = cls(split_functions, instance_type_map) + def generate(cls, split_functions: list[SplitFunction], instance_type_map: dict[str, str], method_name, op_name, args) -> tuple[DataFlow, list[Block]]: + c = cls(split_functions, instance_type_map, method_name, op_name, args) c.generate_dataflow() - return c.df \ No newline at end of file + return c.df, c.blocks \ No newline at end of file diff --git a/src/cascade/frontend/generator/generate_split_functions.py b/src/cascade/frontend/generator/generate_split_functions.py index c90a9a6..2580081 100644 --- a/src/cascade/frontend/generator/generate_split_functions.py +++ 
b/src/cascade/frontend/generator/generate_split_functions.py @@ -1,21 +1,24 @@ from itertools import count +from typing import List, Type import networkx as nx +from cascade.dataflow.dataflow import DataFlow, DataflowRef, Edge +from cascade.dataflow.operator import Block +from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor +from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph -from cascade.frontend.generator.split_function import SplitFunction +from cascade.frontend.generator.split_function import SplitFunction, SplitFunction2, to_entity_call from klara.core import nodes -from klara.core.cfg import RawBasicBlock -class GenerateSplittFunctions: +class GenerateSplitFunctions: - def __init__(self, dataflow_graph: StatementDataflowGraph, class_name: str, entities: list[str], instance_type_map: dict[str, str]): + def __init__(self, dataflow_graph: StatementDataflowGraph, class_name: str, entity_map: dict[str, str]): self.dataflow_graph: StatementDataflowGraph = dataflow_graph self.class_name: str = class_name - self.entities: list[str] = entities - self.instance_type_map: dict[str, str] = instance_type_map # {"instance_name": "EntityType"} + self.entity_map: dict[str, str] = entity_map # {"instance_name": "EntityType"} self.dataflow_node_map = dict() self.counter = count() self.split_functions = [] @@ -27,7 +30,7 @@ def generate_split_functions(self): # targets = copy.copy(entry_node.targets) continuation = list(G.nodes) while self.invokes_remote_entity(continuation): - first_half, continuation = self.split_fuction(G) + first_half, continuation = self.split_function(G) self.add_split_function(first_half) G = G.subgraph(continuation) # TODO: Add a new source node to continuation @@ -45,19 +48,13 @@ def add_split_function(self, statements: list[Statement]): self.split_functions.append(split_f) def value_is_entity(self, value: nodes.Name) -> bool: - value_id = value.id - instance_type_map: dict[str,str] = self.instance_type_map - if not value_id in instance_type_map: - return False - entity_type: str = instance_type_map[value_id] - return entity_type in self.entities - + return value.id in self.entity_map def invokes_remote_entity(self, statments: list[Statement]) -> bool: """Returns whether statements contains a remote invocation""" return any(s.is_remote() for s in statments) - def split_fuction(self, G: nx.DiGraph): + def split_function(self, G: nx.DiGraph) -> tuple[list[Statement], list[Statement]]: """ Produces split functions. Assumes that the runtime will always return to initial function call. Therefore functions containing a remote function call (one to a remote entity) will be split into two functions: one function adding the keys to the stack of the remote entities to call. And the continuation which the @@ -68,34 +65,135 @@ def split_fuction(self, G: nx.DiGraph): - Should also contain a liveness analyses to determine which variables should be passed on to the continuation. """ source: Statement = self.dataflow_graph.get_source_node() - first_half = set() # A set of nodes that are in the first half of the split function. + first_half = [] # A set of nodes that are in the first half of the split function. 
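        # Illustrative sketch (hypothetical statement names, not taken from this
        # patch): for a statement graph  s0 -> s1 -> r1 -> s2  where only r1 is a
        # remote call, this pass gathers the path up to the first remote call that
        # has no other remote calls before it, so
        #   first_half   == [s0, s1, r1]
        #   continuation == [s2]
        # and generate_split_functions() then splits the continuation again.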
for n in G.nodes: n: Statement if n == source or not n.is_remote(): continue elif self.no_remote_dependencies_on_path(G, source, n): self.add_nodes_path_to_first_half(G, source, n, first_half) - continuation = set(G.nodes) - first_half # The set of nodes in the continuation. + fh_set = set(first_half) + continuation = [] + for node in G.nodes: + if node not in fh_set: + continuation.append(node) return first_half, continuation def no_remote_dependencies_on_path(self, G: nx.DiGraph, source: Statement, target: Statement) -> bool: + print(source, target) for path in self.get_all_simple_paths(G, source, target): for n in path: if n not in [source, target] and n.is_remote(): return False return True - def add_nodes_path_to_first_half(self, G: nx.DiGraph, source: Statement, statement: Statement, split: set[Statement]): + def add_nodes_path_to_first_half(self, G: nx.DiGraph, source: Statement, statement: Statement, split: list[Statement]): for path in self.get_all_simple_paths(G, source, statement): for n in path: - split.add(n) + split.append(n) def get_all_simple_paths(self, G: nx.DiGraph, source: Statement, target: Statement): return nx.all_simple_paths(G, source=source, target=target) @classmethod - def generate(cls, dataflow_graph: StatementDataflowGraph, class_name: str, entities: list[str], instance_type_map: dict[str, str]): - c = cls(dataflow_graph, class_name, entities, instance_type_map) + def generate(cls, dataflow_graph: StatementDataflowGraph, class_name: str, entity_map: dict[str, str]): + c = cls(dataflow_graph, class_name, entity_map) c.generate_split_functions() return c.split_functions + + +class GroupStatements: + + # todo: cfg should be control flow graph, statements should also be a graph + # list only works for functions with no control flow + # instead, generate_grouped should take a list of nodes, where each node is a stament, + # and create a graph of nodes where each node is a list of statments + # thus statements are grouped if they are all local and in the same block of control flow + def __init__(self, function_def: nodes.FunctionDef): + self.function_def = function_def + + def build_cfg(self): + cfg: StatementDataflowGraph = DataflowGraphBuilder.build([self.function_def] + self.function_def.body) + self.type_map = ExtractTypeVisitor.extract(self.function_def) + cfg.name = self.function_def.name + + statements = list(cfg.get_nodes()) + statements.sort(key=lambda s: s.block_num) + self.statements = statements # TODO: for more complex control flow, use CFG structure instead + self._grouped_statements: List[List[Statement]] = [] + self.cfg = cfg + + def generate_grouped_statements(self) -> List[List[Statement]]: + entry_node: Statement = self.statements[0] + assert type(entry_node.block) == nodes.FunctionDef + + grouped_statements = [] + continuation = self.statements[1:] + while len(continuation) > 0: + first_half, continuation = self.split_statements(continuation) + grouped_statements.append(first_half) + + self._grouped_statements = grouped_statements + return grouped_statements + + def split_statements(self, statements: list[Statement]) -> tuple[list[Statement], list[Statement]]: + """ + Split a list of statements, by grouping together statements that are not remote calls. + + As an example, suppose r and s are both statements, where r is a remote call and s is not. 
+ + Here is how the list gets split: + [r, s, s, r, s] -> [r] + [s, s, r, s] + [s, s, r, s, s] -> [s, s] + [r, s, s] + [s, s, s] -> [s, s, s] + [] + """ + assert len(statements) > 0 + + if statements[0].is_remote(): + return [statements[0]], statements[1:] + + # find the next remote call + i = 0 + first_half = [] + while i < len(statements) and not statements[i].is_remote(): + first_half.append(statements[i]) + i += 1 + + continuation = statements[i:] + return first_half, continuation + + def build(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> tuple[DataFlow, List[Block]]: + self.build_cfg() + + self.generate_grouped_statements() + + blocks = [] + block_num = 0 + + args = self.function_def.args + df = DataFlow("name", "op_name", args) + + last_node = None + for split in self._grouped_statements: + print(split) + if len(split) == 1 and split[0].is_remote(): + # Entity call + node = to_entity_call(split[0], self.type_map, dataflows) + else: + # Group statements together, into a block + s = SplitFunction2(split, self.cfg.name, block_num, op_name) + block_num += 1 + node, block = s.to_block() + blocks.append(block) + + + if last_node == None: + last_node = node + df.add_node(node) + else: + df.add_edge(Edge(last_node, node)) + last_node = node + + return df, blocks + diff --git a/src/cascade/frontend/generator/split_function.py b/src/cascade/frontend/generator/split_function.py index dcc30d0..021e59c 100644 --- a/src/cascade/frontend/generator/split_function.py +++ b/src/cascade/frontend/generator/split_function.py @@ -1,7 +1,10 @@ from textwrap import indent from dataclasses import dataclass, field +from typing import Union +from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, InvokeMethod +from cascade.dataflow.operator import Block from cascade.frontend.util import to_camel_case from cascade.frontend.intermediate_representation import Statement from cascade.frontend.ast_visitors.replace_name import ReplaceName @@ -10,6 +13,7 @@ from klara.core.cfg import RawBasicBlock from klara.core import nodes +from klara.core.node_classes import Name @dataclass class SplitFunction: @@ -77,3 +81,113 @@ def add_statement_to_remote_call_set(self, statement: Statement): self.remote_calls.append(remote_call) +def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: dict[DataflowRef, DataFlow]) -> CallEntity: + """Transform a remote statement to an entity call.""" + writes = statement.targets + assert statement.is_remote() + assert len(writes) <= 1 + if len(writes) == 0: + assign = None + else: + assign = list(writes)[0] + + # repr includes version + operator_var, dataflow_name = repr(statement.attribute.value), statement.attribute.attr + + if operator_var in type_map: + operator_name = type_map[operator_var] + key = str(statement.attribute.value) + else: + # assume stateless operator + operator_name = operator_var + key = None + + dataflow = DataflowRef(operator_name, dataflow_name) + + args = statement.values.copy() + args.remove(operator_var) + df_args = dataflows[dataflow].args + + return CallEntity(dataflow, {a: b for a, b in zip(df_args, args, strict=True)}, assign_result_to=assign,key=key) + + +class SplitFunction2: + def __init__(self, statements: list[Statement], method_base_name: str, block_num: int, class_name: str): + assert len(statements) > 0 + # A block of statements should have no remote calls + assert all([not s.is_remote() for s in statements]) + + self.statements = statements + self.method_base_name = method_base_name + 
self.class_name = class_name + self.block_num = block_num + + writes, reads = set(), set() + for s in statements: + if type(s.block) != nodes.FunctionDef: + writes.update(t for t in s.targets) + reads.update(v for v in s.values) + + # If we assign a variable inside a function + # that means this variable can only have been assigned in this function, + # thanks to SSA. Thus we can remove it from reads, as it is local. + reads.difference_update(writes) + + # Additionally, writes with higher versions will override writes + # with lower versions. + # e.g. a_0 = 2 + # a_1 = 4 + # we want to remove a_0 from writes, as it will never be read by future + # blocks + + # writes.update + + self.reads = reads + self.writes = writes + + + def to_block(self) -> tuple[CallLocal, Block]: + local_scope = {} + raw_str = self.to_string() + exec(self.to_string(), {}, local_scope) + method_name = self.get_method_name() + fn = local_scope[method_name] + return CallLocal(InvokeMethod(method_name)), Block(list(self.writes), list(self.reads), method_name, fn, raw_str) + + def get_method_name(self): + return f"{self.method_base_name}_{self.block_num}" + + def to_string(self) -> str: + indent_prefix: str = ' ' * 4 # indent using 4 spaces. + body: str = indent(self.body_to_string(), indent_prefix) + method_signature: str = self.get_method_signature() + compiled_method_as_string: str = f'def {self.get_method_name()}({method_signature}):\n{body}' + return compiled_method_as_string + + def get_method_signature(self) -> str: + return f'variable_map, state' + + def body_to_string(self) -> str: + body = [] + + # Read from the variable map + for v in sorted(self.reads - self.writes): + if not (v in [ 'self_0','self']): + body.append(f'{v} = variable_map[\'{v}\']') + + # Write statements + for statement in self.statements: + block: RawBasicBlock = statement.block + if type(block) == nodes.FunctionDef: + continue + ReplaceName.replace(block, 'self', 'state') + + body.append(unparse(block)) + + if 'return' not in body[-1]: + # Write to the variable map + for v in sorted(self.writes - self.reads): + if not (v in [ 'self_0','self']): + body.append(f'variable_map[\'{v}\'] = {v}') + body.append('return None') + return "\n".join(body) \ No newline at end of file diff --git a/src/cascade/frontend/generator/unparser.py b/src/cascade/frontend/generator/unparser.py index a4a8677..e0d0177 100644 --- a/src/cascade/frontend/generator/unparser.py +++ b/src/cascade/frontend/generator/unparser.py @@ -9,7 +9,17 @@ def unparse(block: RawBasicBlock): case nodes.Return: return f'return {unparse(block.value)}' case nodes.AugAssign: - return f'{unparse(block.target)} {block.op}= {unparse(block.value)}' + raise NotImplementedError() + # TODO: augassign does not work well with ssa + # e.g. + # a = 1 + # a += 2 + # will generate: + # a_0 = 1 + # a_1 += 2 + # The last line should be desugared into + # a_1 = a_0 + 2 (perhapse with a n Ast.Visitor?) 
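            # A possible desugaring sketch, assuming klara's SSA targets expose a
            # base name and version (those attribute names are an assumption, not
            # taken from this patch): rewrite the AugAssign as a plain Assign that
            # reads the previous SSA version of the target, e.g.
            #
            #   prev = f"{block.target.name}_{block.target.version - 1}"   # "a_0"
            #   return f"{repr(block.target)} = {prev} {block.op} {unparse(block.value)}"
            #
            # An AstVisitor pass that performs this rewrite before unparsing would
            # achieve the same effect.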
+ return f'{repr(block.target)} {block.op}= {unparse(block.value)}' case nodes.Assign: target, *rest = block.targets return f'{repr(target)} = {unparse(block.value)}' diff --git a/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py b/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py index e495d89..54e2900 100644 --- a/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py +++ b/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py @@ -1,6 +1,9 @@ from dataclasses import dataclass +from typing import Iterable import networkx as nx +from cascade.frontend.intermediate_representation.statement import Statement + @dataclass class StatementDataflowGraph: @@ -14,8 +17,11 @@ class StatementDataflowGraph: def set_name(self, name: str): self.name = name - def get_nodes(self): + def get_nodes(self) -> Iterable[Statement]: return self.graph.nodes - def get_source_node(self): + def get_edges(self) -> Iterable[tuple[int, int]]: + return [(u.block_num, v.block_num) for u, v in self.graph.edges] + + def get_source_node(self) -> Statement: return next(iter(self.get_nodes())) diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index ef22ea9..9b7eec6 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -13,7 +13,7 @@ from pyflink.datastream import ProcessFunction, StreamExecutionEnvironment from pyflink.datastream.output_tag import OutputTag import pickle -from cascade.dataflow.dataflow import CollectNode, CollectTarget, Event, EventResult, Filter, InitClass, InvokeMethod, Node, OpNode, SelectAllNode, StatelessOpNode +from cascade.dataflow.dataflow import CollectNode, CollectTarget, Event, EventResult, InitClass, InvokeMethod, Node, OpNode, StatelessOpNode from cascade.dataflow.operator import StatefulOperator, StatelessOperator from confluent_kafka import Producer, Consumer import logging diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index a955e9c..7a73ed5 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -1,8 +1,8 @@ from logging import Filter import threading -from typing import Type +from typing import List, Type, Union from cascade.dataflow.operator import StatefulOperator, StatelessOperator -from cascade.dataflow.dataflow import CollectNode, Event, EventResult, InitClass, InvokeMethod, OpNode, SelectAllNode, StatelessOpNode +from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod, OpNode, StatelessOpNode from queue import Empty, Queue class PythonStatefulOperator(): @@ -11,32 +11,26 @@ def __init__(self, operator: StatefulOperator): self.states = {} def process(self, event: Event): - assert(isinstance(event.target, OpNode)) - assert(event.target.entity == self.operator.entity) + assert(isinstance(event.target, CallLocal)) + assert(isinstance(event.dataflow.op, StatefulOperator)) - key = event.variable_map[event.target.read_key_from] + key = event.variable_map[event.dataflow.op.keyby] print(f"PythonStatefulOperator[{self.operator.entity.__name__}[{key}]]: {event}") - if isinstance(event.target.method_type, InitClass): + if isinstance(event.target.method, InitClass): result = self.operator.handle_init_class(*event.variable_map.values()) self.states[key] = result - elif isinstance(event.target.method_type, InvokeMethod): + elif 
isinstance(event.target.method, InvokeMethod): state = self.states[key] result = self.operator.handle_invoke_method( - event.target.method_type, + event.target.method, variable_map=event.variable_map, state=state, ) self.states[key] = state - - elif isinstance(event.target.method_type, Filter): - raise NotImplementedError() - - if event.target.assign_result_to is not None: - event.variable_map[event.target.assign_result_to] = result - + new_events = event.propogate(result) if isinstance(new_events, EventResult): yield new_events @@ -48,20 +42,17 @@ def __init__(self, operator: StatelessOperator): self.operator = operator def process(self, event: Event): - assert(isinstance(event.target, StatelessOpNode)) + assert(isinstance(event.target, CallLocal)) print(f"PythonStatelessOperator[{self.operator.dataflow.name}]: {event}") - if isinstance(event.target.method_type, InvokeMethod): + if isinstance(event.target.method, InvokeMethod): result = self.operator.handle_invoke_method( - event.target.method_type, + event.target.method, variable_map=event.variable_map, ) else: - raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method_type}") - - if event.target.assign_result_to is not None: - event.variable_map[event.target.assign_result_to] = result + raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method}") new_events = event.propogate(result) if isinstance(new_events, EventResult): @@ -69,6 +60,42 @@ def process(self, event: Event): else: yield from new_events +class PythonCollectOperator(): + def __init__(self): + self.state = {} + + def process(self, event: Event): + key = event.target.id + if key not in self.state: + self.state[key] = [event] + else: + self.state[key].append(event) + + n = len(event.dataflow.get_predecessors(event.target)) + print(f"PythonCollectOperator: collected {len(self.state[key])}/{n} for event {event._id}") + + if len(self.state[key]) == n: + var_map = {} + for event in self.state[key]: + var_map.update(event.variable_map) + + new_event = Event( + target=event.target, + variable_map=var_map, + dataflow=event.dataflow, + _id=event._id, + call_stack=event.call_stack, + metadata=event.metadata + ) + new_events = new_event.propogate(None) + if isinstance(new_events, EventResult): + yield new_events + else: + yield from new_events + + + + class PythonRuntime(): """Simple non-distributed runtime meant for testing that runs Dataflows locally.""" @@ -76,8 +103,9 @@ def __init__(self): self.events = Queue() self.results = Queue() self.running = False - self.statefuloperators: dict[Type, PythonStatefulOperator] = {} + self.statefuloperators: dict[str, PythonStatefulOperator] = {} self.statelessoperators: dict[str, PythonStatelessOperator] = {} + self.collect = PythonCollectOperator() def init(self): pass @@ -85,15 +113,21 @@ def init(self): def _consume_events(self): self.running = True def consume_event(event: Event): - if isinstance(event.target, OpNode): - yield from self.statefuloperators[event.target.entity].process(event) - elif isinstance(event.target, StatelessOpNode): - yield from self.statelessoperators[event.target.operator.dataflow.name].process(event) + if isinstance(event.target, CallLocal): + if isinstance(event.dataflow.op, StatefulOperator): + yield from self.statefuloperators[event.dataflow.op.name()].process(event) + else: + yield from self.statelessoperators[event.dataflow.op.name()].process(event) + elif isinstance(event.target, CallEntity): + new_events = event.propogate(None) + print(new_events) + if 
isinstance(new_events, EventResult): + yield new_events + else: + yield from new_events - elif isinstance(event.target, SelectAllNode): - raise NotImplementedError() elif isinstance(event.target, CollectNode): - raise NotImplementedError() + yield from self.collect.process(event) events = [] @@ -115,11 +149,11 @@ def consume_event(event: Event): def add_operator(self, op: StatefulOperator): """Add a `StatefulOperator` to the datastream.""" - self.statefuloperators[op.entity] = PythonStatefulOperator(op) + self.statefuloperators[op.name()] = PythonStatefulOperator(op) def add_stateless_operator(self, op: StatelessOperator): """Add a `StatelessOperator` to the datastream.""" - self.statelessoperators[op.dataflow.name] = PythonStatelessOperator(op) + self.statelessoperators[op.name()] = PythonStatelessOperator(op) def send(self, event: Event, flush=None): self.events.put(event) @@ -138,12 +172,18 @@ def __init__(self, runtime: PythonRuntime): self._events = runtime.events self.results = {} - def send(self, event: Event, block=True): - self._events.put(event) + def send(self, event: Union[Event, List[Event]], block=True): + if isinstance(event, list): + for e in event: + self._events.put(e) + id = e._id + else: + self._events.put(event) + id = event._id while block: er: EventResult = self._results_q.get(block=True) - if event._id == er.event_id: + if id == er.event_id: self.results[er.event_id] = er.result return er.result diff --git a/test_programs/expected/checkout_item.py b/test_programs/expected/checkout_item.py index 75a32fa..4ff2828 100644 --- a/test_programs/expected/checkout_item.py +++ b/test_programs/expected/checkout_item.py @@ -1,6 +1,6 @@ from typing import Any -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode +from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, Edge, InvokeMethod, OpNode from test_programs.target.checkout_item import User, Item def buy_item_0_compiled(variable_map: dict[str, Any], state: User) -> Any: @@ -8,20 +8,24 @@ def buy_item_0_compiled(variable_map: dict[str, Any], state: User) -> Any: def buy_item_1_compiled(variable_map: dict[str, Any], state: User) -> Any: - item_price_0 = variable_map['item_price_0'] - state.balance -= item_price_0 + state.balance -= variable_map['item_price_0'] return state.balance >= 0 def get_price_0_compiled(variable_map: dict[str, Any], state: Item) -> Any: return state.price +def item_get_price_df(): + df = DataFlow("item.get_price") + n0 = CallLocal(InvokeMethod("get_price_0_compiled")) + df.entry = n0 + return df def user_buy_item_df(): df = DataFlow("user.buy_item") - n0 = OpNode(User, InvokeMethod("buy_item_0"), read_key_from="user_key") - n1 = OpNode(Item, InvokeMethod("get_price"), assign_result_to="item_price", read_key_from="item_key") - n2 = OpNode(User, InvokeMethod("buy_item_1"), read_key_from="user_key") + n0 = CallLocal(InvokeMethod("buy_item_0_compiled")) + n1 = CallEntity(item_get_price_df(), {}, "item_price_0") + n2 = CallLocal(InvokeMethod("buy_item_1_compiled")) df.add_edge(Edge(n0, n1)) df.add_edge(Edge(n1, n2)) df.entry = n0 diff --git a/test_programs/expected/checkout_two_items.py b/test_programs/expected/checkout_two_items.py deleted file mode 100644 index 9849ad5..0000000 --- a/test_programs/expected/checkout_two_items.py +++ /dev/null @@ -1,86 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, OpNode, InvokeMethod, Edge -from cascade.dataflow.operator import StatefulOperator -from 
test_programs.target.checkout_two_items import User, Item - -def buy_two_items_0_compiled(variable_map: dict[str, Any], state: User) -> Any: - return None - -def buy_two_items_1_compiled(variable_map: dict[str, Any], state: User) -> Any: - item_price_1_0 = variable_map['item_price_1_0'] - item_price_2_0 = variable_map['item_price_2_0'] - total_price_0 = item_price_1_0 + item_price_2_0 - state.balance -= total_price_0 - return state.balance >= 0 - -def get_price_0_compiled(variable_map: dict[str, Any], state: Item) -> Any: - return state.price - - -user_op = StatefulOperator( - User, - { - "buy_two_items_0": buy_two_items_0_compiled, - "buy_two_items_1": buy_two_items_1_compiled - }, - None) - -item_op = StatefulOperator( - Item, {"get_price": get_price_0_compiled}, None -) - -def user_buy_two_items_df(): - df = DataFlow("user.buy_2_items") - n0 = OpNode(User, InvokeMethod("buy_2_items_0"), read_key_from="user_key") - n1 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price_1", - read_key_from="item1_key" - ) - n2 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price_2", - read_key_from="item1_key" - ) - n3 = OpNode(User, InvokeMethod("buy_2_items_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n0, n2)) - df.add_edge(Edge(n1, n2)) - df.add_edge(Edge(n2, n3)) - df.entry = n0 - return df - - -# For future optimizations (not used) -def user_buy_two_items_df_parallelized(): - df = DataFlow("user.buy_2_items") - n0 = OpNode(User, InvokeMethod("buy_2_items_0"), read_key_from="user_key") - n3 = CollectNode(assign_result_to="item_prices", read_results_from="item_price") - n1 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 0), - read_key_from="item1_key" - ) - n2 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 1), - read_key_from="item1_key" - ) - n4 = OpNode(User, InvokeMethod("buy_2_items_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n0, n2)) - df.add_edge(Edge(n1, n3)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - df.entry = n0 - return df - -user_op.dataflows = { - "buy_two_items": user_buy_two_items_df(), -} \ No newline at end of file diff --git a/test_programs/expected/deathstar_recommendation.py b/test_programs/expected/deathstar_recommendation.py deleted file mode 100644 index 8a8a727..0000000 --- a/test_programs/expected/deathstar_recommendation.py +++ /dev/null @@ -1,112 +0,0 @@ -from typing import Any, Literal -from cascade.dataflow.dataflow import CollectNode, DataFlow, Edge, InvokeMethod, OpNode, SelectAllNode, StatelessOpNode -from cascade.dataflow.operator import StatelessOperator - - -def get_recs_if_cond(variable_map: dict[str, Any]): - return variable_map["requirement"] == "distance" - -# list comprehension entry -def get_recs_if_body_0(variable_map: dict[str, Any]): - pass - - -# list comprehension body -def get_recs_if_body_1(variable_map: dict[str, Any]): - hotel_geo = variable_map["hotel_geo"] - lat, lon = variable_map["lat"], variable_map["lon"] - dist = hotel_geo.distance_km(lat, lon) - return (dist, variable_map["hotel_key"]) - -# after list comprehension -def get_recs_if_body_2(variable_map: dict[str, Any]): - distances = variable_map["distances"] - min_dist = min(distances, key=lambda x: x[0])[0] - variable_map["res"] = [hotel for dist, hotel in distances if dist == min_dist] - - -def 
get_recs_elif_cond(variable_map: dict[str, Any]): - return variable_map["requirement"] == "price" - - -# list comprehension entry -def get_recs_elif_body_0(variable_map: dict[str, Any]): - pass - - -# list comprehension body -def get_recs_elif_body_1(variable_map: dict[str, Any]): - return (variable_map["hotel_price"], variable_map["hotel_key"]) - -# after list comprehension -def get_recs_elif_body_2(variable_map: dict[str, Any]): - prices = variable_map["prices"] - min_price = min(prices, key=lambda x: x[0])[0] - variable_map["res"] = [hotel for price, hotel in prices if price == min_price] - - - -# a future optimization might instead duplicate this piece of code over the two -# branches, in order to reduce the number of splits by one -def get_recs_final(variable_map: dict[str, Any]): - return variable_map["res"] - - -recommend_op = StatelessOperator({ - "get_recs_if_cond": get_recs_if_cond, - "get_recs_if_body_0": get_recs_if_body_0, - "get_recs_if_body_1": get_recs_if_body_1, - "get_recs_if_body_2": get_recs_if_body_2, - "get_recs_elif_cond": get_recs_elif_cond, - "get_recs_elif_body_0": get_recs_elif_body_0, - "get_recs_elif_body_1": get_recs_elif_body_1, - "get_recs_elif_body_2": get_recs_elif_body_2, - "get_recs_final": get_recs_final, -}, None) - -def get_recommendations_df(): - df = DataFlow("get_recommendations") - n1 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_if_cond"), is_conditional=True) - n2 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_if_body_0")) - n3 = OpNode(Hotel, InvokeMethod("get_geo"), assign_result_to="hotel_geo", read_key_from="hotel_key") - n4 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_if_body_1"), assign_result_to="distance") - n5 = CollectNode("distances", "distance") - n6 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_if_body_2")) - ns1 = SelectAllNode(Hotel, n5, assign_key_to="hotel_key") - - n7 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_elif_cond"), is_conditional=True) - n8 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_elif_body_0")) - n9 = OpNode(Hotel, InvokeMethod("get_price"), assign_result_to="hotel_price", read_key_from="hotel_key") - n10 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_elif_body_1"), assign_result_to="price") - n11 = CollectNode("prices", "price") - n12 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_elif_body_2")) - ns2 = SelectAllNode(Hotel, n11, assign_key_to="hotel_key") - - - n13 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_final")) - - df.add_edge(Edge(n1, ns1, if_conditional=True)) - df.add_edge(Edge(n1, n7, if_conditional=False)) - df.add_edge(Edge(n7, ns2, if_conditional=True)) - df.add_edge(Edge(n7, n13, if_conditional=False)) - - # if branch - df.add_edge(Edge(ns1, n2)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - df.add_edge(Edge(n4, n5)) - df.add_edge(Edge(n5, n6)) - df.add_edge(Edge(n6, n13)) - - # elif branch - df.add_edge(Edge(ns2, n8)) - df.add_edge(Edge(n8, n9)) - df.add_edge(Edge(n9, n10)) - df.add_edge(Edge(n10, n11)) - df.add_edge(Edge(n11, n12)) - df.add_edge(Edge(n12, n13)) - - df.entry = n1 - return df - -recommend_op.dataflow = get_recommendations_df() \ No newline at end of file diff --git a/test_programs/expected/deathstar_search.py b/test_programs/expected/deathstar_search.py deleted file mode 100644 index cd20593..0000000 --- a/test_programs/expected/deathstar_search.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import Any - -from cascade.dataflow.dataflow import CollectNode, DataFlow, Edge, InvokeMethod, 
OpNode, SelectAllNode, StatelessOpNode -from cascade.dataflow.operator import StatelessOperator - -# predicate 1 -def search_nearby_compiled_0(variable_map: dict[str, Any]): - pass - -# predicate 2 -def search_nearby_compiled_1(variable_map: dict[str, Any]): - hotel_geo: Geo = variable_map["hotel_geo"] - lat, lon = variable_map["lat"], variable_map["lon"] - dist = hotel_geo.distance_km(lat, lon) - variable_map["dist"] = dist - return dist < 10 - - -# body -def search_nearby_compiled_2(variable_map: dict[str, Any]): - return (variable_map["dist"], variable_map["hotel_key"]) - -# next line -def search_nearby_compiled_3(variable_map: dict[str, Any]): - distances = variable_map["distances"] - hotels = [hotel for dist, hotel in sorted(distances)[:5]] - return hotels - - -search_op = StatelessOperator({ - "search_nearby_compiled_0": search_nearby_compiled_0, - "search_nearby_compiled_1": search_nearby_compiled_1, - "search_nearby_compiled_2": search_nearby_compiled_2, - "search_nearby_compiled_3": search_nearby_compiled_3, -}, None) - -def search_nearby_df(): - df = DataFlow("search_nearby") - n1 = StatelessOpNode(search_op, InvokeMethod("search_nearby_compiled_0")) - n2 = OpNode(Hotel, InvokeMethod("get_geo"), assign_result_to="hotel_geo", read_key_from="hotel_key") - n3 = StatelessOpNode(search_op, InvokeMethod("search_nearby_compiled_1"), is_conditional=True) - n4 = StatelessOpNode(search_op, InvokeMethod("search_nearby_compiled_2"), assign_result_to="search_body") - n5 = CollectNode("distances", "search_body") - n0 = SelectAllNode(Hotel, n5, assign_key_to="hotel_key") - - n6 = StatelessOpNode(search_op, InvokeMethod("search_nearby_compiled_3")) - - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n1, n2)) - df.add_edge(Edge(n2, n3)) - - # if true make the body - df.add_edge(Edge(n3, n4, if_conditional=True)) - df.add_edge(Edge(n4, n5)) - # if false skip past - df.add_edge(Edge(n3, n5, if_conditional=False)) - - df.add_edge(Edge(n5, n6)) - - df.entry = n0 - return df - -search_op.dataflow = search_nearby_df() \ No newline at end of file diff --git a/test_programs/expected/deathstar_user.py b/test_programs/expected/deathstar_user.py deleted file mode 100644 index 64985ea..0000000 --- a/test_programs/expected/deathstar_user.py +++ /dev/null @@ -1,57 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode -from cascade.dataflow.operator import StatefulOperator - -def order_compiled_entry_0(variable_map: dict[str, Any], state: User) -> Any: - pass - -def order_compiled_entry_1(variable_map: dict[str, Any], state: User) -> Any: - pass - -def order_compiled_if_cond(variable_map: dict[str, Any], state: User) -> Any: - return variable_map["hotel_reserve"] and variable_map["flight_reserve"] - -def order_compiled_if_body(variable_map: dict[str, Any], state: User) -> Any: - return True - -def order_compiled_else_body(variable_map: dict[str, Any], state: User) -> Any: - return False - -user_op = StatefulOperator( - User, - { - "order_compiled_entry_0": order_compiled_entry_0, - "order_compiled_entry_1": order_compiled_entry_1, - "order_compiled_if_cond": order_compiled_if_cond, - "order_compiled_if_body": order_compiled_if_body, - "order_compiled_else_body": order_compiled_else_body - }, - {} -) - -# For now, the dataflow will be serial instead of parallel (calling hotel, then -# flight). Future optimizations could try to automatically parallelize this. 
-# There could definetly be some slight changes to this dataflow depending on -# other optimizations aswell. (A naive system could have an empty first entry -# before the first entity call). -def user_order_df(): - df = DataFlow("user_order") - n0 = OpNode(User, InvokeMethod("order_compiled_entry_0"), read_key_from="user_key") - n1 = OpNode(Hotel, InvokeMethod("reserve"), assign_result_to="hotel_reserve", read_key_from="hotel_key") - n2 = OpNode(User, InvokeMethod("order_compiled_entry_1"), read_key_from="user_key") - n3 = OpNode(Flight, InvokeMethod("reserve"), assign_result_to="flight_reserve", read_key_from="flight_key") - n4 = OpNode(User, InvokeMethod("order_compiled_if_cond"), is_conditional=True, read_key_from="user_key") - n5 = OpNode(User, InvokeMethod("order_compiled_if_body"), read_key_from="user_key") - n6 = OpNode(User, InvokeMethod("order_compiled_else_body"), read_key_from="user_key") - - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n1, n2)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - df.add_edge(Edge(n4, n5, if_conditional=True)) - df.add_edge(Edge(n4, n6, if_conditional=False)) - - df.entry = n0 - return df - -user_op.dataflows["order"] = user_order_df() diff --git a/test_programs/target/checkout_two_items.py b/test_programs/target/checkout_two_items.py deleted file mode 100644 index f6f6278..0000000 --- a/test_programs/target/checkout_two_items.py +++ /dev/null @@ -1,23 +0,0 @@ -import cascade - -@cascade.cascade -class User: - def __init__(self, key: str, balance: int): - self.key: str = key - self.balance: int = balance - - def buy_two_items(self, item_1: 'Item', item_2: 'Item') -> bool: - item_price_1 = item_1.get_price() - item_price_2 = item_2.get_price() - total_price = item_price_1 + item_price_2 - self.balance -= total_price - return self.balance >= 0 - -@cascade.cascade -class Item: - def __init__(self, key: str, price: int): - self.key: str = key - self.price: int = price - - def get_price(self) -> int: - return self.price \ No newline at end of file diff --git a/test_programs/target/deathstar_recommendation.py b/test_programs/target/deathstar_recommendation.py deleted file mode 100644 index 5d6f12d..0000000 --- a/test_programs/target/deathstar_recommendation.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Literal -import cascade - -# Stateless -@cascade.cascade -class Recommendation(): - @staticmethod - def get_recommendations(requirement: Literal["distance", "price"], lat: float, lon: float) -> list[Hotel]: - if requirement == "distance": - distances = [(hotel.geo.distance_km(lat, lon), hotel) - for hotel in Hotel.__all__()] - min_dist = min(distances, key=lambda x: x[0]) - res = [hotel for dist, hotel in distances if dist == min_dist] - elif requirement == "price": - prices = [(hotel.price, hotel) - for hotel in Hotel.__all__()] - min_price = min(prices, key=lambda x: x[0]) - res = [hotel for rate, hotel in prices if rate == min_price] - - return res \ No newline at end of file diff --git a/test_programs/target/deathstar_search.py b/test_programs/target/deathstar_search.py deleted file mode 100644 index 845e709..0000000 --- a/test_programs/target/deathstar_search.py +++ /dev/null @@ -1,14 +0,0 @@ -import cascade - -# Stateless -@cascade.cascade -class Search(): - # Get the 5 nearest hotels - @staticmethod - def nearby(lat: float, lon: float, in_date: int, out_date: int): - distances = [ - (dist, hotel) - for hotel in Hotel.__all__() - if (dist := hotel.geo.distance_km(lat, lon)) < 10] - hotels = [hotel for dist, hotel in 
sorted(distances)[:5]] - return hotels \ No newline at end of file diff --git a/test_programs/target/deathstar_user.py b/test_programs/target/deathstar_user.py deleted file mode 100644 index dd87723..0000000 --- a/test_programs/target/deathstar_user.py +++ /dev/null @@ -1,16 +0,0 @@ -import cascade - -@cascade.cascade -class User(): - def __init__(self, user_id: str, password: str): - self.id = user_id - self.password = password - - def check(self, password): - return self.password == password - - def order(self, flight: Flight, hotel: Hotel): - if hotel.reserve() and flight.reserve(): - return True - else: - return False \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py index eeebf60..be98e13 100644 --- a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py +++ b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py @@ -1,7 +1,5 @@ from textwrap import dedent -import networkx as nx - from klara.core.cfg import Cfg from klara.core import nodes @@ -49,3 +47,24 @@ def buy_item(self, item: 'Item') -> bool: (buy_item_body_0, buy_item_body_1) ] assert_expected_edges(df, expected_edges) + + +def test_ssa(): + program: str = dedent(""" + class Test: + + def get_total(item1: Stock, item2: Stock): + total = Adder.add(item1.get_quantity(), item2.get_quantity()) + return total""") + + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + # TODO: check that the produced ssa code made variables for + # - item1.get_quantity() + # - item2.get_quantity() + df: StatementDataflowGraph = DataflowGraphBuilder.build([get_total] + get_total.body) + print(df.graph.nodes) + print(df.graph.edges) diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py new file mode 100644 index 0000000..a363f22 --- /dev/null +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -0,0 +1,61 @@ +from textwrap import dedent + +import networkx as nx + +from klara.core.cfg import Cfg +from klara.core import nodes + +from cascade.dataflow.dataflow import DataFlow, DataflowRef +from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor +from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder +from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions, GroupStatements +from cascade.frontend.generator.split_function import SplitFunction2, to_entity_call +from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph +from cascade.frontend.util import setup_cfg + +def test_call_entity(): + program: str = dedent(""" + class Test: + + def get_total(item1: Stock, item2: Stock): + a = item1.get_quantity() + b = item2.get_quantity() + return a+b""") + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = GroupStatements(get_total) + sf.build_cfg() + + dataflows = { + DataflowRef("Test", "get_total"): DataFlow("get_total", "Test", ["item1", "item2"]), + DataflowRef("Stock", "get_quantity"): DataFlow("get_quantity", "Stock", []) + } + + df, blocks = sf.build(dataflows, "Test") + print(df.to_dot()) + print(blocks) + +def test_simple_block(): + program: str = dedent(""" + class Test: + + def 
add(x: int, y: int): + return x+y""") + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = GroupStatements(get_total) + + dataflows = { + DataflowRef("Test", "add"): DataFlow("get_total", "Test", ["x", "y"]), + } + + df, blocks = sf.build(dataflows, "Test") + + assert len(blocks) == 1 + assert blocks[0].call({"x_0": 3, "y_0":5 }, None) == 8 diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py b/tests/frontend/dataflow_analysis/test_split_functions.py new file mode 100644 index 0000000..961e580 --- /dev/null +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -0,0 +1,72 @@ +from textwrap import dedent + +import networkx as nx + +from klara.core.cfg import Cfg +from klara.core import nodes + +from cascade.dataflow.dataflow import DataFlow, DataflowRef +from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor +from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder +from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions, GroupStatements +from cascade.frontend.generator.split_function import SplitFunction2, to_entity_call +from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph +from cascade.frontend.util import setup_cfg + +def test_split_functions(): + program: str = dedent(""" + class Test: + + def get_total(item1: Stock, item2: Stock, y: int): + a = 10 + b = a + 3 + x = item1.get_quantity() + y = item2.get_quantity() + total = Adder.add(x, y) + total = total + a + b + total = total - 23 + return total""") + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + + sf = GroupStatements(get_total) + sf.build_cfg() + + dataflows = { + DataflowRef("Adder", "add"): DataFlow("add", "Adder", ["a", "b"]), + DataflowRef("Stock", "get_quantity"): DataFlow("get_quantity", "Item", []) + } + + + # TODO: Check + statements = sf.generate_grouped_statements() + + df, blocks = sf.build(dataflows, "Test") + print(df.to_dot()) + print(blocks) + + + +# [ +# Statement(block_num=0, block=Function get_total in scope Class "Test" in scope Module, targets=[item1_0, item2_0], values=[item1_0, item2_0], remote_call=False, attribute=None), +# Statement(block_num=1, block=Assign: (a_0,) = 10, targets=[a_0], values=[], remote_call=False, attribute=None), +# Statement(block_num=2, block=Assign: (b_0,) = BinOp: a_0 + 3, targets=[b_0], values=[a_0], remote_call=False, attribute=None), +# Statement(block_num=3, block=Assign: (x_0,) = Call: item1_0.get_quantity(()), targets=[x_0], values=[item1_0], remote_call=True, attribute=item1_0.get_quantity), +# Statement(block_num=4, block=Assign: (y_0,) = Call: item2_0.get_quantity(()), targets=[y_0], values=[item2_0], remote_call=True, attribute=item2_0.get_quantity), +# Statement(block_num=5, block=Assign: (total_0,) = Call: Adder.add((x_0, y_0)), targets=[total_0], values=[Adder, x_0, y_0], remote_call=True, attribute=Adder.add), +# Statement(block_num=6, block=, targets=[total_1], values=[a_0, b_0], remote_call=False, attribute=None), +# Statement(block_num=7, block=, targets=[total_2], values=[], remote_call=False, attribute=None), +# Statement(block_num=8, block=, targets=[], values=[total_2], remote_call=False, attribute=None)] + +# [ +# (0, 3), +# (0, 4), +# (3, 5), +# (4, 5), 
+# (1, 2), +# (1, 6), +# (2, 6), +# (7, 8)] \ No newline at end of file diff --git a/tests/integration/flink-runtime/test_select_all.py b/tests/integration/flink-runtime/test_select_all.py index 602858d..9e0360d 100644 --- a/tests/integration/flink-runtime/test_select_all.py +++ b/tests/integration/flink-runtime/test_select_all.py @@ -1,155 +1,155 @@ -""" -The select all operator is used to fetch all keys for a single entity -""" -import math -import random -from dataclasses import dataclass -from typing import Any - -from pyflink.datastream.data_stream import CloseableIterator - -from cascade.dataflow.dataflow import CollectNode, DataFlow, Edge, Event, EventResult, InitClass, InvokeMethod, OpNode, SelectAllNode, StatelessOpNode -from cascade.dataflow.operator import StatefulOperator, StatelessOperator -from cascade.runtime.flink_runtime import FlinkOperator, FlinkRuntime, FlinkStatelessOperator -import time -import pytest - -@dataclass -class Geo: - x: int - y: int - -class Hotel: - def __init__(self, name: str, loc: Geo): - self.name = name - self.loc = loc - - def get_name(self) -> str: - return self.name +# """ +# The select all operator is used to fetch all keys for a single entity +# """ +# import math +# import random +# from dataclasses import dataclass +# from typing import Any + +# from pyflink.datastream.data_stream import CloseableIterator + +# from cascade.dataflow.dataflow import CollectNode, DataFlow, Edge, Event, EventResult, InitClass, InvokeMethod, OpNode, SelectAllNode, StatelessOpNode +# from cascade.dataflow.operator import StatefulOperator, StatelessOperator +# from cascade.runtime.flink_runtime import FlinkOperator, FlinkRuntime, FlinkStatelessOperator +# import time +# import pytest + +# @dataclass +# class Geo: +# x: int +# y: int + +# class Hotel: +# def __init__(self, name: str, loc: Geo): +# self.name = name +# self.loc = loc + +# def get_name(self) -> str: +# return self.name - def distance(self, loc: Geo) -> float: - return math.sqrt((self.loc.x - loc.x) ** 2 + (self.loc.y - loc.y) ** 2) +# def distance(self, loc: Geo) -> float: +# return math.sqrt((self.loc.x - loc.x) ** 2 + (self.loc.y - loc.y) ** 2) - def __repr__(self) -> str: - return f"Hotel({self.name}, {self.loc})" - - -def distance_compiled(variable_map: dict[str, Any], state: Hotel) -> Any: - loc = variable_map["loc"] - return math.sqrt((state.loc.x - loc.x) ** 2 + (state.loc.y - loc.y) ** 2) - -def get_name_compiled(variable_map: dict[str, Any], state: Hotel) -> Any: - return state.name - -hotel_op = StatefulOperator(Hotel, - {"distance": distance_compiled, - "get_name": get_name_compiled}, {}) - - - -def get_nearby(hotels: list[Hotel], loc: Geo, dist: float): - return [hotel.get_name() for hotel in hotels if hotel.distance(loc) < dist] - - -# We compile just the predicate, the select is implemented using a selectall node -def get_nearby_predicate_compiled_0(variable_map: dict[str, Any]): - pass - -def get_nearby_predicate_compiled_1(variable_map: dict[str, Any]) -> bool: - loc = variable_map["loc"] - dist = variable_map["dist"] - hotel_dist = variable_map["hotel_distance"] - return hotel_dist < dist - -def get_nearby_body_compiled_0(variable_map: dict[str, Any]): - pass - -def get_nearby_body_compiled_1(variable_map: dict[str, Any]) -> str: - return variable_map["hotel_name"] - -get_nearby_op = StatelessOperator({ - "get_nearby_predicate_compiled_0": get_nearby_predicate_compiled_0, - "get_nearby_predicate_compiled_1": get_nearby_predicate_compiled_1, - "get_nearby_body_compiled_0": 
get_nearby_body_compiled_0, - "get_nearby_body_compiled_1": get_nearby_body_compiled_1, -}, None) - -# dataflow for getting all hotels within region -df = DataFlow("get_nearby") -n7 = CollectNode("get_nearby_result", "get_nearby_body") -n0 = SelectAllNode(Hotel, n7, assign_key_to="hotel_key") -n1 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_predicate_compiled_0")) -n2 = OpNode(Hotel, InvokeMethod("distance"), assign_result_to="hotel_distance", read_key_from="hotel_key") -n3 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_predicate_compiled_1"), is_conditional=True) -n4 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_body_compiled_0")) -n5 = OpNode(Hotel, InvokeMethod("get_name"), assign_result_to="hotel_name", read_key_from="hotel_key") -n6 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_body_compiled_1"), assign_result_to="get_nearby_body") - -df.add_edge(Edge(n0, n1)) -df.add_edge(Edge(n1, n2)) -df.add_edge(Edge(n2, n3)) -df.add_edge(Edge(n3, n4, if_conditional=True)) -df.add_edge(Edge(n3, n7, if_conditional=False)) -df.add_edge(Edge(n4, n5)) -df.add_edge(Edge(n5, n6)) -df.add_edge(Edge(n6, n7)) -get_nearby_op.dataflow = df - -@pytest.mark.integration -def test_nearby_hotels(): - runtime = FlinkRuntime("test_nearby_hotels") - runtime.init() - runtime.add_operator(hotel_op) - runtime.add_stateless_operator(get_nearby_op) - - # Create Hotels - hotels = [] - init_hotel = OpNode(Hotel, InitClass(), read_key_from="name") - random.seed(42) - for i in range(20): - coord_x = random.randint(-10, 10) - coord_y = random.randint(-10, 10) - hotel = Hotel(f"h_{i}", Geo(coord_x, coord_y)) - event = Event(init_hotel, {"name": hotel.name, "loc": hotel.loc}, None) - runtime.send(event) - hotels.append(hotel) - - collected_iterator: CloseableIterator = runtime.run(run_async=True, output='collect') - records = [] - def wait_for_event_id(id: int) -> EventResult: - for record in collected_iterator: - records.append(record) - print(f"Collected record: {record}") - if record.event_id == id: - return record +# def __repr__(self) -> str: +# return f"Hotel({self.name}, {self.loc})" + + +# def distance_compiled(variable_map: dict[str, Any], state: Hotel) -> Any: +# loc = variable_map["loc"] +# return math.sqrt((state.loc.x - loc.x) ** 2 + (state.loc.y - loc.y) ** 2) + +# def get_name_compiled(variable_map: dict[str, Any], state: Hotel) -> Any: +# return state.name + +# hotel_op = StatefulOperator(Hotel, +# {"distance": distance_compiled, +# "get_name": get_name_compiled}, {}) + + + +# def get_nearby(hotels: list[Hotel], loc: Geo, dist: float): +# return [hotel.get_name() for hotel in hotels if hotel.distance(loc) < dist] + + +# # We compile just the predicate, the select is implemented using a selectall node +# def get_nearby_predicate_compiled_0(variable_map: dict[str, Any]): +# pass + +# def get_nearby_predicate_compiled_1(variable_map: dict[str, Any]) -> bool: +# loc = variable_map["loc"] +# dist = variable_map["dist"] +# hotel_dist = variable_map["hotel_distance"] +# return hotel_dist < dist + +# def get_nearby_body_compiled_0(variable_map: dict[str, Any]): +# pass + +# def get_nearby_body_compiled_1(variable_map: dict[str, Any]) -> str: +# return variable_map["hotel_name"] + +# get_nearby_op = StatelessOperator({ +# "get_nearby_predicate_compiled_0": get_nearby_predicate_compiled_0, +# "get_nearby_predicate_compiled_1": get_nearby_predicate_compiled_1, +# "get_nearby_body_compiled_0": get_nearby_body_compiled_0, +# "get_nearby_body_compiled_1": 
get_nearby_body_compiled_1, +# }, None) + +# # dataflow for getting all hotels within region +# df = DataFlow("get_nearby") +# n7 = CollectNode("get_nearby_result", "get_nearby_body") +# n0 = SelectAllNode(Hotel, n7, assign_key_to="hotel_key") +# n1 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_predicate_compiled_0")) +# n2 = OpNode(Hotel, InvokeMethod("distance"), assign_result_to="hotel_distance", read_key_from="hotel_key") +# n3 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_predicate_compiled_1"), is_conditional=True) +# n4 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_body_compiled_0")) +# n5 = OpNode(Hotel, InvokeMethod("get_name"), assign_result_to="hotel_name", read_key_from="hotel_key") +# n6 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_body_compiled_1"), assign_result_to="get_nearby_body") + +# df.add_edge(Edge(n0, n1)) +# df.add_edge(Edge(n1, n2)) +# df.add_edge(Edge(n2, n3)) +# df.add_edge(Edge(n3, n4, if_conditional=True)) +# df.add_edge(Edge(n3, n7, if_conditional=False)) +# df.add_edge(Edge(n4, n5)) +# df.add_edge(Edge(n5, n6)) +# df.add_edge(Edge(n6, n7)) +# get_nearby_op.dataflow = df + +# @pytest.mark.integration +# def test_nearby_hotels(): +# runtime = FlinkRuntime("test_nearby_hotels") +# runtime.init() +# runtime.add_operator(hotel_op) +# runtime.add_stateless_operator(get_nearby_op) + +# # Create Hotels +# hotels = [] +# init_hotel = OpNode(Hotel, InitClass(), read_key_from="name") +# random.seed(42) +# for i in range(20): +# coord_x = random.randint(-10, 10) +# coord_y = random.randint(-10, 10) +# hotel = Hotel(f"h_{i}", Geo(coord_x, coord_y)) +# event = Event(init_hotel, {"name": hotel.name, "loc": hotel.loc}, None) +# runtime.send(event) +# hotels.append(hotel) + +# collected_iterator: CloseableIterator = runtime.run(run_async=True, output='collect') +# records = [] +# def wait_for_event_id(id: int) -> EventResult: +# for record in collected_iterator: +# records.append(record) +# print(f"Collected record: {record}") +# if record.event_id == id: +# return record - def wait_for_n_records(num: int) -> list[EventResult]: - i = 0 - n_records = [] - for record in collected_iterator: - i += 1 - records.append(record) - n_records.append(record) - print(f"Collected record: {record}") - if i == num: - return n_records - - print("creating hotels") - # Wait for hotels to be created - wait_for_n_records(20) - time.sleep(10) # wait for all hotels to be registered - - dist = 5 - loc = Geo(0, 0) - event = Event(n0, {"loc": loc, "dist": dist}, df) - runtime.send(event, flush=True) +# def wait_for_n_records(num: int) -> list[EventResult]: +# i = 0 +# n_records = [] +# for record in collected_iterator: +# i += 1 +# records.append(record) +# n_records.append(record) +# print(f"Collected record: {record}") +# if i == num: +# return n_records + +# print("creating hotels") +# # Wait for hotels to be created +# wait_for_n_records(20) +# time.sleep(10) # wait for all hotels to be registered + +# dist = 5 +# loc = Geo(0, 0) +# event = Event(n0, {"loc": loc, "dist": dist}, df) +# runtime.send(event, flush=True) - nearby = [] - for hotel in hotels: - if hotel.distance(loc) < dist: - nearby.append(hotel.name) - - event_result = wait_for_event_id(event._id) - results = [r for r in event_result.result if r != None] - print(nearby) - assert set(results) == set(nearby) \ No newline at end of file +# nearby = [] +# for hotel in hotels: +# if hotel.distance(loc) < dist: +# nearby.append(hotel.name) + +# event_result = wait_for_event_id(event._id) +# 
results = [r for r in event_result.result if r != None] +# print(nearby) +# assert set(results) == set(nearby) \ No newline at end of file diff --git a/tests/optimizations/test_parallelize.py b/tests/optimizations/test_parallelize.py new file mode 100644 index 0000000..0ab097a --- /dev/null +++ b/tests/optimizations/test_parallelize.py @@ -0,0 +1,232 @@ + +import os +import sys + + +# import cascade +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../src"))) + + +from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime +from dataclasses import dataclass +from typing import Any +from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, DataFlow, DataflowRef, Edge, Event, InitClass, InvokeMethod, Node, OpNode, StatelessOpNode +from cascade.dataflow.operator import Block, StatefulOperator, StatelessOperator + +class Stock: + def __init__(self, item: str, quantity: int): + self.item = item + self.quantity = quantity + + def get_quantity(self): + return self.quantity + +def get_quantity_compiled_0(variable_map: dict[str, Any], state: Stock) -> Any: + return state.quantity + +stock_op = StatefulOperator( + Stock, + { + "get_quantity_compiled_0": Block(function_call=get_quantity_compiled_0, var_map_writes=[], var_map_reads=[], name="get_quantity_compiled") + }, + {}, + keyby="item" +) + +def stock_op_df(): + df = DataFlow("get_quantity", "Stock") + n0 = CallLocal(InvokeMethod("get_quantity_compiled_0")) + df.entry = [n0] + return df + +def stock_op_init_df(): + df = DataFlow("__init__", "Stock") + n0 = CallLocal(InitClass()) + df.entry = [n0] + return df + +stock_op.dataflows["get_quantity"] = stock_op_df() +stock_op.dataflows["__init__"] = stock_op_init_df() + + +class Adder: + @staticmethod + def add(a, b): + return a + b + +def add_compiled_0(variable_map: dict[str, Any]) -> Any: + return variable_map["a"] + variable_map["b"] + +adder_op = StatelessOperator( + Adder, + { + "add_compiled_0": Block(function_call=add_compiled_0, var_map_reads=["a", "b"], var_map_writes=[], name="add_compiled_0") + }, + {} +) + +def adder_df(): + df = DataFlow("add", "Adder") + n0 = CallLocal(InvokeMethod("add_compiled_0")) + df.entry = [n0] + return df + +adder_op.dataflows["add"] = adder_df() + + +class Test: + @staticmethod + def get_total(item1: Stock, item2: Stock): + x = item1.get_quantity() + y = item2.get_quantity() + total_adder = Adder.add(x, y) + total = x + y + assert total == total_adder + return total + +def get_total_compiled_0(variable_map): + total = variable_map["x"] + variable_map["y"] + assert total == variable_map["total_adder"] + return total + +def test_parallelize(): + test_op = StatelessOperator( + Test, + { + "get_total_compiled_0": Block( + function_call=get_total_compiled_0, + var_map_writes=[], + var_map_reads=["x", "y", "total_adder"], + name="get_total_compiled_0") + }, + {} + ) + + df = DataFlow("get_total", "Test") + n0 = CallEntity(DataflowRef("get_quantity", "Stock"), {"item": "item1"}, assign_result_to="x") + n1 = CallEntity(DataflowRef("get_quantity", "Stock"), {"item": "item2"}, assign_result_to="y") + n2 = CallEntity(DataflowRef("add", "Adder"), {"a": "x", "b": "y"}, assign_result_to="total_adder") + n3 = CallLocal(InvokeMethod("get_total_compiled_0")) + df.add_edge(Edge(n0, n1)) + df.add_edge(Edge(n1, n2)) + df.add_edge(Edge(n2, n3)) + + df.entry = [n0] + test_op.dataflows[df.name] = df + print(df) + print(df.nodes) + + df = parallelize(test_op.dataflows[df.name]) + df.name = "get_total_parallel" + 
test_op.dataflows[df.name] = df + + runtime = PythonRuntime() + runtime.add_stateless_operator(test_op) + runtime.add_stateless_operator(adder_op) + runtime.add_operator(stock_op) + runtime.run() + + client = PythonClientSync(runtime) + + event = stock_op.dataflows["__init__"].generate_event({"item": "fork", "quantity": 10}) + result = client.send(event) + + + event = stock_op.dataflows["__init__"].generate_event({"item": "spoon", "quantity": 20}) + result = client.send(event) + + event = test_op.dataflows["get_total"].generate_event({"item1": "fork", "item2": "spoon"}) + result = client.send(event) + assert result == 30 + + event = test_op.dataflows["get_total_parallel"].generate_event({"item1": "fork", "item2": "spoon"}) + result = client.send(event) + print(result) + assert result == 30 + +@dataclass +class AnnotatedNode: + node: Node + reads: list[str] + writes: list[str] + +import networkx as nx +def parallelize(df: DataFlow): + # create the dependency graph + ans = [] + # since we use SSA, every variable has exactly one node that writes it + write_nodes = {} + graph = nx.DiGraph() + for node in df.nodes.values(): + if isinstance(node, CallEntity): + reads = list(node.variable_rename.values()) + writes = [result] if (result := node.assign_result_to) else [] + elif isinstance(node, CallLocal): + method = df.get_operator().methods[node.method.method_name] + reads = method.var_map_reads + writes = method.var_map_writes + else: + raise ValueError(f"unsupported node type: {type(node)}") + + write_nodes.update({var: node.id for var in writes}) + + ans.append(AnnotatedNode(node, reads, writes)) + graph.add_node(node.id) + + nodes_with_indegree_0 = set(graph.nodes) + n_map = df.nodes + for node in ans: + for read in node.reads: + print(read) + if read in write_nodes: + # "read" will not be in write nodes if it is part of the arguments + # a more thorough implementation would not need the if check, + # and add the arguments as writes to some function entry node + graph.add_edge(write_nodes[read], node.node.id) + try: + nodes_with_indegree_0.remove(node.node.id) + except KeyError: + pass + + updated = DataFlow(df.name) + updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0] + prev_node = None + print(nodes_with_indegree_0) + + while len(nodes_with_indegree_0) > 0: + # remove nodes from graph + children = [] + for node_id in nodes_with_indegree_0: + children.extend(graph.successors(node_id)) + graph.remove_node(node_id) + updated.add_node(n_map[node_id]) + + + # check for new indegree 0 nodes + next_nodes = set() + for child in children: + if graph.in_degree(child) == 0: + next_nodes.add(child) + + if len(nodes_with_indegree_0) > 1: + # TODO: maybe collect node should just infer from it's predecessors? 
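The dependency-graph construction above boils down to a standard layering of a DAG: repeatedly peel off the nodes with in-degree 0, and each layer contains calls with no data dependencies between them. A minimal standalone illustration (node names are made up, not from this codebase):

import networkx as nx

g = nx.DiGraph()
# writer -> reader edges, as derived from the SSA read/write sets
g.add_edges_from([
    ("get_x", "add"), ("get_y", "add"),
    ("get_x", "final"), ("get_y", "final"), ("add", "final"),
])

layers = []
frontier = {n for n in g.nodes if g.in_degree(n) == 0}
while frontier:
    layers.append(sorted(frontier))
    children = set()
    for n in frontier:
        children.update(g.successors(n))
        g.remove_node(n)
    frontier = {c for c in children if g.in_degree(c) == 0}

print(layers)  # [['get_x', 'get_y'], ['add'], ['final']]
# recent networkx versions offer nx.topological_generations for the same grouping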
+ # like it can only have DataFlowNode predecessors + # TODO: rename DataflowNode to EntityCall + collect_node = CollectNode() + for node_id in nodes_with_indegree_0: + if prev_node: + updated.add_edge(Edge(prev_node, n_map[node_id])) + updated.add_edge(Edge(n_map[node_id], collect_node)) + prev_node = collect_node + else: + node_id = nodes_with_indegree_0.pop() + if prev_node: + updated.add_edge(Edge(prev_node, n_map[node_id])) + + prev_node = n_map[node_id] + + nodes_with_indegree_0 = next_nodes + + print(df.to_dot()) + print(updated.to_dot()) + return updated From 76e261b72487bf2e28fd055c43ae2ca64512a73f Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 1 Apr 2025 15:03:30 +0200 Subject: [PATCH 09/37] Add deathstar benchmark test for new IR --- deathstar_movie_review/entities/entities.py | 91 ++++++++++++ deathstar_movie_review/entities/frontend.py | 1 - deathstar_movie_review/entities/unique_id.py | 1 - .../test_movie_review_demo.py | 94 +++++++------ src/cascade/core.py | 41 ++++-- src/cascade/dataflow/dataflow.py | 95 ++++++++----- src/cascade/dataflow/operator.py | 11 +- .../dataflow/optimization/dead_node_elim.py | 3 + .../optimization/test_dead_node_elim.py | 2 +- src/cascade/dataflow/test_dataflow.py | 132 ------------------ .../frontend/ast_visitors/replace_name.py | 43 ++++-- .../generator/generate_split_functions.py | 7 +- .../frontend/generator/split_function.py | 10 +- src/cascade/runtime/python_runtime.py | 33 +++-- test_programs/expected/__init__.py | 0 test_programs/expected/checkout_item.py | 33 ----- test_programs/target/checkout_item.py | 23 +-- test_programs/target/operator_chaining.py | 40 ++++++ .../test_dataflow_graph_builder.py | 41 ------ .../dataflow_analysis/test_entities.py | 98 ++++++++++++- .../dataflow_analysis/test_split_functions.py | 3 +- tests/integration/flink-runtime/common.py | 4 +- tests/optimizations/test_ops.py | 25 ++++ tests/optimizations/test_parallelize.py | 127 +++-------------- tests/programs/test_programs.py | 102 +++++++++++++- 25 files changed, 602 insertions(+), 458 deletions(-) create mode 100644 deathstar_movie_review/entities/entities.py delete mode 100644 src/cascade/dataflow/test_dataflow.py delete mode 100644 test_programs/expected/__init__.py delete mode 100644 test_programs/expected/checkout_item.py create mode 100644 test_programs/target/operator_chaining.py create mode 100644 tests/optimizations/test_ops.py diff --git a/deathstar_movie_review/entities/entities.py b/deathstar_movie_review/entities/entities.py new file mode 100644 index 0000000..f582073 --- /dev/null +++ b/deathstar_movie_review/entities/entities.py @@ -0,0 +1,91 @@ +import uuid +from cascade import cascade + +@cascade +class ComposeReview: + def __init__(self, req_id: str, *args): # *args is a temporary hack to allow for creation of composereview on the fly + self.req_id = req_id + self.review_data = {} + + def upload_unique_id(self, review_id: int): + self.review_data["review_id"] = review_id + + # could use the User class instead? 
+ def upload_user_id(self, user_id: str): + self.review_data["userId"] = user_id + + def upload_movie_id(self, movie_id: str): + self.review_data["movieId"] = movie_id + + def upload_rating(self, rating: int): + self.review_data["rating"] = rating + + def upload_text(self, text: str): + self.review_data["text"] = text + + def get_data(self): + x = self.review_data + return x + +@cascade +class User: + def __init__(self, username: str, user_data: dict): + self.username = username + self.user_data = user_data + + def upload_user(self, review: ComposeReview): + user_id = self.user_data["userId"] + review.upload_user_id(user_id) + +@cascade +class MovieId: + # key: 'title' + def __init__(self, title: str, movie_id: str): + self.title = title + self.movie_id = movie_id + + def upload_movie(self, review: ComposeReview, rating: int): + # if self.movie_id is not None: + # review.upload_movie_id(self.movie_id) + # else: + # review.upload_rating(rating) + movie_id = self.movie_id + review.upload_movie_id(movie_id) + +@cascade +class Frontend(): + @staticmethod + def compose(review: ComposeReview, user: User, title: MovieId, rating: int, text: str): + UniqueId.upload_unique_id_2(review) + user.upload_user(review) + title.upload_movie(review, rating) + # text = text[:CHAR_LIMIT] # an operation like this could be reorderd for better efficiency! + Text.upload_text_2(review, text) + +@cascade +class UniqueId(): + @staticmethod + def upload_unique_id_2(review: ComposeReview): + # TODO: support external libraries + # review_id = uuid.uuid1().int >> 64 + review_id = 424242 + review.upload_unique_id(review_id) + +@cascade +class Text(): + @staticmethod + def upload_text_2(review: ComposeReview, text: str): + review.upload_text(text) + + +@cascade +class Plot: + def __init__(self, movie_id: str, plot: str): + self.movie_id = movie_id + self.plot = plot + +@cascade +class MovieInfo: + def __init__(self, movie_id: str, info: dict): + self.movie_id = movie_id + self.info = info \ No newline at end of file diff --git a/deathstar_movie_review/entities/frontend.py b/deathstar_movie_review/entities/frontend.py index 88a71cf..9da7e88 100644 --- a/deathstar_movie_review/entities/frontend.py +++ b/deathstar_movie_review/entities/frontend.py @@ -8,7 +8,6 @@ from deathstar_movie_review.entities.user import User from deathstar_movie_review.entities.text import Text, text_op - CHAR_LIMIT = 50 # frontend is made stateless diff --git a/deathstar_movie_review/entities/unique_id.py b/deathstar_movie_review/entities/unique_id.py index 007db32..7972857 100644 --- a/deathstar_movie_review/entities/unique_id.py +++ b/deathstar_movie_review/entities/unique_id.py @@ -4,7 +4,6 @@ from cascade.dataflow.operator import Block, StatelessOperator from deathstar_movie_review.entities.compose_review import ComposeReview - class UniqueId(): @staticmethod def upload_unique_id_2(review: ComposeReview): diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index 36892ec..9cba7ba 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -1,8 +1,10 @@ import sys import os + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) +from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.dataflow.dataflow import Event, InitClass, InvokeMethod, OpNode from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination from cascade.runtime.python_runtime import 
PythonClientSync, PythonRuntime @@ -11,35 +13,36 @@ from deathstar_movie_review.entities.movie import MovieId, movie_id_op, movie_info_op, plot_op from deathstar_movie_review.entities.frontend import frontend_op, text_op, unique_id_op, frontend_df_serial +import cascade - -def test_deathstar_movie_demo_python(): - print("starting") +def init_python_runtime() -> tuple[PythonRuntime, PythonClientSync]: runtime = PythonRuntime() + for op in cascade.core.operators.values(): + if isinstance(op, StatefulOperator): + runtime.add_operator(op) + elif isinstance(op, StatelessOperator): + runtime.add_stateless_operator(op) - # make sure we're running the serial version - prev_df = frontend_op.dataflow - frontend_op.dataflow = frontend_df_serial() - + runtime.run() + return runtime, PythonClientSync(runtime) - print(frontend_op.dataflow.to_dot()) - dead_node_elimination([], [frontend_op]) - print(frontend_op.dataflow.to_dot()) +import time +def test_deathstar_movie_demo_python(): + print("starting") + cascade.core.clear() + exec(f'import deathstar_movie_review.entities.entities') + cascade.core.init() - runtime.add_operator(compose_review_op) - runtime.add_operator(user_op) - runtime.add_operator(movie_info_op) - runtime.add_operator(movie_id_op) - runtime.add_operator(plot_op) - runtime.add_stateless_operator(frontend_op) - runtime.add_stateless_operator(unique_id_op) - runtime.add_stateless_operator(text_op) + runtime, client = init_python_runtime() + user_op = cascade.core.operators["User"] + compose_op = cascade.core.operators["ComposeReview"] + movie_op = cascade.core.operators["MovieId"] + frontend_op = cascade.core.operators["Frontend"] - runtime.run() - client = PythonClientSync(runtime) + for df in cascade.core.dataflows.values(): + print(df.to_dot()) - init_user = OpNode(User, InitClass(), read_key_from="username") - username = "username_1" + username = "myUsername" user_data = { "userId": "user1", "FirstName": "firstname", @@ -48,27 +51,32 @@ def test_deathstar_movie_demo_python(): "Password": "****", "Salt": "salt" } + print("testing user create") - event = Event(init_user, {"username": username, "user_data": user_data}, None) + + event = user_op.dataflows["__init__"].generate_event({"username": username, "user_data": user_data}, username) result = client.send(event) - assert isinstance(result, User) and result.username == username + print(result) + assert result.username == username print("testing compose review") - req_id = 1 + req_id = "1" movie_title = "Cars 2" movie_id = 1 # make the review - init_compose_review = OpNode(ComposeReview, InitClass(), read_key_from="req_id") - event = Event(init_compose_review, {"req_id": req_id}, None) + event = compose_op.dataflows["__init__"].generate_event({"req_id": req_id}, req_id) result = client.send(event) print("review made") - # make the movie - init_movie = OpNode(MovieId, InitClass(), read_key_from="title") - event = Event(init_movie, {"title": movie_title, "movie_id": movie_id}, None) + + # # make the movie + # init_movie = OpNode(MovieId, InitClass(), read_key_from="title") + event = movie_op.dataflows["__init__"].generate_event({"title": movie_title, "movie_id": movie_id}, movie_title) result = client.send(event) + # event = Event(init_movie, {"title": movie_title, "movie_id": movie_id}, None) + # result = client.send(event) print("movie made") # compose the review @@ -80,11 +88,11 @@ def test_deathstar_movie_demo_python(): "text": "good movie!" 
} - event = Event( - frontend_op.dataflow.entry, - review_data, - frontend_op.dataflow) - result = client.send(event) + r_data = {r+"_0": v for r, v in review_data.items()} + + event = frontend_op.dataflows["compose"].generate_event(r_data) + result = client.send(event, block=False) + print(result) print("review composed") @@ -96,19 +104,19 @@ def test_deathstar_movie_demo_python(): {"req_id": req_id}, None ) + event = compose_op.dataflows["get_data"].generate_event({"req_id": req_id}, req_id) result = client.send(event) + print(result) + print(runtime.statefuloperators["ComposeReview"].states["1"].review_data) + # time.sleep(0.5) + + # result = client.send(event) expected = { "userId": user_data["userId"], "movieId": movie_id, "text": review_data["text"] } - print(result, expected) + # print(result, expected) assert "review_id" in result del result["review_id"] # randomly generated - assert result == expected - - print("Success!") - - # put the df back - frontend_op.dataflow = prev_df - \ No newline at end of file + assert result == expected \ No newline at end of file diff --git a/src/cascade/core.py b/src/cascade/core.py index 127a019..830ebad 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -4,14 +4,14 @@ from klara.core import nodes from klara.core.tree_rewriter import AstBuilder from klara.core.cfg import Cfg - +from klara.core.node_classes import Arguments from cascade.dataflow.operator import Block, StatefulOperator, StatelessOperator from cascade.wrappers import ClassWrapper from cascade.descriptors import ClassDescriptor, MethodDescriptor -from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions +from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions, GroupStatements from cascade.frontend.generator.generate_dataflow import GenerateDataflow -from cascade.dataflow.dataflow import DataFlow, Operator +from cascade.dataflow.dataflow import CallLocal, DataFlow, DataflowRef, InitClass, Operator from cascade.frontend.intermediate_representation import StatementDataflowGraph from cascade.frontend.generator.build_compiled_method_string import BuildCompiledMethodsString from cascade.frontend.ast_visitors import ExtractTypeVisitor @@ -28,6 +28,7 @@ def setup_cfg(code: str) -> Cfg: registered_classes: list[ClassWrapper] = [] operators: dict[str, Operator] = {} +dataflows: dict[DataflowRef, DataFlow] = {} def cascade(cls, parse_file=True): if not isclass(cls): @@ -54,12 +55,8 @@ def cascade(cls, parse_file=True): registered_classes.append(class_wrapper) - -def build(method) -> tuple[DataFlow, list[Block]]: - # TODO: implement - pass - def init(): + # First pass: register operators/classes for cls in registered_classes: op_name = cls.class_desc.class_name @@ -72,13 +69,35 @@ def init(): # generate split functions for method in cls.class_desc.methods_dec: - method.build_dataflow() - df, blocks = build(method) + df_ref = DataflowRef(op_name, method.method_name) + # Add version number manually + args = [f"{str(arg)}_0" for arg in method.method_node.args.args] + # TODO: cleaner solution that checks if the function is stateful or not + if args[0] == "self_0": + args = args[1:] + dataflows[df_ref] = DataFlow(method.method_name, op_name, args) + + operators[op_name] = op + + # Second pass: build dataflows + for cls in registered_classes: + op_name = cls.class_desc.class_name + op = operators[op_name] + + # generate split functions + for method in cls.class_desc.methods_dec: + if method.method_name == "__init__": + df = 
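A small illustrative note on the "_0" renaming used above (the values here are made up): dataflow arguments keep their SSA version suffix, so callers rename plain keys before generating an event.

review_data = {"user": "myUsername", "rating": 9}
r_data = {k + "_0": v for k, v in review_data.items()}
assert r_data == {"user_0": "myUsername", "rating_0": 9}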
DataFlow("__init__", op_name) + n0 = CallLocal(InitClass()) + df.entry = [n0] + blocks = [] + else: + df, blocks = GroupStatements(method.method_node).build(dataflows, op_name) + op.dataflows[df.name] = df for b in blocks: op.methods[b.name] = b - operators[op_name] = op diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index ed42f38..0afb1ef 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING import uuid +import cascade if TYPE_CHECKING: # Prevent circular imports @@ -154,8 +155,7 @@ class DataflowRef: dataflow_name: str def get_dataflow(self) -> 'DataFlow': - operator: Operator = cascade.operators[operator_name] - return operator.dataflows[self.dataflow_name] + return cascade.core.dataflows[self] def __repr__(self) -> str: return f"{self.operator_name}.{self.dataflow_name}" @@ -176,28 +176,40 @@ class CallEntity(Node): assign_result_to: Optional[str] = None """What variable to assign the result of this node to, if any.""" - key: Optional[str] = None + keyby: Optional[str] = None """The key, for calls to Stateful Entities""" def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['Event']: # remap the variable map of event into the new event new_var_map = {key: event.variable_map[value] for key, value in self.variable_rename.items()} - + if self.keyby: + new_key = event.variable_map[self.keyby] + else: + new_key = None df = self.dataflow.get_dataflow() new_targets = df.entry if not isinstance(new_targets, list): new_targets = [new_targets] - # targets: the list of targets to go to after this dataflow node - call = CallStackItem(event.dataflow, self.assign_result_to, event.variable_map, targets) - event.call_stack.append(call) + # Tail call elimination: + # "targets" corresponds to where to go after this CallEntity finishes + # the call to self.dataflow + # + # If this CallEntity is a terminal node in event.dataflow, then we don't + # need to go back to event.dataflow, so we don't add it to the call stack. + # This node is terminal in event.dataflow iff len(targets) == 0 + if len(targets) > 0: + call = CallStackItem(event.dataflow, self.assign_result_to, event.variable_map, targets, key=event.key) + event.call_stack.append(call) + return [Event( target, new_var_map, df, _id=event._id, metadata=event.metadata, - call_stack=event.call_stack) + call_stack=event.call_stack, + key=new_key) for target in new_targets] @@ -209,14 +221,19 @@ class CallLocal(Node): def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: # For simple calls, we only need to change the target. 
# Multiple targets results in multiple events - return [Event( + events = [] + for target in targets: + ev = Event( target, event.variable_map, event.dataflow, + call_stack=event.call_stack, _id=event._id, - metadata=event.metadata) - - for target in targets] + metadata=event.metadata, + key=event.key) + + events.append(ev) + return events @dataclass class CollectNode(Node): @@ -238,7 +255,8 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) _id=event._id, call_stack=event.call_stack, # collect_target=ct, - metadata=event.metadata) + metadata=event.metadata, + key=event.key) for target in targets] @@ -283,7 +301,7 @@ def __init__(self, name: str, op_name: str=None, args: list[str]=None): self.args = args def get_operator(self) -> Operator: - return cascade.ops[self.op_name] + return cascade.core.operators[self.op_name] def add_node(self, node: Node): """Add a node to the Dataflow graph if it doesn't already exist.""" @@ -368,7 +386,7 @@ def get_predecessors(self, node: Node) -> List[Node]: def to_dot(self) -> str: """Output the DataFlow graph in DOT (Graphviz) format.""" - lines = [f"digraph {self.name} {{"] + lines = [f"digraph {self.op_name}.{self.name} {{"] # Add nodes for node in self.nodes.values(): @@ -382,18 +400,18 @@ def to_dot(self) -> str: lines.append("}") return "\n".join(lines) - def generate_event(self, variable_map: dict[str, Any]) -> Union['Event', list['Event']]: + def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None) -> Union['Event', list['Event']]: if isinstance(self.entry, list): assert len(self.entry) != 0 # give all the events the same id - first_event = Event(self.entry[0], variable_map, self) + first_event = Event(self.entry[0], variable_map, self, key=key) id = first_event._id - return [first_event] + [Event(entry, variable_map, self, _id=id) for entry in self.entry[1:]] + return [first_event] + [Event(entry, variable_map, self, _id=id, key=key) for entry in self.entry[1:]] else: - return Event(self.entry, variable_map, self) + return Event(self.entry, variable_map, self, key=key) def __repr__(self) -> str: - return f"{self.op.name()}.{self.name}" + return f"{self.op_name}.{self.name}" @dataclass class CollectTarget: @@ -418,6 +436,8 @@ class CallStackItem: var_map: dict[str, str] """Variables are saved in the call stack""" targets: Union[Node, List[Node]] + key: Optional[str] = None + """The key to use when coming back""" @dataclass class Event(): @@ -445,6 +465,9 @@ class Event(): metadata: dict = field(default_factory=metadata_dict) """Event metadata containing, for example, timestamps for benchmarking""" + + key: Optional[str] = None + """If on a Stateful Operator, the key of the state""" def __post_init__(self): if self._id is None: @@ -453,10 +476,9 @@ def __post_init__(self): def propogate(self, result: Any) -> Union['EventResult', list['Event']]: """Propogate this event through the Dataflow.""" - targets = self.dataflow.get_neighbors(self.target) - if len(targets) == 0: + if len(targets) == 0 and not isinstance(self.target, CallEntity): if len(self.call_stack) > 0: caller = self.call_stack.pop() @@ -467,25 +489,30 @@ def propogate(self, result: Any) -> Union['EventResult', list['Event']]: var_map = caller.var_map if (x := caller.assign_result_to): var_map[x] = result - - return [Event( - target, - var_map, - new_df, - _id=self._id, - metadata=self.metadata, + + events = [] + + for target in new_targets: + ev = Event( + target, + var_map, + new_df, + _id=self._id, + call_stack=self.call_stack, + 
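The call-stack handling above amounts to tail-call elimination with an explicit stack: a resume frame is pushed only when the caller still has nodes left to run after the nested dataflow returns. A minimal sketch of that rule (the names here are illustrative, not the real classes):

def enter_nested_call(call_stack, remaining_targets, caller_vars, assign_result_to):
    """Push a resume frame unless the nested call is in tail position."""
    if remaining_targets:  # caller resumes afterwards: remember where and with what
        call_stack.append((remaining_targets, caller_vars, assign_result_to))
    # tail position: the nested dataflow's result simply becomes the caller's result

stack = []
enter_nested_call(stack, [], {"x": 1}, "y")             # tail call
assert stack == []
enter_nested_call(stack, ["next_node"], {"x": 1}, "y")  # non-tail call
assert len(stack) == 1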
metadata=self.metadata, + key=caller.key ) + events.append(ev) - for target in new_targets] + return events - else: + else: return EventResult(self._id, result, self.metadata) else: current_node = self.target - - - return current_node.propogate(self, targets, result) + new = current_node.propogate(self, targets, result) + return new @dataclass class EventResult(): diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py index 6307834..fa8074f 100644 --- a/src/cascade/dataflow/operator.py +++ b/src/cascade/dataflow/operator.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Any, Generic, Mapping, Protocol, Type, TypeVar, Union -from cascade.dataflow.dataflow import DataFlow, InvokeMethod, Operator +from cascade.dataflow.dataflow import CallLocal, DataFlow, InitClass, InvokeMethod, Operator T = TypeVar('T') @@ -56,8 +56,7 @@ class StatefulOperator(Generic[T], Operator): methods, instead reading and modifying the underlying class `T` through a state variable, see `handle_invoke_method`. """ - # TODO: keyby should not be optional - def __init__(self, entity: Type[T], methods: dict[str, Block], dataflows: dict[str, DataFlow], keyby: str=""): + def __init__(self, entity: Type[T], methods: dict[str, Block], dataflows: dict[str, DataFlow]): """Create the StatefulOperator from a class and its compiled methods. Typically, a class could be comprised of split and non-split methods. Take the following example: @@ -110,9 +109,7 @@ def user_buy_item_1(variable_map: dict[str, Any], state: User): self.methods = methods self.entity = entity self.dataflows = dataflows - self.keyby = keyby """A mapping from method names to DataFlows""" - def handle_init_class(self, *args, **kwargs) -> T: """Create an instance of the underlying class. Equivalent to `T.__init__(*args, **kwargs)`.""" @@ -127,7 +124,7 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any The state `T` is passed along to the function, and may be modified. """ return self.methods[method.method_name].call(variable_map=variable_map, state=state) - + def get_method_rw_set(self, method_name: str): return super().get_method_rw_set(method_name) @@ -152,7 +149,7 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any The state `T` is passed along to the function, and may be modified. """ - return self.methods[method.method_name].call(variable_map=variable_map) + return self.methods[method.method_name].call(variable_map=variable_map, state=None) def get_method_rw_set(self, method_name: str): return super().get_method_rw_set(method_name) diff --git a/src/cascade/dataflow/optimization/dead_node_elim.py b/src/cascade/dataflow/optimization/dead_node_elim.py index d1a9d06..414bd6d 100644 --- a/src/cascade/dataflow/optimization/dead_node_elim.py +++ b/src/cascade/dataflow/optimization/dead_node_elim.py @@ -18,7 +18,10 @@ def is_no_op(func): return body in ("pass", "return") +# DEPRECATED as dead nodes are not commonly generated. 
+# However, some logic could be done for "flattening" calls in calls def dead_node_elimination(stateful_ops: list[StatefulOperator], stateless_ops: list[StatelessOperator]): + # Find dead functions dead_func_names = set() for op in stateful_ops: diff --git a/src/cascade/dataflow/optimization/test_dead_node_elim.py b/src/cascade/dataflow/optimization/test_dead_node_elim.py index 94b30af..87f03c4 100644 --- a/src/cascade/dataflow/optimization/test_dead_node_elim.py +++ b/src/cascade/dataflow/optimization/test_dead_node_elim.py @@ -69,7 +69,7 @@ def user_order_df(): df = user_order_df() user_op.dataflows[df.name] = df -def test_dead_node_elim(): +def DEPRECATED_test_dead_node_elim(): print(user_op.dataflows[df.name].to_dot()) dead_node_elimination([user_op], []) diff --git a/src/cascade/dataflow/test_dataflow.py b/src/cascade/dataflow/test_dataflow.py deleted file mode 100644 index a5b42af..0000000 --- a/src/cascade/dataflow/test_dataflow.py +++ /dev/null @@ -1,132 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, Edge, Event, EventResult, InvokeMethod, OpNode -from cascade.dataflow.operator import StatefulOperator - -class DummyUser: - def __init__(self, key: str, balance: int): - self.key: str = key - self.balance: int = balance - - def buy_item(self, item: 'DummyItem') -> bool: - item_price = item.get_price() # SSA - self.balance -= item_price - return self.balance >= 0 - -def buy_item_0_compiled(variable_map: dict[str, Any], state: DummyUser): - return - -def buy_item_1_compiled(variable_map: dict[str, Any], state: DummyUser): - state.balance -= variable_map["item_price"] - return state.balance >= 0 - -class DummyItem: - def __init__(self, key: str, price: int): - self.key: str = key - self.price: int = price - - def get_price(self) -> int: - return self.price - -def get_price_compiled(variable_map: dict[str, Any], state: DummyItem): - return state.price - -################## TESTS ####################### - -user = DummyUser("user", 100) -item = DummyItem("fork", 5) - -user_sop = StatefulOperator(DummyUser, - {"buy_item_0": buy_item_0_compiled, - "buy_item_1": buy_item_1_compiled}, None) - - -def test_simple_df_propogation(): - df = DataFlow("user.buy_item") - n1 = OpNode(DummyUser, InvokeMethod("buy_item_0_compiled"), read_key_from="user_key") - n2 = OpNode(DummyItem, InvokeMethod("get_price"), read_key_from="item_key", assign_result_to="item_price") - n3 = OpNode(DummyUser, InvokeMethod("buy_item_1"), read_key_from="user_key") - df.add_edge(Edge(n1, n2)) - df.add_edge(Edge(n2, n3)) - - user.buy_item(item) - event = Event(n1, {"user_key": "user", "item_key":"fork"}, df) - - # Manually propogate - item_key = buy_item_0_compiled(event.variable_map, state=user) - next_event = event.propogate(event, item_key) - - assert isinstance(next_event, list) - assert len(next_event) == 1 - assert next_event[0].target == n2 - event = next_event[0] - - # manually add the price to the variable map - item_price = get_price_compiled(event.variable_map, state=item) - assert n2.assign_result_to - event.variable_map[n2.assign_result_to] = item_price - - next_event = event.propogate(item_price) - - assert isinstance(next_event, list) - assert len(next_event) == 1 - assert next_event[0].target == n3 - event = next_event[0] - - positive_balance = buy_item_1_compiled(event.variable_map, state=user) - next_event = event.propogate(None) - assert isinstance(next_event, EventResult) - - -def test_merge_df_propogation(): - df = DataFlow("user.buy_2_items") - 
n0 = OpNode(DummyUser, InvokeMethod("buy_2_items_0"), read_key_from="user_key") - n3 = CollectNode(assign_result_to="item_prices", read_results_from="item_price") - n1 = OpNode( - DummyItem, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 0), - read_key_from="item_1_key" - ) - n2 = OpNode( - DummyItem, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 1), - read_key_from="item_2_key" - ) - n4 = OpNode(DummyUser, InvokeMethod("buy_2_items_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n0, n2)) - df.add_edge(Edge(n1, n3)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - - # User with key "foo" buys items with keys "fork" and "spoon" - event = Event(n0, {"user_key": "foo", "item_1_key": "fork", "item_2_key": "spoon"}, df) - - # Propogate the event (without actually doing any calculation) - # Normally, the key_stack should've been updated by the runtime here: - next_event = event.propogate(None) - - assert isinstance(next_event, list) - assert len(next_event) == 2 - assert next_event[0].target == n1 - assert next_event[1].target == n2 - - event1, event2 = next_event - next_event = event1.propogate(None) - - assert isinstance(next_event, list) - assert len(next_event) == 1 - assert next_event[0].target == n3 - - next_event = event2.propogate(None) - - assert isinstance(next_event, list) - assert len(next_event) == 1 - assert next_event[0].target == n3 - - final_event = next_event[0].propogate(None) - assert isinstance(final_event, list) - assert final_event[0].target == n4 diff --git a/src/cascade/frontend/ast_visitors/replace_name.py b/src/cascade/frontend/ast_visitors/replace_name.py index 26061ea..c98bd85 100644 --- a/src/cascade/frontend/ast_visitors/replace_name.py +++ b/src/cascade/frontend/ast_visitors/replace_name.py @@ -1,21 +1,42 @@ from klara.core.ssa_visitors import AstVisitor from klara.core import nodes -class ReplaceName(AstVisitor): - """get all variables (ast.name) from given node, separate by targets and values - +class ReplaceSelfWithState(AstVisitor): + """Replace attributes with "self" into "state", and remove SSA versioning. + + e.g.: + self_0.balance_0 -> state.balance """ - def __init__(self, target: str, new: str): - self.target: str = target - self.new: str = new + def __init__(self): + self.target: str = "self" + self.new: str = "state" @classmethod - def replace(cls, node, target: str, new: str): - c = cls(target, new) + def replace(cls, node): + c = cls() c.visit(node) return c - def visit_name(self, node: nodes.Name): - if node.id == self.target: - node.id = self.new \ No newline at end of file + def replace_name(self, node: nodes.Name): + node.id = self.new + node.version = -1 + + def visit_subscript(self, node: nodes.Subscript): + # e.g. 
self_0.data["something"]_0 -> state.data["something"] + if isinstance(node.value, nodes.Attribute): + name = node.value.value + if str(name) == self.target: + self.replace_name(name) + node.version = -1 + + def visit_assignattribute(self, node: nodes.AssignAttribute): + if str(node.value) == self.target : + self.replace_name(node.value) + node.version = -1 + + + def visit_attribute(self, node: nodes.Attribute): + if str(node.value) == self.target: + self.replace_name(node.value) + node.version = -1 diff --git a/src/cascade/frontend/generator/generate_split_functions.py b/src/cascade/frontend/generator/generate_split_functions.py index 2580081..eb2f94d 100644 --- a/src/cascade/frontend/generator/generate_split_functions.py +++ b/src/cascade/frontend/generator/generate_split_functions.py @@ -81,7 +81,6 @@ def split_function(self, G: nx.DiGraph) -> tuple[list[Statement], list[Statement def no_remote_dependencies_on_path(self, G: nx.DiGraph, source: Statement, target: Statement) -> bool: - print(source, target) for path in self.get_all_simple_paths(G, source, target): for n in path: if n not in [source, target] and n.is_remote(): @@ -171,12 +170,11 @@ def build(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> tuple[D blocks = [] block_num = 0 - args = self.function_def.args - df = DataFlow("name", "op_name", args) + df_ref = DataflowRef(op_name, self.cfg.name) + df = dataflows[df_ref] last_node = None for split in self._grouped_statements: - print(split) if len(split) == 1 and split[0].is_remote(): # Entity call node = to_entity_call(split[0], self.type_map, dataflows) @@ -191,6 +189,7 @@ def build(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> tuple[D if last_node == None: last_node = node df.add_node(node) + df.entry = [node] else: df.add_edge(Edge(last_node, node)) last_node = node diff --git a/src/cascade/frontend/generator/split_function.py b/src/cascade/frontend/generator/split_function.py index 021e59c..e0618d5 100644 --- a/src/cascade/frontend/generator/split_function.py +++ b/src/cascade/frontend/generator/split_function.py @@ -7,7 +7,7 @@ from cascade.dataflow.operator import Block from cascade.frontend.util import to_camel_case from cascade.frontend.intermediate_representation import Statement -from cascade.frontend.ast_visitors.replace_name import ReplaceName +from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState from cascade.frontend.generator.unparser import unparse from cascade.frontend.generator.remote_call import RemoteCall @@ -56,7 +56,7 @@ def body_to_string(self) -> str: block: RawBasicBlock = statement.block if type(block) == nodes.FunctionDef: continue - ReplaceName.replace(block, 'self', 'state') + ReplaceSelfWithState.replace(block) if type(block) == nodes.Return: body.insert(0,'key_stack.pop()') @@ -96,7 +96,7 @@ def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: di if operator_var in type_map: operator_name = type_map[operator_var] - key = str(statement.attribute.value) + key = repr(statement.attribute.value) else: # assume stateless operator operator_name = operator_var @@ -108,7 +108,7 @@ def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: di args.remove(operator_var) df_args = dataflows[dataflow].args - return CallEntity(dataflow, {a: b for a, b in zip(df_args, args, strict=True)}, assign_result_to=assign,key=key) + return CallEntity(dataflow, {a: b for a, b in zip(df_args, args, strict=True)}, assign_result_to=assign,keyby=key) class SplitFunction2: @@ -180,7 
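For intuition, the same self-to-state rewrite can be sketched with the standard library's ast module (an analogy only; the real visitor above works on klara's SSA tree and also strips version numbers):

import ast

class SelfToState(ast.NodeTransformer):
    def visit_Name(self, node: ast.Name) -> ast.Name:
        if node.id == "self":
            node.id = "state"   # self.balance -> state.balance
        return node

tree = ast.parse("self.balance = self.balance - item_price")
print(ast.unparse(SelfToState().visit(tree)))
# state.balance = state.balance - item_price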
+180,7 @@ def body_to_string(self) -> str: block: RawBasicBlock = statement.block if type(block) == nodes.FunctionDef: continue - ReplaceName.replace(block, 'self', 'state') + ReplaceSelfWithState.replace(block) body.append(unparse(block)) diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index 7a73ed5..ac12106 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -1,10 +1,11 @@ -from logging import Filter import threading -from typing import List, Type, Union +from typing import List, Union from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod, OpNode, StatelessOpNode from queue import Empty, Queue +import time + class PythonStatefulOperator(): def __init__(self, operator: StatefulOperator): self.operator = operator @@ -12,9 +13,8 @@ def __init__(self, operator: StatefulOperator): def process(self, event: Event): assert(isinstance(event.target, CallLocal)) - assert(isinstance(event.dataflow.op, StatefulOperator)) - key = event.variable_map[event.dataflow.op.keyby] + key = event.key print(f"PythonStatefulOperator[{self.operator.entity.__name__}[{key}]]: {event}") @@ -44,7 +44,7 @@ def __init__(self, operator: StatelessOperator): def process(self, event: Event): assert(isinstance(event.target, CallLocal)) - print(f"PythonStatelessOperator[{self.operator.dataflow.name}]: {event}") + print(f"PythonStatelessOperator[{self.operator.name()}]: {event}") if isinstance(event.target.method, InvokeMethod): result = self.operator.handle_invoke_method( @@ -111,16 +111,22 @@ def init(self): pass def _consume_events(self): + try: + self._run() + except Exception as e: + self.running = False + raise e + + def _run(self): self.running = True def consume_event(event: Event): if isinstance(event.target, CallLocal): - if isinstance(event.dataflow.op, StatefulOperator): - yield from self.statefuloperators[event.dataflow.op.name()].process(event) + if event.dataflow.op_name in self.statefuloperators: + yield from self.statefuloperators[event.dataflow.op_name].process(event) else: - yield from self.statelessoperators[event.dataflow.op.name()].process(event) + yield from self.statelessoperators[event.dataflow.op_name].process(event) elif isinstance(event.target, CallEntity): new_events = event.propogate(None) - print(new_events) if isinstance(new_events, EventResult): yield new_events else: @@ -142,7 +148,6 @@ def consume_event(event: Event): for ev in consume_event(event): if isinstance(ev, EventResult): - print(ev) self.results.put(ev) elif isinstance(ev, Event): events.append(ev) @@ -171,6 +176,7 @@ def __init__(self, runtime: PythonRuntime): self._results_q = runtime.results self._events = runtime.events self.results = {} + self.runtime = runtime def send(self, event: Union[Event, List[Event]], block=True): if isinstance(event, list): @@ -181,8 +187,11 @@ def send(self, event: Union[Event, List[Event]], block=True): self._events.put(event) id = event._id - while block: - er: EventResult = self._results_q.get(block=True) + while block and self.runtime.running: + try: + er: EventResult = self._results_q.get(block=False, timeout=0.1) + except Empty: + continue if id == er.event_id: self.results[er.event_id] = er.result return er.result diff --git a/test_programs/expected/__init__.py b/test_programs/expected/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git 
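One behavioural note on the client polling loop above, shown as a tiny runnable sketch: queue.Queue.get ignores timeout when block is False, so the non-blocking form returns (or raises Empty) immediately, while the blocking form is what actually waits the intended 0.1 s.

import queue, time

q = queue.Queue()

t0 = time.perf_counter()
try:
    q.get(block=False, timeout=0.1)   # timeout ignored: raises Empty immediately
except queue.Empty:
    pass
print(f"non-blocking: {time.perf_counter() - t0:.3f}s")   # ~0.000s

t0 = time.perf_counter()
try:
    q.get(block=True, timeout=0.1)    # waits up to 0.1 s for a result
except queue.Empty:
    pass
print(f"blocking:     {time.perf_counter() - t0:.3f}s")   # ~0.100s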
a/test_programs/expected/checkout_item.py b/test_programs/expected/checkout_item.py deleted file mode 100644 index 4ff2828..0000000 --- a/test_programs/expected/checkout_item.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Any - -from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, Edge, InvokeMethod, OpNode -from test_programs.target.checkout_item import User, Item - -def buy_item_0_compiled(variable_map: dict[str, Any], state: User) -> Any: - return None - - -def buy_item_1_compiled(variable_map: dict[str, Any], state: User) -> Any: - state.balance -= variable_map['item_price_0'] - return state.balance >= 0 - - -def get_price_0_compiled(variable_map: dict[str, Any], state: Item) -> Any: - return state.price - -def item_get_price_df(): - df = DataFlow("item.get_price") - n0 = CallLocal(InvokeMethod("get_price_0_compiled")) - df.entry = n0 - return df - -def user_buy_item_df(): - df = DataFlow("user.buy_item") - n0 = CallLocal(InvokeMethod("buy_item_0_compiled")) - n1 = CallEntity(item_get_price_df(), {}, "item_price_0") - n2 = CallLocal(InvokeMethod("buy_item_1_compiled")) - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n1, n2)) - df.entry = n0 - return df - diff --git a/test_programs/target/checkout_item.py b/test_programs/target/checkout_item.py index 4bbca40..becb3fc 100644 --- a/test_programs/target/checkout_item.py +++ b/test_programs/target/checkout_item.py @@ -2,20 +2,27 @@ @cascade.cascade class User: - def __init__(self, key: str, balance: int): - self.key: str = key - self.balance: int = balance + def __init__(self, username: str, balance: int): + self.username = username + self.balance = balance def buy_item(self, item: 'Item') -> bool: item_price = item.get_price() # SSA - self.balance -= item_price + self.balance = self.balance - item_price return self.balance >= 0 + + def __key__(self) -> str: + return self.username @cascade.cascade class Item: - def __init__(self, key: str, price: int): - self.key: str = key - self.price: int = price + def __init__(self, item_name: str, price: int): + self.item_name = item_name + self.price = price def get_price(self) -> int: - return self.price \ No newline at end of file + return self.price + + def __key__(self) -> str: + return self.item_name + \ No newline at end of file diff --git a/test_programs/target/operator_chaining.py b/test_programs/target/operator_chaining.py new file mode 100644 index 0000000..4001007 --- /dev/null +++ b/test_programs/target/operator_chaining.py @@ -0,0 +1,40 @@ +import cascade + +@cascade.cascade +class C: + def __init__(self, key: str): + self.key = key + + def get(self, y: int) -> int: + test = 42 + y + return test + + def __key__(self) -> str: + return self.key + +@cascade.cascade +class B: + def __init__(self, key: str): + self.key = key + + def call_c(self, c: C) -> int: + y = 0 + x = c.get(y) + return x + + def __key__(self) -> str: + return self.key + +@cascade.cascade +class A: + def __init__(self, key: str): + self.key = key + + def call_c_thru_b(self, b: B, c: C) -> int: + x = b.call_c(c) + return x*2 + + def __key__(self) -> str: + return self.key + + \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py index be98e13..44cfa6f 100644 --- a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py +++ b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py @@ -8,47 +8,6 @@ from cascade.frontend.util import setup_cfg -def get_statment(df: 
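As a plain-Python reference for the new operator_chaining target above (no Cascade machinery involved), the chained calls reduce to:

class C:
    def get(self, y: int) -> int:
        return 42 + y

class B:
    def call_c(self, c: C) -> int:
        return c.get(0)

class A:
    def call_c_thru_b(self, b: B, c: C) -> int:
        return b.call_c(c) * 2

assert C().get(0) == 42
assert B().call_c(C()) == 42
assert A().call_c_thru_b(B(), C()) == 84   # matches the assertions in test_operator_chaining below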
StatementDataflowGraph, v: nodes.Statement): - return next(s for s in df.graph.nodes if s.block == v) - - -def edge_exists_between(df: StatementDataflowGraph, v: nodes.Statement, n: nodes.Statement): - statement_v: Statement = get_statment(df, v) - statement_n: Statement = get_statment(df, n) - assert (statement_v, statement_n) in df.graph.edges - -def assert_expected_edges(df, expected_edges): - edges: list[nodes.Statement] = [(n.block, v.block) for n,v in df.graph.edges] - assert edges == expected_edges - -# TODO: FOr instance in the example below there is a indirect dependency between update balence and -# returning the balence >= 0. (side effect dependency) -def test_simple_dataflow_graph(): - program: str = dedent(""" - class User: - - def buy_item(self, item: 'Item') -> bool: - item_price = item.get_price() - self.balance -= item_price - return self.balance >= 0 - """) - cfg: Cfg = setup_cfg(program) - blocks = cfg.block_list - user_class: nodes.Block = blocks[2] - buy_item: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] - buy_item_body_0 = buy_item.body[0] - buy_item_body_1 = buy_item.body[1] - buy_item_body_2 = buy_item.body[2] - df: StatementDataflowGraph = DataflowGraphBuilder.build([buy_item] + buy_item.body) - expected_edges = [ - (buy_item, buy_item_body_0), - (buy_item, buy_item_body_1), - (buy_item, buy_item_body_2), - (buy_item_body_0, buy_item_body_1) - ] - assert_expected_edges(df, expected_edges) - - def test_ssa(): program: str = dedent(""" class Test: diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index a363f22..356f99d 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from textwrap import dedent import networkx as nx @@ -5,7 +6,7 @@ from klara.core.cfg import Cfg from klara.core import nodes -from cascade.dataflow.dataflow import DataFlow, DataflowRef +from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions, GroupStatements @@ -35,8 +36,21 @@ def get_total(item1: Stock, item2: Stock): } df, blocks = sf.build(dataflows, "Test") - print(df.to_dot()) - print(blocks) + + ## TODO: check blocks/df + assert len(df.nodes) == 3 + assert len(df.entry) == 1 + entry = df.entry[0] + assert isinstance(entry, CallEntity) + next = df.get_neighbors(entry) + assert len(next) == 1 + next = next[0] + assert isinstance(next, CallEntity) + next = df.get_neighbors(next) + assert len(next) == 1 + next = next[0] + assert isinstance(next, CallLocal) + def test_simple_block(): program: str = dedent(""" @@ -59,3 +73,81 @@ def add(x: int, y: int): assert len(blocks) == 1 assert blocks[0].call({"x_0": 3, "y_0":5 }, None) == 8 + + +def test_state(): + program = dedent(""" +class User: + def buy_item(self, item: 'Item') -> bool: + item_price = item.get_price() # SSA + self.balance = self.balance - item_price + return self.balance >= 0 +""") + + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + user_class: nodes.Block = blocks[2] + buy_item: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] + + sf = GroupStatements(buy_item) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", 
"User", ["item"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df, blocks = sf.build(dataflows, "User") + + assert len(blocks) == 1 + + + @dataclass + class User: + username: str + balance: int + + func = blocks[0].call + + user = User("a", 20) + func({"item_price_0": 10}, user) + assert user.balance == 10 + + func({"item_price_0": 13}, user) + assert user.balance == -3 + +def test_dict_state(): + program = dedent(""" +class ComposeReview: + def upload_unique_id(self, review_id: int): + self.review_data["review_id"] = review_id +""") + + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + user_class: nodes.Block = blocks[2] + upload_unique: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] + + sf = GroupStatements(upload_unique) + + dataflows = { + DataflowRef("ComposeReview", "upload_unique_id"): DataFlow("upload_unique_id", "ComposeReview", ["review_id"]), + DataflowRef("ComposeReview", "__init__"): DataFlow("__init__", "ComposeReview", ["req_id"]), + } + + df, blocks = sf.build(dataflows, "ComposeReview") + + assert len(blocks) == 1 + + + @dataclass + class User: + req_id: str + review_data: dict + + func = blocks[0].call + + print(blocks[0].raw_method_string) + + user = User("req", {}) + func({"review_id_0": 123}, user) + assert user.review_data["review_id"] == 123 diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py b/tests/frontend/dataflow_analysis/test_split_functions.py index 961e580..b5e4a47 100644 --- a/tests/frontend/dataflow_analysis/test_split_functions.py +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -37,7 +37,8 @@ def get_total(item1: Stock, item2: Stock, y: int): dataflows = { DataflowRef("Adder", "add"): DataFlow("add", "Adder", ["a", "b"]), - DataflowRef("Stock", "get_quantity"): DataFlow("get_quantity", "Item", []) + DataflowRef("Stock", "get_quantity"): DataFlow("get_quantity", "Item", []), + DataflowRef("Test", "get_total"): DataFlow("get_total", "Test", []) } diff --git a/tests/integration/flink-runtime/common.py b/tests/integration/flink-runtime/common.py index a7d7af6..c53dd99 100644 --- a/tests/integration/flink-runtime/common.py +++ b/tests/integration/flink-runtime/common.py @@ -75,10 +75,10 @@ def buy_2_items_1_compiled(variable_map: dict[str, Any], state: User) -> Any: "buy_2_items_0": buy_2_items_0_compiled, "buy_2_items_1": buy_2_items_1_compiled }, - None) + {}) item_op = StatefulOperator( - Item, {"get_price": get_price_compiled}, None + Item, {"get_price": get_price_compiled}, {} ) diff --git a/tests/optimizations/test_ops.py b/tests/optimizations/test_ops.py new file mode 100644 index 0000000..d325f6e --- /dev/null +++ b/tests/optimizations/test_ops.py @@ -0,0 +1,25 @@ +from cascade import cascade + +@cascade +class Stock: + def __init__(self, item: str, quantity: int): + self.item = item + self.quantity = quantity + + def get_quantity(self): + return self.quantity + +@cascade +class Adder: + @staticmethod + def add(a, b): + return a + b + +@cascade +class Test: + @staticmethod + def get_total(item1: Stock, item2: Stock): + x = item1.get_quantity() + y = item2.get_quantity() + total_adder = Adder.add(x, y) + return total_adder \ No newline at end of file diff --git a/tests/optimizations/test_parallelize.py b/tests/optimizations/test_parallelize.py index 0ab097a..e429610 100644 --- a/tests/optimizations/test_parallelize.py +++ b/tests/optimizations/test_parallelize.py @@ -13,106 +13,23 @@ from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, 
DataFlow, DataflowRef, Edge, Event, InitClass, InvokeMethod, Node, OpNode, StatelessOpNode from cascade.dataflow.operator import Block, StatefulOperator, StatelessOperator -class Stock: - def __init__(self, item: str, quantity: int): - self.item = item - self.quantity = quantity - - def get_quantity(self): - return self.quantity - -def get_quantity_compiled_0(variable_map: dict[str, Any], state: Stock) -> Any: - return state.quantity - -stock_op = StatefulOperator( - Stock, - { - "get_quantity_compiled_0": Block(function_call=get_quantity_compiled_0, var_map_writes=[], var_map_reads=[], name="get_quantity_compiled") - }, - {}, - keyby="item" -) - -def stock_op_df(): - df = DataFlow("get_quantity", "Stock") - n0 = CallLocal(InvokeMethod("get_quantity_compiled_0")) - df.entry = [n0] - return df - -def stock_op_init_df(): - df = DataFlow("__init__", "Stock") - n0 = CallLocal(InitClass()) - df.entry = [n0] - return df - -stock_op.dataflows["get_quantity"] = stock_op_df() -stock_op.dataflows["__init__"] = stock_op_init_df() - - -class Adder: - @staticmethod - def add(a, b): - return a + b - -def add_compiled_0(variable_map: dict[str, Any]) -> Any: - return variable_map["a"] + variable_map["b"] - -adder_op = StatelessOperator( - Adder, - { - "add_compiled_0": Block(function_call=add_compiled_0, var_map_reads=["a", "b"], var_map_writes=[], name="add_compiled_0") - }, - {} -) - -def adder_df(): - df = DataFlow("add", "Adder") - n0 = CallLocal(InvokeMethod("add_compiled_0")) - df.entry = [n0] - return df - -adder_op.dataflows["add"] = adder_df() - - -class Test: - @staticmethod - def get_total(item1: Stock, item2: Stock): - x = item1.get_quantity() - y = item2.get_quantity() - total_adder = Adder.add(x, y) - total = x + y - assert total == total_adder - return total - -def get_total_compiled_0(variable_map): - total = variable_map["x"] + variable_map["y"] - assert total == variable_map["total_adder"] - return total +import cascade def test_parallelize(): - test_op = StatelessOperator( - Test, - { - "get_total_compiled_0": Block( - function_call=get_total_compiled_0, - var_map_writes=[], - var_map_reads=["x", "y", "total_adder"], - name="get_total_compiled_0") - }, - {} - ) - - df = DataFlow("get_total", "Test") - n0 = CallEntity(DataflowRef("get_quantity", "Stock"), {"item": "item1"}, assign_result_to="x") - n1 = CallEntity(DataflowRef("get_quantity", "Stock"), {"item": "item2"}, assign_result_to="y") - n2 = CallEntity(DataflowRef("add", "Adder"), {"a": "x", "b": "y"}, assign_result_to="total_adder") - n3 = CallLocal(InvokeMethod("get_total_compiled_0")) - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n1, n2)) - df.add_edge(Edge(n2, n3)) - - df.entry = [n0] - test_op.dataflows[df.name] = df + cascade.core.clear() # clear cascadeds registerd classes. 
+ assert not cascade.core.registered_classes, "Registered classes should be empty before importing a Cascade \ + Module" + # import the module + import_module_name: str = 'test_ops' + exec(f'import tests.optimizations.{import_module_name}') + + cascade.core.init() + + print(cascade.core.operators) + test_op = cascade.core.operators["Test"] + adder_op = cascade.core.operators["Adder"] + stock_op = cascade.core.operators["Stock"] + df = test_op.dataflows["get_total"] print(df) print(df.nodes) @@ -120,6 +37,9 @@ def test_parallelize(): df.name = "get_total_parallel" test_op.dataflows[df.name] = df + assert len(test_op.dataflows["get_total_parallel"].entry) == 2 + assert len(test_op.dataflows["get_total"].entry) == 1 + runtime = PythonRuntime() runtime.add_stateless_operator(test_op) runtime.add_stateless_operator(adder_op) @@ -128,20 +48,19 @@ def test_parallelize(): client = PythonClientSync(runtime) - event = stock_op.dataflows["__init__"].generate_event({"item": "fork", "quantity": 10}) + event = stock_op.dataflows["__init__"].generate_event({"item": "fork", "quantity": 10}, key="fork") result = client.send(event) - event = stock_op.dataflows["__init__"].generate_event({"item": "spoon", "quantity": 20}) + event = stock_op.dataflows["__init__"].generate_event({"item": "spoon", "quantity": 20}, key="spoon") result = client.send(event) - event = test_op.dataflows["get_total"].generate_event({"item1": "fork", "item2": "spoon"}) + event = test_op.dataflows["get_total"].generate_event({"item1_0": "fork", "item2_0": "spoon"}) result = client.send(event) assert result == 30 - event = test_op.dataflows["get_total_parallel"].generate_event({"item1": "fork", "item2": "spoon"}) + event = test_op.dataflows["get_total_parallel"].generate_event({"item1_0": "fork", "item2_0": "spoon"}) result = client.send(event) - print(result) assert result == 30 @dataclass @@ -188,7 +107,7 @@ def parallelize(df: DataFlow): except KeyError: pass - updated = DataFlow(df.name) + updated = DataFlow(df.name, df.op_name) updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0] prev_node = None print(nodes_with_indegree_0) diff --git a/tests/programs/test_programs.py b/tests/programs/test_programs.py index 120597e..d5f25a3 100644 --- a/tests/programs/test_programs.py +++ b/tests/programs/test_programs.py @@ -5,6 +5,9 @@ import sys +from cascade.dataflow.dataflow import Event +from cascade.dataflow.operator import StatefulOperator, StatelessOperator +from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime from tests.programs.util import compare_targets_with_expected @@ -18,8 +21,9 @@ def get_target_file_list(): target_files: list[str] = get_target_file_list() -@pytest.mark.parametrize("file_name", target_files) -def test_target_programs(file_name: str): +# @pytest.mark.parametrize("file_name", target_files) +def test_checkout_item(): + file_name = "checkout_item.py" for key in list(sys.modules.keys()): if key.startswith("test_programs"): del sys.modules[key] @@ -33,5 +37,95 @@ def test_target_programs(file_name: str): cascade.core.init() assert cascade.core.registered_classes, "The Cascade module classes should be registered at this point." 
- methods: str = cascade.core.get_compiled_methods() - compare_targets_with_expected(file_name, methods, expected_program_relative_path) + + for op in cascade.core.operators.values(): + print(op.methods) + + runtime, client = init_python_runtime() + item_op = cascade.core.operators["Item"] + user_op = cascade.core.operators["User"] + event = item_op.dataflows["__init__"].generate_event({"item_name": "fork", "price": 10}, key="fork") + result = client.send(event) + assert result.price == 10 + assert result.item_name == "fork" + + event = item_op.dataflows["__init__"].generate_event({"item_name": "spoon", "price": 20}, key="spoon") + result = client.send(event) + assert result.price == 20 + assert result.__key__() == "spoon" + + event = user_op.dataflows["__init__"].generate_event({"username": "test", "balance": 15}, key="test") + user = client.send(event) + assert user.balance == 15 + assert user.__key__() == "test" + + event = user_op.dataflows["buy_item"].generate_event({"item_0": "fork"}, key=user.__key__()) + result = client.send(event) + assert runtime.statefuloperators["User"].states["test"].balance == 5 + assert result + + event = user_op.dataflows["buy_item"].generate_event({"item_0": "spoon"}, key=user.__key__()) + result = client.send(event) + assert runtime.statefuloperators["User"].states["test"].balance == -15 + assert not result + + +def test_operator_chaining(): + file_name = "operator_chaining.py" + for key in list(sys.modules.keys()): + if key.startswith("test_programs"): + del sys.modules[key] + + cascade.core.clear() + import_module_name: str = f'test_programs.target.{file_name.strip(".py")}' + exec(f'import {import_module_name}') + cascade.core.init() + + for op in cascade.core.operators.values(): + print(op.methods) + + for df in cascade.core.dataflows.values(): + print(df.to_dot()) + + runtime, client = init_python_runtime() + a_op = cascade.core.operators["A"] + b_op = cascade.core.operators["B"] + c_op = cascade.core.operators["C"] + + event = a_op.dataflows["__init__"].generate_event({"key": "aaa"}, key="aaa") + result = client.send(event) + assert result.key == "aaa" + + event = b_op.dataflows["__init__"].generate_event({"key": "bbb"}, key="bbb") + result = client.send(event) + assert result.key == "bbb" + + event = c_op.dataflows["__init__"].generate_event({"key": "ccc"}, key="ccc") + result = client.send(event) + assert result.key == "ccc" + + event = c_op.dataflows["get"].generate_event({"y_0": 0}, key="ccc") + result = client.send(event) + assert result == 42 + + print("Call C") + event = b_op.dataflows["call_c"].generate_event({ "c_0": "ccc"}, key="bbb") + print(event) + result = client.send(event) + assert result == 42 + + print("call C thru B") + event = a_op.dataflows["call_c_thru_b"].generate_event({"b_0": "bbb", "c_0": "ccc"}, key="aaa") + result = client.send(event) + assert result == 84 + +def init_python_runtime() -> tuple[PythonRuntime, PythonClientSync]: + runtime = PythonRuntime() + for op in cascade.core.operators.values(): + if isinstance(op, StatefulOperator): + runtime.add_operator(op) + elif isinstance(op, StatelessOperator): + runtime.add_stateless_operator(op) + + runtime.run() + return runtime, PythonClientSync(runtime) From eb53b890917ce17ea7e0ed98c7b80d735ea47444 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 1 Apr 2025 18:05:42 +0200 Subject: [PATCH 10/37] Get started on Flink Runtime conversion --- deathstar_movie_review/demo.py | 2 +- docker-compose.kafka.yml | 48 ++++++ 
src/cascade/runtime/flink_runtime.py | 65 +++++--- .../integration/{flink-runtime => }/common.py | 18 +- .../flink-runtime/test_collect_operator.py | 157 +++++++++++++----- .../flink-runtime/test_two_entities.py | 1 - 6 files changed, 207 insertions(+), 84 deletions(-) create mode 100644 docker-compose.kafka.yml rename tests/integration/{flink-runtime => }/common.py (89%) diff --git a/deathstar_movie_review/demo.py b/deathstar_movie_review/demo.py index 893facf..02574f0 100644 --- a/deathstar_movie_review/demo.py +++ b/deathstar_movie_review/demo.py @@ -56,7 +56,7 @@ def main(): create_topics(IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) runtime = FlinkRuntime(IN_TOPIC, OUT_TOPIC, internal_topic=INTERNAL_TOPIC) - runtime.init(kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10) + runtime.init(kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=True) print(f"Creating dataflow [{EXPERIMENT}]") diff --git a/docker-compose.kafka.yml b/docker-compose.kafka.yml new file mode 100644 index 0000000..33335f5 --- /dev/null +++ b/docker-compose.kafka.yml @@ -0,0 +1,48 @@ +version: '3.1' + +# https://docs.docker.com/guides/kafka/ + +services: + kafka: + image: apache/kafka-native + ports: + - "9092:9092" # for HOST connections + expose: + - "9093" # for DOCKER connections + environment: + # Configure listeners for both docker and host communication + KAFKA_LISTENERS: CONTROLLER://localhost:9091,HOST://0.0.0.0:9092,DOCKER://0.0.0.0:9093 + KAFKA_ADVERTISED_LISTENERS: HOST://localhost:9092,DOCKER://kafka:9093 + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,DOCKER:PLAINTEXT,HOST:PLAINTEXT + + # Settings required for KRaft mode + KAFKA_NODE_ID: 1 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@localhost:9091 + KAFKA_INTER_BROKER_LISTENER_NAME: DOCKER + + # Required for a single node cluster + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + + # Low Latency Tuning + KAFKA_NUM_NETWORK_THREADS: 8 + KAFKA_NUM_IO_THREADS: 16 + KAFKA_LOG_FLUSH_INTERVAL_MESSAGES: 1000 + KAFKA_LOG_FLUSH_INTERVAL_MS: 1000 + KAFKA_SOCKET_SEND_BUFFER_BYTES: 1024000 + KAFKA_SOCKET_RECEIVE_BUFFER_BYTES: 102400 + + # Change timestamp type for benchmark measurements + KAFKA_LOG_MESSAGE_TIMESTAMP_TYPE: LogAppendTime + + kafka-ui: + image: ghcr.io/kafbat/kafka-ui:latest + ports: + - 8080:8080 + environment: + DYNAMIC_CONFIG_ENABLED: "true" + KAFKA_CLUSTERS_0_NAME: local + KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:9093 + depends_on: + - kafka \ No newline at end of file diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index 9b7eec6..e84e546 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -13,12 +13,12 @@ from pyflink.datastream import ProcessFunction, StreamExecutionEnvironment from pyflink.datastream.output_tag import OutputTag import pickle -from cascade.dataflow.dataflow import CollectNode, CollectTarget, Event, EventResult, InitClass, InvokeMethod, Node, OpNode, StatelessOpNode +from cascade.dataflow.dataflow import CallLocal, CollectNode, CollectTarget, Event, EventResult, InitClass, InvokeMethod, Node, OpNode, StatelessOpNode from cascade.dataflow.operator import StatefulOperator, StatelessOperator from confluent_kafka import Producer, Consumer import logging -logger = logging.getLogger(__name__) +logger = logging.getLogger("cascade") logger.setLevel("INFO") console_handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - 
%(name)s - %(levelname)s - %(message)s') @@ -61,12 +61,13 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): logger.debug("FanOut Enter") - if isinstance(event.target, StatelessOpNode): - logger.debug(event.target.operator.name()) - tag = self.stateless_ops[event.target.operator.name()] - elif isinstance(event.target, OpNode): - logger.debug(event.target.entity.__name__) - tag = self.stateful_ops[event.target.entity.__name__] + if isinstance(event.target, CallLocal): + logger.debug(event) + tag = self.stateful_ops[event.dataflow.op_name] + # TODO: stateless ops + # elif isinstance(event.target, OpNode): + # logger.debug(event.target.entity.__name__) + # tag = self.stateful_ops[event.target.entity.__name__] else: logger.error(f"FanOut: Wrong target: {event}") return @@ -91,17 +92,17 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): event = profile_event(event, "STATEFUL OP INNER ENTRY") # should be handled by filters on this FlinkOperator - assert(isinstance(event.target, OpNode)) - logger.debug(f"FlinkOperator {self.operator.entity.__name__}[{ctx.get_current_key()}]: Processing: {event.target.method_type}") + assert(isinstance(event.target, CallLocal)) + logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Processing: {event.target.method}") - assert(event.target.entity == self.operator.entity) + assert(event.dataflow.op_name == self.operator.name()) key = ctx.get_current_key() assert(key is not None) - if isinstance(event.target.method_type, InitClass): + if isinstance(event.target.method, InitClass): # TODO: compile __init__ with only kwargs, and pass the variable_map itself # otherwise, order of variable_map matters for variable assignment - result = self.operator.handle_init_class(*event.variable_map.values()) + result = self.operator.handle_init_class(**event.variable_map) # Register the created key in FlinkSelectAllOperator if SELECT_ALL_ENABLED: @@ -114,8 +115,11 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): logger.debug(f"FlinkOperator {self.operator.entity.__name__}[{ctx.get_current_key()}]: Registering key: {register_key_event}") yield register_key_event - self.state.update(pickle.dumps(result)) - elif isinstance(event.target.method_type, InvokeMethod): + print(result) + print(type(result)) + # self.state.update(pickle.dumps(result)) + self.state.update(pickle.dumps(result.__dict__)) + elif isinstance(event.target.method, InvokeMethod): state = self.state.value() if state is None: # try to create the state if we haven't been init'ed @@ -123,7 +127,7 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): else: state = pickle.loads(state) - result = self.operator.handle_invoke_method(event.target.method_type, variable_map=event.variable_map, state=state) + result = self.operator.handle_invoke_method(event.target.method, variable_map=event.variable_map, state=state) # TODO: check if state actually needs to be updated if state is not None: @@ -136,8 +140,8 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): # return # result = event.key_stack[-1] - if event.target.assign_result_to is not None: - event.variable_map[event.target.assign_result_to] = result + # if event.target.assign_result_to is not None: + # event.variable_map[event.target.assign_result_to] = result new_events = event.propogate(result) if isinstance(new_events, EventResult): @@ -389,7 +393,12 @@ def __init__(self, input_topic="input-topic", 
output_topic="output-topic", ui_po Warning that this does not work well with run(collect=True)!""" - def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, parallelism=None): + self.stateless_operators: list[FlinkStatelessOperator] = [] + self.stateful_operators: list[FlinkOperator] = [] + """List of stateful operator streams, which gets appended at `add_operator`.""" + + + def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, parallelism=None, thread_mode=False): """Initialise & configure the Flink runtime. This function is required before any other calls, and requires a Kafka @@ -416,7 +425,14 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para config.set_integer("python.fn-execution.bundle.time", bundle_time) config.set_integer("python.fn-execution.bundle.size", bundle_size) - config.set_string("python.execution-mode", "thread") + # Thread mode has significant performance impacts, see + # https://flink.apache.org/2022/05/06/exploring-the-thread-mode-in-pyflink/ + # In short: + # much faster single threaded python performance + # GIL becomes an issue if running higher parallelism on the same taskmanager + # can't use with minicluster (e.g. while testing) + if thread_mode: + config.set_string("python.execution-mode", "thread") # METRICS if METRICS: @@ -540,10 +556,6 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para self.event_stream = event_stream - self.stateless_operators: list[FlinkStatelessOperator] = [] - self.stateful_operators: list[FlinkOperator] = [] - """List of stateful operator streams, which gets appended at `add_operator`.""" - logger.debug("FlinkRuntime initialized") def add_operator(self, op: StatefulOperator): @@ -571,7 +583,8 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka # create the fanout operator stateful_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateful_operators} stateless_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateless_operators} - logger.debug(f"{stateful_tags.items()}") + logger.debug(f"Stateful tags: {stateful_tags.items()}") + logger.debug(f"Stateless tags: {stateless_tags.items()}") fanout = self.event_stream.process(FanOutOperator(stateful_tags, stateless_tags)).name("FANOUT OPERATOR").disable_chaining() # create the streams @@ -581,7 +594,7 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka op_stream = ( fanout .get_side_output(tag) - .key_by(lambda e: e.variable_map[e.target.read_key_from]) + .key_by(lambda e: e.key) .process(flink_op) .name("STATEFUL OP: " + flink_op.operator.name()) ) diff --git a/tests/integration/flink-runtime/common.py b/tests/integration/common.py similarity index 89% rename from tests/integration/flink-runtime/common.py rename to tests/integration/common.py index c53dd99..02f63be 100644 --- a/tests/integration/flink-runtime/common.py +++ b/tests/integration/common.py @@ -1,14 +1,16 @@ from typing import Any from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, Edge, InvokeMethod, OpNode from cascade.runtime.flink_runtime import StatefulOperator +import cascade +@cascade.cascade class User: def __init__(self, key: str, balance: int): self.key: str = key self.balance: int = balance def update_balance(self, amount: int) -> bool: - self.balance += amount + self.balance = self.balance + amount return self.balance >= 0 def get_balance(self) -> int: @@ -16,18 +18,17 @@ def 
get_balance(self) -> int: def buy_item(self, item: 'Item') -> bool: item_price = item.get_price() # SSA - self.balance -= item_price + self.balance = self.balance - item_price return self.balance >= 0 def buy_2_items(self, item1: 'Item', item2: 'Item') -> bool: item1_price = item1.get_price() # SSA item2_price = item2.get_price() # SSA - self.balance -= item1_price + item2_price + self.balance = self.balance - item1_price + item2_price return self.balance >= 0 - def __repr__(self): - return f"User(key='{self.key}', balance={self.balance})" - + +@cascade.cascade class Item: def __init__(self, key: str, price: int): self.key: str = key @@ -35,10 +36,7 @@ def __init__(self, key: str, price: int): def get_price(self) -> int: return self.price - - def __repr__(self): - return f"Item(key='{self.key}', price={self.price})" - + def update_balance_compiled(variable_map: dict[str, Any], state: User) -> Any: state.balance += variable_map["amount"] return state.balance >= 0 diff --git a/tests/integration/flink-runtime/test_collect_operator.py b/tests/integration/flink-runtime/test_collect_operator.py index d14418f..7eceecf 100644 --- a/tests/integration/flink-runtime/test_collect_operator.py +++ b/tests/integration/flink-runtime/test_collect_operator.py @@ -1,37 +1,79 @@ """A test script for dataflows with merge operators""" from pyflink.datastream.data_stream import CloseableIterator -from common import Item, User, item_op, user_op +from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.dataflow.dataflow import Event, EventResult, InitClass, InvokeMethod, OpNode -from cascade.runtime.flink_runtime import FlinkOperator, FlinkRuntime +from cascade.runtime.flink_runtime import FlinkClientSync, FlinkOperator, FlinkRuntime import pytest +import cascade + +def init_flink_runtime() -> tuple[FlinkRuntime, FlinkClientSync]: + cascade.core.clear() + exec(f'import tests.integration.common') + cascade.core.init() + runtime = FlinkRuntime(IN_TOPIC, OUT_TOPIC, internal_topic=INTERNAL_TOPIC) + + for op in cascade.core.operators.values(): + if isinstance(op, StatefulOperator): + runtime.add_operator(op) + elif isinstance(op, StatelessOperator): + runtime.add_stateless_operator(op) + + runtime.init(parallelism=4) + return runtime, FlinkClientSync() + +import os +from confluent_kafka.admin import AdminClient, NewTopic +import logging + +KAFKA_BROKER = "localhost:9092" + +IN_TOPIC = "input-topic" +OUT_TOPIC = "output-topic" +INTERNAL_TOPIC = "internal-topic" + +def create_topics(*required_topics): + conf = { + "bootstrap.servers": KAFKA_BROKER + } + + admin_client = AdminClient(conf) + + # Fetch existing topics + existing_topics = admin_client.list_topics(timeout=5).topics.keys() + + # Find missing topics + missing_topics = [topic for topic in required_topics if topic not in existing_topics] + + if missing_topics: + print(f"Creating missing topics: {missing_topics}") + + # Define new topics (default: 1 partition, replication factor 1) + new_topics = [NewTopic(topic, num_partitions=32, replication_factor=1) for topic in missing_topics] + + # Create topics + futures = admin_client.create_topics(new_topics) + + # Wait for topic creation to complete + for topic, future in futures.items(): + try: + future.result() # Block until the operation is complete + print(f"Topic '{topic}' created successfully") + except Exception as e: + print(f"Failed to create topic '{topic}': {e}") + else: + print("All required topics exist.") + @pytest.mark.integration def test_merge_operator(): - runtime = 
FlinkRuntime("test_collect_operator") - runtime.init() - runtime.add_operator(item_op) - runtime.add_operator(user_op) - - - # Create a User object - foo_user = User("foo", 100) - init_user_node = OpNode(User, InitClass(), read_key_from="key") - event = Event(init_user_node, {"key": "foo", "balance": 100}, None) - runtime.send(event) - - # Create an Item object - fork_item = Item("fork", 5) - init_item_node = OpNode(Item, InitClass(), read_key_from="key") - event = Event(init_item_node, {"key": "fork", "price": 5}, None) - runtime.send(event) - - # Create another Item - spoon_item = Item("spoon", 3) - event = Event(init_item_node, {"key": "spoon", "price": 3}, None) - runtime.send(event, flush=True) - - collected_iterator: CloseableIterator = runtime.run(run_async=True, output="collect") + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + create_topics(IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) + runtime, client = init_flink_runtime() + + collected_iterator = runtime.run(run_async=True, output="collect") + assert isinstance(collected_iterator, CloseableIterator) records = [] def wait_for_event_id(id: int) -> EventResult: @@ -41,31 +83,54 @@ def wait_for_event_id(id: int) -> EventResult: if record.event_id == id: return record - # Make sure the user & items are initialised - wait_for_event_id(event._id) - # Have the User object buy the item - foo_user.buy_2_items(fork_item, spoon_item) - df = user_op.dataflows["buy_2_items"] + user_op = cascade.core.operators["User"] + item_op = cascade.core.operators["Item"] + event = user_op.dataflows["__init__"].generate_event({"key": "foo", "balance": 100}, key="foo") + client.send(event) - # User with key "foo" buys item with key "fork" - user_buys_cutlery = Event(df.entry, {"user_key": "foo", "item1_key": "fork", "item2_key": "spoon"}, df) - runtime.send(user_buys_cutlery, flush=True) + result = wait_for_event_id(event[0]._id) + print(result.result.__dict__) - - # Check that we were able to buy the fork - buy_fork_result = wait_for_event_id(user_buys_cutlery._id) - assert buy_fork_result.result == True + event = item_op.dataflows["__init__"].generate_event({"key": "fork", "price": 5}, key="fork") + client.send(event) + + event = item_op.dataflows["__init__"].generate_event({"key": "spoon", "price": 3}, key="spoon") + client.send(event) + + result = wait_for_event_id(event[0]._id) + print(result.result.__dict__) - # Send an event to check if the balance was updated - user_get_balance_node = OpNode(User, InvokeMethod("get_balance"), read_key_from="key") - user_get_balance = Event(user_get_balance_node, {"key": "foo"}, None) - runtime.send(user_get_balance, flush=True) + - # See that the user's balance has gone down - get_balance = wait_for_event_id(user_get_balance._id) - assert get_balance.result == 92 + + # # Have the User object buy the item + # foo_user.buy_2_items(fork_item, spoon_item) + # df = user_op.dataflows["buy_2_items"] + + event = user_op.dataflows["buy_2_items"].generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") + client.send(event) + result = wait_for_event_id(event[0]._id) + assert result.result == True + + + # Check the balance + event = user_op.dataflows["get_balance"].generate_event({}, key="foo") + client.send(event) + result = wait_for_event_id(event[0]._id) + assert result.result == 92 collected_iterator.close() + client.close() + # # Send an event to check if the balance was updated + # user_get_balance_node = OpNode(User, InvokeMethod("get_balance"), read_key_from="key") + # user_get_balance = 
Event(user_get_balance_node, {"key": "foo"}, None) + # runtime.send(user_get_balance, flush=True) + + # # See that the user's balance has gone down + # get_balance = wait_for_event_id(user_get_balance._id) + # assert get_balance.result == 92 + + # collected_iterator.close() - print(records) \ No newline at end of file + # print(records) \ No newline at end of file diff --git a/tests/integration/flink-runtime/test_two_entities.py b/tests/integration/flink-runtime/test_two_entities.py index 3d89bd2..722de21 100644 --- a/tests/integration/flink-runtime/test_two_entities.py +++ b/tests/integration/flink-runtime/test_two_entities.py @@ -1,7 +1,6 @@ """A test script for dataflows with multiple operators""" from pyflink.datastream.data_stream import CloseableIterator -from common import Item, User, item_op, user_op from cascade.dataflow.dataflow import Event, EventResult, InitClass, InvokeMethod, OpNode from cascade.runtime.flink_runtime import FlinkOperator, FlinkRuntime import pytest From ce80d9231384e4e439179bbe3022a90431f62f7a Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Wed, 2 Apr 2025 16:32:35 +0200 Subject: [PATCH 11/37] Implement DeathstarBench on Flink with new IR --- deathstar_movie_review/entities/entities.py | 4 +- deathstar_movie_review/entities/frontend.py | 2 +- .../test_movie_review_demo.py | 105 ++++++++---- src/cascade/dataflow/dataflow.py | 51 +++--- .../dataflow/optimization/parallelization.py | 98 ++++++++++- src/cascade/runtime/flink_runtime.py | 145 ++++++++-------- src/cascade/runtime/python_runtime.py | 13 +- tests/integration/common.py | 93 +---------- .../flink-runtime/test_collect_operator.py | 136 --------------- .../flink-runtime/test_select_all.py | 155 ------------------ .../flink-runtime/test_two_entities.py | 74 --------- tests/integration/flink/__init__.py | 0 .../flink/test_collect_operator.py | 69 ++++++++ .../flink/test_stateful_operators.py | 63 +++++++ tests/integration/flink/utils.py | 66 ++++++++ tests/optimizations/test_parallelize.py | 94 +---------- 16 files changed, 467 insertions(+), 701 deletions(-) delete mode 100644 tests/integration/flink-runtime/test_collect_operator.py delete mode 100644 tests/integration/flink-runtime/test_select_all.py delete mode 100644 tests/integration/flink-runtime/test_two_entities.py create mode 100644 tests/integration/flink/__init__.py create mode 100644 tests/integration/flink/test_collect_operator.py create mode 100644 tests/integration/flink/test_stateful_operators.py create mode 100644 tests/integration/flink/utils.py diff --git a/deathstar_movie_review/entities/entities.py b/deathstar_movie_review/entities/entities.py index f582073..424c3f0 100644 --- a/deathstar_movie_review/entities/entities.py +++ b/deathstar_movie_review/entities/entities.py @@ -3,9 +3,9 @@ @cascade class ComposeReview: - def __init__(self, req_id: str, *args): # *args is a temporary hack to allow for creation of composereview on the fly + def __init__(self, req_id: str, review_data: dict={}, **kwargs): # **args is a temporary hack to allow for creation of composereview on the fly self.req_id = req_id - self.review_data = {} + self.review_data = review_data def upload_unique_id(self, review_id: int): self.review_data["review_id"] = review_id diff --git a/deathstar_movie_review/entities/frontend.py b/deathstar_movie_review/entities/frontend.py index 9da7e88..62c4e1e 100644 --- a/deathstar_movie_review/entities/frontend.py +++ b/deathstar_movie_review/entities/frontend.py @@ -93,7 +93,7 @@ 
def frontend_df_parallel(): # It could be more useful to have a "Dataflow" node df = DataFlow("compose") # n0 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - ct = CollectNode(assign_result_to="results", read_results_from="dummy") + ct = CollectNode() # Upload Unique DF n1_a = StatelessOpNode(unique_id_op, InvokeMethod("upload_unique")) diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index 9cba7ba..0bfc7fb 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -1,19 +1,17 @@ +import logging import sys import os sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) +from cascade.dataflow.optimization.parallelization import parallelize from cascade.dataflow.operator import StatefulOperator, StatelessOperator -from cascade.dataflow.dataflow import Event, InitClass, InvokeMethod, OpNode -from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime -from deathstar_movie_review.entities.compose_review import ComposeReview, compose_review_op -from deathstar_movie_review.entities.user import User, user_op -from deathstar_movie_review.entities.movie import MovieId, movie_id_op, movie_info_op, plot_op -from deathstar_movie_review.entities.frontend import frontend_op, text_op, unique_id_op, frontend_df_serial import cascade +import pytest +import tests.integration.flink.utils as utils def init_python_runtime() -> tuple[PythonRuntime, PythonClientSync]: runtime = PythonRuntime() @@ -26,7 +24,7 @@ def init_python_runtime() -> tuple[PythonRuntime, PythonClientSync]: runtime.run() return runtime, PythonClientSync(runtime) -import time + def test_deathstar_movie_demo_python(): print("starting") cascade.core.clear() @@ -34,10 +32,35 @@ def test_deathstar_movie_demo_python(): cascade.core.init() runtime, client = init_python_runtime() + deathstar_movie_demo(client) + +@pytest.mark.integration +def test_deathstar_movie_demo_flink(): + print("starting") + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + runtime, client = utils.init_flink_runtime("deathstar_movie_review.entities.entities") + runtime.run(run_async=True) + + try: + deathstar_movie_demo(client) + finally: + client.close() + +def deathstar_movie_demo(client): user_op = cascade.core.operators["User"] compose_op = cascade.core.operators["ComposeReview"] movie_op = cascade.core.operators["MovieId"] frontend_op = cascade.core.operators["Frontend"] + df = parallelize(frontend_op.dataflows["compose"]) + df.name = "compose_parallel" + frontend_op.dataflows["compose_parallel"] = df + print(frontend_op.dataflows["compose_parallel"].to_dot()) + print(frontend_op.dataflows) + assert len(frontend_op.dataflows["compose_parallel"].entry) == 4 for df in cascade.core.dataflows.values(): print(df.to_dot()) @@ -55,28 +78,24 @@ def test_deathstar_movie_demo_python(): print("testing user create") event = user_op.dataflows["__init__"].generate_event({"username": username, "user_data": user_data}, username) - result = client.send(event) + result = client.send(event, block=True) print(result) assert result.username == username print("testing compose review") - req_id = "1" + req_id = "4242" movie_title = "Cars 2" movie_id = 1 # make the review event = compose_op.dataflows["__init__"].generate_event({"req_id": req_id}, req_id) - result = client.send(event) + result = 
client.send(event, block=True) print("review made") - - # # make the movie # init_movie = OpNode(MovieId, InitClass(), read_key_from="title") event = movie_op.dataflows["__init__"].generate_event({"title": movie_title, "movie_id": movie_id}, movie_title) - result = client.send(event) - # event = Event(init_movie, {"title": movie_title, "movie_id": movie_id}, None) - # result = client.send(event) + result = client.send(event, block=True) print("movie made") # compose the review @@ -91,32 +110,60 @@ def test_deathstar_movie_demo_python(): r_data = {r+"_0": v for r, v in review_data.items()} event = frontend_op.dataflows["compose"].generate_event(r_data) - result = client.send(event, block=False) - + result = client.send(event, block=True) print(result) print("review composed") - # read the review - get_review = OpNode(ComposeReview, InvokeMethod("get_data"), read_key_from="req_id") - event = Event( - get_review, - {"req_id": req_id}, - None - ) event = compose_op.dataflows["get_data"].generate_event({"req_id": req_id}, req_id) - result = client.send(event) + result = client.send(event, block=True) print(result) - print(runtime.statefuloperators["ComposeReview"].states["1"].review_data) - # time.sleep(0.5) - # result = client.send(event) expected = { "userId": user_data["userId"], "movieId": movie_id, "text": review_data["text"] } - # print(result, expected) + + assert "review_id" in result + del result["review_id"] # randomly generated + assert result == expected + + + + ## NOW DO IT PARALLEL! + # make the review + new_req_id = "43" + event = compose_op.dataflows["__init__"].generate_event({"req_id": new_req_id}, new_req_id) + result = client.send(event, block=True) + print("review made") + + # compose the review + review_data = { + "review": req_id, + "user": username, + "title": movie_title, + "rating": 2, + "text": "bad movie!" + } + + r_data = {r+"_0": v for r, v in review_data.items()} + + event = frontend_op.dataflows["compose_parallel"].generate_event(r_data) + result = client.send(event, block=True) + print(result) + print("review composed") + + event = compose_op.dataflows["get_data"].generate_event({"req_id": req_id}, req_id) + result = client.send(event, block=True) + print(result) + + expected = { + "userId": user_data["userId"], + "movieId": movie_id, + "text": "bad movie!" + } + assert "review_id" in result del result["review_id"] # randomly generated assert result == expected \ No newline at end of file diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 0afb1ef..cae3356 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Any, Callable, List, Mapping, Optional, Type, Union +from typing import Any, Callable, Iterable, List, Mapping, Optional, Type, Union from typing import TYPE_CHECKING import uuid @@ -241,13 +241,8 @@ class CollectNode(Node): It will aggregate incoming edges and output them as a list to the outgoing edge. 
Their actual implementation is runtime-dependent.""" - assign_result_to: str = "" - """The variable name in the variable map that will contain the collected result.""" - read_results_from: str = "" - """The variable name in the variable map that the individual items put their result in.""" def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: - # collect_targets = [event.collect_target for i in range(len(targets))] return [Event( target, event.variable_map, @@ -400,15 +395,24 @@ def to_dot(self) -> str: lines.append("}") return "\n".join(lines) - def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None) -> Union['Event', list['Event']]: - if isinstance(self.entry, list): + def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None) -> list['Event']: assert len(self.entry) != 0 # give all the events the same id first_event = Event(self.entry[0], variable_map, self, key=key) id = first_event._id - return [first_event] + [Event(entry, variable_map, self, _id=id, key=key) for entry in self.entry[1:]] - else: - return Event(self.entry, variable_map, self, key=key) + events = [first_event] + [Event(entry, variable_map, self, _id=id, key=key) for entry in self.entry[1:]] + + # TODO: propogate at "compile time" instead of doing this every time + local_events = [] + for ev in events: + if isinstance(ev.target, CallEntity): + local_events.extend(ev.propogate(None)) + else: + local_events.append(ev) + + return local_events + + def __repr__(self) -> str: return f"{self.op_name}.{self.name}" @@ -474,10 +478,12 @@ def __post_init__(self): # Assign a unique ID self._id = uuid.uuid4().int - def propogate(self, result: Any) -> Union['EventResult', list['Event']]: + def propogate(self, result: Any) -> Iterable[Union['EventResult', 'Event']]: """Propogate this event through the Dataflow.""" targets = self.dataflow.get_neighbors(self.target) + events = [] + if len(targets) == 0 and not isinstance(self.target, CallEntity): if len(self.call_stack) > 0: caller = self.call_stack.pop() @@ -490,8 +496,6 @@ def propogate(self, result: Any) -> Union['EventResult', list['Event']]: if (x := caller.assign_result_to): var_map[x] = result - events = [] - for target in new_targets: ev = Event( target, @@ -502,18 +506,21 @@ def propogate(self, result: Any) -> Union['EventResult', list['Event']]: metadata=self.metadata, key=caller.key ) - events.append(ev) - - return events - + events.append(ev) else: - return EventResult(self._id, result, self.metadata) + yield EventResult(self._id, result, self.metadata) + return else: current_node = self.target - new = current_node.propogate(self, targets, result) - return new - + events = current_node.propogate(self, targets, result) + + for event in events: + if isinstance(event.target, CallEntity): + # recursively propogate CallEntity events + yield from event.propogate(None) + else: + yield event @dataclass class EventResult(): event_id: int diff --git a/src/cascade/dataflow/optimization/parallelization.py b/src/cascade/dataflow/optimization/parallelization.py index 79e3ea4..0dd2b12 100644 --- a/src/cascade/dataflow/optimization/parallelization.py +++ b/src/cascade/dataflow/optimization/parallelization.py @@ -180,12 +180,92 @@ https://en.wikipedia.org/wiki/Optimizing_compiler#Specific_techniques """ -from cascade.dataflow.operator import StatefulOperator, StatelessOperator - - -def node_parallelization(stateful_ops: list[StatefulOperator], stateless_ops: list[StatelessOperator]): - # Find 
parallelizable nodes - for op in stateful_ops: - for dataflow in op.dataflows.values(): - pass - # Parallize them \ No newline at end of file +from dataclasses import dataclass +from typing import Any +from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, DataFlow, DataflowRef, Edge, Event, InitClass, InvokeMethod, Node, OpNode, StatelessOpNode +from cascade.dataflow.operator import Block, StatefulOperator, StatelessOperator + + +@dataclass +class AnnotatedNode: + node: Node + reads: list[str] + writes: list[str] + + +import networkx as nx +def parallelize(df: DataFlow): + # create the dependency graph + ans = [] + # since we use SSA, every variable has exactly one node that writes it + write_nodes = {} + graph = nx.DiGraph() + for node in df.nodes.values(): + if isinstance(node, CallEntity): + reads = list(node.variable_rename.values()) + writes = [result] if (result := node.assign_result_to) else [] + elif isinstance(node, CallLocal): + method = df.get_operator().methods[node.method.method_name] + reads = method.var_map_reads + writes = method.var_map_writes + else: + raise ValueError(f"unsupported node type: {type(node)}") + + write_nodes.update({var: node.id for var in writes}) + + ans.append(AnnotatedNode(node, reads, writes)) + graph.add_node(node.id) + + nodes_with_indegree_0 = set(graph.nodes) + n_map = df.nodes + for node in ans: + for read in node.reads: + if read in write_nodes: + # "read" will not be in write nodes if it is part of the arguments + # a more thorough implementation would not need the if check, + # and add the arguments as writes to some function entry node + graph.add_edge(write_nodes[read], node.node.id) + try: + nodes_with_indegree_0.remove(node.node.id) + except KeyError: + pass + + updated = DataFlow(df.name, df.op_name) + updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0] + prev_node = None + + while len(nodes_with_indegree_0) > 0: + # remove nodes from graph + children = [] + for node_id in nodes_with_indegree_0: + children.extend(graph.successors(node_id)) + graph.remove_node(node_id) + updated.add_node(n_map[node_id]) + + + # check for new indegree 0 nodes + next_nodes = set() + for child in children: + if graph.in_degree(child) == 0: + next_nodes.add(child) + + if len(nodes_with_indegree_0) > 1: + # TODO: maybe collect node should just infer from it's predecessors? 
+ # like it can only have DataFlowNode predecessors + # TODO: rename DataflowNode to EntityCall + collect_node = CollectNode() + for node_id in nodes_with_indegree_0: + if prev_node: + updated.add_edge(Edge(prev_node, n_map[node_id])) + updated.add_edge(Edge(n_map[node_id], collect_node)) + prev_node = collect_node + else: + node_id = nodes_with_indegree_0.pop() + if prev_node: + updated.add_edge(Edge(prev_node, n_map[node_id])) + + prev_node = n_map[node_id] + + nodes_with_indegree_0 = next_nodes + + return updated diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index e84e546..e351b97 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -59,20 +59,20 @@ def __init__(self, stateful_ops: dict[str, OutputTag], stateless_ops: dict[str, def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): event = profile_event(event, "FanOut") - logger.debug("FanOut Enter") + logger.debug(f"FanOut Event entered: {event._id}") if isinstance(event.target, CallLocal): logger.debug(event) - tag = self.stateful_ops[event.dataflow.op_name] - # TODO: stateless ops - # elif isinstance(event.target, OpNode): - # logger.debug(event.target.entity.__name__) - # tag = self.stateful_ops[event.target.entity.__name__] + if event.dataflow.op_name in self.stateful_ops: + tag = self.stateful_ops[event.dataflow.op_name] + else: + tag = self.stateless_ops[event.dataflow.op_name] + else: logger.error(f"FanOut: Wrong target: {event}") return - logger.debug(f"Fanout: {tag.tag_id}") + logger.debug(f"Fanout Event routed to: {tag.tag_id}") yield tag, event @@ -112,26 +112,29 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): None, _id = event._id ) - logger.debug(f"FlinkOperator {self.operator.entity.__name__}[{ctx.get_current_key()}]: Registering key: {register_key_event}") + logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Registering key: {register_key_event}") yield register_key_event - print(result) - print(type(result)) # self.state.update(pickle.dumps(result)) self.state.update(pickle.dumps(result.__dict__)) + elif isinstance(event.target.method, InvokeMethod): state = self.state.value() if state is None: - # try to create the state if we haven't been init'ed - state = self.operator.handle_init_class(*event.variable_map.values()) - else: - state = pickle.loads(state) + logger.error(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: State does not exist for key {ctx.get_current_key()}") + raise KeyError + + state = pickle.loads(state) + # TODO: don't create a new class instance, instead use the __dict__ directly in self.state + # requires changes in compilation, i.e. self.balance -> state["balance"] + state = self.operator.entity(**state) result = self.operator.handle_invoke_method(event.target.method, variable_map=event.variable_map, state=state) # TODO: check if state actually needs to be updated if state is not None: - self.state.update(pickle.dumps(state)) + # TODO: "state" should already be the __dict__ itself. 
+ self.state.update(pickle.dumps(state.__dict__)) # Filter targets are used in cases of [hotel for hotel in Hotel.__all__() *if hotel....*] # elif isinstance(event.target.method_type, Filter): # state = pickle.loads(self.state.value()) @@ -143,13 +146,14 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): # if event.target.assign_result_to is not None: # event.variable_map[event.target.assign_result_to] = result - new_events = event.propogate(result) - if isinstance(new_events, EventResult): - logger.debug(f"FlinkOperator {self.operator.entity.__name__}[{ctx.get_current_key()}]: Returned {new_events}") - yield new_events + new_events = list(event.propogate(result)) + + if len(new_events) == 1 and isinstance(new_events[0], EventResult): + logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Returned {new_events[0]}") else: - logger.debug(f"FlinkOperator {self.operator.entity.__name__}[{ctx.get_current_key()}]: Propogated {len(new_events)} new Events") - yield from new_events + logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Propogated {len(new_events)} new Events") + + yield from new_events class FlinkStatelessOperator(ProcessFunction): """Wraps an `cascade.dataflow.datflow.StatefulOperator` in a KeyedProcessFunction so that it can run in Flink. @@ -159,28 +163,27 @@ def __init__(self, operator: StatelessOperator) -> None: self.operator = operator - def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): + def process_element(self, event: Event, ctx: ProcessFunction.Context): event = profile_event(event, "STATELESS OP INNER ENTRY") - # should be handled by filters on this FlinkOperator - assert(isinstance(event.target, StatelessOpNode)) + assert isinstance(event.target, CallLocal) - logger.debug(f"FlinkStatelessOperator {self.operator.dataflow.name}[{event._id}]: Processing: {event.target.method_type}") - if isinstance(event.target.method_type, InvokeMethod): - result = self.operator.handle_invoke_method(event.target.method_type, variable_map=event.variable_map) + logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Processing: {event.target.method}") + if isinstance(event.target.method, InvokeMethod): + result = self.operator.handle_invoke_method(event.target.method, variable_map=event.variable_map) else: raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method_type}") - if event.target.assign_result_to is not None: - event.variable_map[event.target.assign_result_to] = result - new_events = event.propogate(result) - if isinstance(new_events, EventResult): - logger.debug(f"FlinkStatelessOperator {self.operator.dataflow.name}[{event._id}]: Returned {new_events}") - yield new_events + new_events = list(event.propogate(result)) + + if len(new_events) == 1 and isinstance(new_events[0], EventResult): + logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Returned {new_events[0]}") else: - logger.debug(f"FlinkStatelessOperator {self.operator.dataflow.name}[{event._id}]: Propogated {len(new_events)} new Events") - yield from new_events + logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Propogated {len(new_events)} new Events") + + yield from new_events + class FlinkSelectAllOperator(KeyedProcessFunction): """A process function that yields all keys of a certain class""" @@ -225,15 +228,6 @@ class Result(ABC): their computation.""" pass -@dataclass -class Arrived(Result): - val: Any - -@dataclass 
-class NotArrived(Result): - pass - - class FlinkCollectOperator(KeyedProcessFunction): """Flink implementation of a merge operator.""" def __init__(self): @@ -241,49 +235,36 @@ def __init__(self): def open(self, runtime_context: RuntimeContext): descriptor = ValueStateDescriptor("merge_state", Types.PICKLED_BYTE_ARRAY()) - self.collection = runtime_context.get_state(descriptor) + self.var_map = runtime_context.get_state(descriptor) def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): event = profile_event(event, "COLLECT OP INNER ENTRY") - collection: list[Result] = self.collection.value() + var_map_num_items = self.var_map.value() logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Processing: {event}") - # for now we assume there is only 1 merge per df - assert event.collect_target is not None - entry: CollectTarget = event.collect_target - target_node: CollectNode = entry.target_node + total_events = len(event.dataflow.get_predecessors(event.target)) # Add to the map - if collection == None: + if var_map_num_items == None: logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Creating map") - collection = [NotArrived()] * entry.total_items - logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Processed event {entry.result_idx} ({entry.total_items})") - - result = None - try: - result = event.variable_map[target_node.read_results_from] - except KeyError: - pass - - collection[entry.result_idx] = Arrived(result) - self.collection.update(collection) + combined_var_map = {} + num_items = 0 + else: + combined_var_map, num_items = var_map_num_items + + combined_var_map.update(event.variable_map) + num_items += 1 + logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Recieved {num_items}/{total_events} Events") + - # Yield events if the merge is done - if all([isinstance(r, Arrived) for r in collection]): + if num_items == total_events: logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Yielding collection") - - collection = [r.val for r in collection if r.val is not None] # type: ignore (r is of type Arrived) - event.variable_map[target_node.assign_result_to] = collection - new_events = event.propogate(collection) - - self.collection.clear() - if isinstance(new_events, EventResult): - logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Returned {new_events}") - yield new_events - else: - logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Propogated {len(new_events)} new Events") - yield from new_events + event.variable_map = combined_var_map + yield from event.propogate(None) + self.var_map.clear() + else: + self.var_map.update((combined_var_map, num_items)) class ByteSerializer(SerializationSchema, DeserializationSchema): @@ -723,17 +704,23 @@ def consume_results(self): def flush(self): self.producer.flush() - def send(self, event: Union[Event, list[Event]], flush=False) -> int: + def send(self, event: Union[Event, list[Event]], flush=False, block=False) -> int: if isinstance(event, list): for e in event: id = self._send(e) else: id = self._send(event) - if flush: + if flush or block: self.producer.flush() - return id + if block: + while (r := self._futures[id]["ret"]) == None: + time.sleep(0.1) + + return r.result + else: + return id def _send(self, event: Event) -> int: diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index ac12106..99c52e8 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -79,15 +79,8 @@ def process(self, event: Event): for 
event in self.state[key]: var_map.update(event.variable_map) - new_event = Event( - target=event.target, - variable_map=var_map, - dataflow=event.dataflow, - _id=event._id, - call_stack=event.call_stack, - metadata=event.metadata - ) - new_events = new_event.propogate(None) + event.variable_map = var_map + new_events = event.propogate(None) if isinstance(new_events, EventResult): yield new_events else: @@ -95,8 +88,6 @@ def process(self, event: Event): - - class PythonRuntime(): """Simple non-distributed runtime meant for testing that runs Dataflows locally.""" def __init__(self): diff --git a/tests/integration/common.py b/tests/integration/common.py index 02f63be..f7e887c 100644 --- a/tests/integration/common.py +++ b/tests/integration/common.py @@ -24,7 +24,8 @@ def buy_item(self, item: 'Item') -> bool: def buy_2_items(self, item1: 'Item', item2: 'Item') -> bool: item1_price = item1.get_price() # SSA item2_price = item2.get_price() # SSA - self.balance = self.balance - item1_price + item2_price + total = item1_price + item2_price + self.balance = self.balance - total return self.balance >= 0 @@ -35,92 +36,4 @@ def __init__(self, key: str, price: int): self.price: int = price def get_price(self) -> int: - return self.price - -def update_balance_compiled(variable_map: dict[str, Any], state: User) -> Any: - state.balance += variable_map["amount"] - return state.balance >= 0 - -def get_balance_compiled(variable_map: dict[str, Any], state: User) -> Any: - return state.balance - -def get_price_compiled(variable_map: dict[str, Any], state: Item) -> Any: - return state.price - -def buy_item_0_compiled(variable_map: dict[str, Any], state: User) -> Any: - return None - -def buy_item_1_compiled(variable_map: dict[str, Any], state: User) -> Any: - state.balance = state.balance - variable_map["item_price"] - return state.balance >= 0 - - -def buy_2_items_0_compiled(variable_map: dict[str, Any], state: User) -> Any: - return None - -def buy_2_items_1_compiled(variable_map: dict[str, Any], state: User) -> Any: - state.balance -= variable_map["item_prices"][0] + variable_map["item_prices"][1] - return state.balance >= 0 - -# An operator is defined by the underlying class and the functions that can be called -user_op = StatefulOperator( - User, - { - "update_balance": update_balance_compiled, - "get_balance": get_balance_compiled, - "buy_item_0": buy_item_0_compiled, - "buy_item_1": buy_item_1_compiled, - "buy_2_items_0": buy_2_items_0_compiled, - "buy_2_items_1": buy_2_items_1_compiled - }, - {}) - -item_op = StatefulOperator( - Item, {"get_price": get_price_compiled}, {} -) - - -def user_buy_item_df(): - df = DataFlow("user.buy_item") - n0 = OpNode(User, InvokeMethod("buy_item_0"), read_key_from="user_key") - n1 = OpNode(Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - read_key_from="item_key") - n2 = OpNode(User, InvokeMethod("buy_item_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n1, n2)) - df.entry = n0 - return df - -def user_buy_2_items_df(): - df = DataFlow("user.buy_2_items") - n0 = OpNode(User, InvokeMethod("buy_2_items_0"), read_key_from="user_key") - n3 = CollectNode(assign_result_to="item_prices", read_results_from="item_price") - n1 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 0), - read_key_from="item1_key" - ) - n2 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 1), - read_key_from="item2_key" - 
) - n4 = OpNode(User, InvokeMethod("buy_2_items_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n0, n2)) - df.add_edge(Edge(n1, n3)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - df.entry = n0 - return df - -user_op.dataflows = { - "buy_2_items": user_buy_2_items_df(), - "buy_item": user_buy_item_df() - } \ No newline at end of file + return self.price \ No newline at end of file diff --git a/tests/integration/flink-runtime/test_collect_operator.py b/tests/integration/flink-runtime/test_collect_operator.py deleted file mode 100644 index 7eceecf..0000000 --- a/tests/integration/flink-runtime/test_collect_operator.py +++ /dev/null @@ -1,136 +0,0 @@ -"""A test script for dataflows with merge operators""" - -from pyflink.datastream.data_stream import CloseableIterator -from cascade.dataflow.operator import StatefulOperator, StatelessOperator -from cascade.dataflow.dataflow import Event, EventResult, InitClass, InvokeMethod, OpNode -from cascade.runtime.flink_runtime import FlinkClientSync, FlinkOperator, FlinkRuntime -import pytest - -import cascade - -def init_flink_runtime() -> tuple[FlinkRuntime, FlinkClientSync]: - cascade.core.clear() - exec(f'import tests.integration.common') - cascade.core.init() - runtime = FlinkRuntime(IN_TOPIC, OUT_TOPIC, internal_topic=INTERNAL_TOPIC) - - for op in cascade.core.operators.values(): - if isinstance(op, StatefulOperator): - runtime.add_operator(op) - elif isinstance(op, StatelessOperator): - runtime.add_stateless_operator(op) - - runtime.init(parallelism=4) - return runtime, FlinkClientSync() - -import os -from confluent_kafka.admin import AdminClient, NewTopic -import logging - -KAFKA_BROKER = "localhost:9092" - -IN_TOPIC = "input-topic" -OUT_TOPIC = "output-topic" -INTERNAL_TOPIC = "internal-topic" - -def create_topics(*required_topics): - conf = { - "bootstrap.servers": KAFKA_BROKER - } - - admin_client = AdminClient(conf) - - # Fetch existing topics - existing_topics = admin_client.list_topics(timeout=5).topics.keys() - - # Find missing topics - missing_topics = [topic for topic in required_topics if topic not in existing_topics] - - if missing_topics: - print(f"Creating missing topics: {missing_topics}") - - # Define new topics (default: 1 partition, replication factor 1) - new_topics = [NewTopic(topic, num_partitions=32, replication_factor=1) for topic in missing_topics] - - # Create topics - futures = admin_client.create_topics(new_topics) - - # Wait for topic creation to complete - for topic, future in futures.items(): - try: - future.result() # Block until the operation is complete - print(f"Topic '{topic}' created successfully") - except Exception as e: - print(f"Failed to create topic '{topic}': {e}") - else: - print("All required topics exist.") - -@pytest.mark.integration -def test_merge_operator(): - logger = logging.getLogger("cascade") - logger.setLevel("DEBUG") - create_topics(IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) - runtime, client = init_flink_runtime() - - collected_iterator = runtime.run(run_async=True, output="collect") - assert isinstance(collected_iterator, CloseableIterator) - records = [] - - def wait_for_event_id(id: int) -> EventResult: - for record in collected_iterator: - records.append(record) - print(f"Collected record: {record}") - if record.event_id == id: - return record - - - user_op = cascade.core.operators["User"] - item_op = cascade.core.operators["Item"] - event = user_op.dataflows["__init__"].generate_event({"key": "foo", "balance": 100}, key="foo") - client.send(event) - 
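For reference, the same buy-two-items scenario expressed against the new entity API used throughout this series — a minimal sketch assuming the in-process PythonRuntime (the Flink tests use FlinkClientSync and Kafka instead); argument names like item1_0 follow the SSA renaming used by the compiled dataflows:

import cascade
from cascade.dataflow.operator import StatefulOperator, StatelessOperator
from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime

cascade.core.clear()
import tests.integration.common            # registers User and Item via @cascade.cascade
cascade.core.init()

runtime = PythonRuntime()
for op in cascade.core.operators.values():
    if isinstance(op, StatefulOperator):
        runtime.add_operator(op)
    elif isinstance(op, StatelessOperator):
        runtime.add_stateless_operator(op)
runtime.run()
client = PythonClientSync(runtime)

user_op = cascade.core.operators["User"]
item_op = cascade.core.operators["Item"]
client.send(user_op.dataflows["__init__"].generate_event({"key": "foo", "balance": 100}, key="foo"))
client.send(item_op.dataflows["__init__"].generate_event({"key": "fork", "price": 5}, key="fork"))
client.send(item_op.dataflows["__init__"].generate_event({"key": "spoon", "price": 3}, key="spoon"))

# 100 - (5 + 3) = 92 >= 0, so the purchase succeeds
event = user_op.dataflows["buy_2_items"].generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo")
assert client.send(event)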
- result = wait_for_event_id(event[0]._id) - print(result.result.__dict__) - - event = item_op.dataflows["__init__"].generate_event({"key": "fork", "price": 5}, key="fork") - client.send(event) - - event = item_op.dataflows["__init__"].generate_event({"key": "spoon", "price": 3}, key="spoon") - client.send(event) - - result = wait_for_event_id(event[0]._id) - print(result.result.__dict__) - - - - - # # Have the User object buy the item - # foo_user.buy_2_items(fork_item, spoon_item) - # df = user_op.dataflows["buy_2_items"] - - event = user_op.dataflows["buy_2_items"].generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") - client.send(event) - result = wait_for_event_id(event[0]._id) - assert result.result == True - - - # Check the balance - event = user_op.dataflows["get_balance"].generate_event({}, key="foo") - client.send(event) - result = wait_for_event_id(event[0]._id) - assert result.result == 92 - - collected_iterator.close() - client.close() - # # Send an event to check if the balance was updated - # user_get_balance_node = OpNode(User, InvokeMethod("get_balance"), read_key_from="key") - # user_get_balance = Event(user_get_balance_node, {"key": "foo"}, None) - # runtime.send(user_get_balance, flush=True) - - # # See that the user's balance has gone down - # get_balance = wait_for_event_id(user_get_balance._id) - # assert get_balance.result == 92 - - # collected_iterator.close() - - # print(records) \ No newline at end of file diff --git a/tests/integration/flink-runtime/test_select_all.py b/tests/integration/flink-runtime/test_select_all.py deleted file mode 100644 index 9e0360d..0000000 --- a/tests/integration/flink-runtime/test_select_all.py +++ /dev/null @@ -1,155 +0,0 @@ -# """ -# The select all operator is used to fetch all keys for a single entity -# """ -# import math -# import random -# from dataclasses import dataclass -# from typing import Any - -# from pyflink.datastream.data_stream import CloseableIterator - -# from cascade.dataflow.dataflow import CollectNode, DataFlow, Edge, Event, EventResult, InitClass, InvokeMethod, OpNode, SelectAllNode, StatelessOpNode -# from cascade.dataflow.operator import StatefulOperator, StatelessOperator -# from cascade.runtime.flink_runtime import FlinkOperator, FlinkRuntime, FlinkStatelessOperator -# import time -# import pytest - -# @dataclass -# class Geo: -# x: int -# y: int - -# class Hotel: -# def __init__(self, name: str, loc: Geo): -# self.name = name -# self.loc = loc - -# def get_name(self) -> str: -# return self.name - -# def distance(self, loc: Geo) -> float: -# return math.sqrt((self.loc.x - loc.x) ** 2 + (self.loc.y - loc.y) ** 2) - -# def __repr__(self) -> str: -# return f"Hotel({self.name}, {self.loc})" - - -# def distance_compiled(variable_map: dict[str, Any], state: Hotel) -> Any: -# loc = variable_map["loc"] -# return math.sqrt((state.loc.x - loc.x) ** 2 + (state.loc.y - loc.y) ** 2) - -# def get_name_compiled(variable_map: dict[str, Any], state: Hotel) -> Any: -# return state.name - -# hotel_op = StatefulOperator(Hotel, -# {"distance": distance_compiled, -# "get_name": get_name_compiled}, {}) - - - -# def get_nearby(hotels: list[Hotel], loc: Geo, dist: float): -# return [hotel.get_name() for hotel in hotels if hotel.distance(loc) < dist] - - -# # We compile just the predicate, the select is implemented using a selectall node -# def get_nearby_predicate_compiled_0(variable_map: dict[str, Any]): -# pass - -# def get_nearby_predicate_compiled_1(variable_map: dict[str, Any]) -> bool: -# loc = 
variable_map["loc"] -# dist = variable_map["dist"] -# hotel_dist = variable_map["hotel_distance"] -# return hotel_dist < dist - -# def get_nearby_body_compiled_0(variable_map: dict[str, Any]): -# pass - -# def get_nearby_body_compiled_1(variable_map: dict[str, Any]) -> str: -# return variable_map["hotel_name"] - -# get_nearby_op = StatelessOperator({ -# "get_nearby_predicate_compiled_0": get_nearby_predicate_compiled_0, -# "get_nearby_predicate_compiled_1": get_nearby_predicate_compiled_1, -# "get_nearby_body_compiled_0": get_nearby_body_compiled_0, -# "get_nearby_body_compiled_1": get_nearby_body_compiled_1, -# }, None) - -# # dataflow for getting all hotels within region -# df = DataFlow("get_nearby") -# n7 = CollectNode("get_nearby_result", "get_nearby_body") -# n0 = SelectAllNode(Hotel, n7, assign_key_to="hotel_key") -# n1 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_predicate_compiled_0")) -# n2 = OpNode(Hotel, InvokeMethod("distance"), assign_result_to="hotel_distance", read_key_from="hotel_key") -# n3 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_predicate_compiled_1"), is_conditional=True) -# n4 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_body_compiled_0")) -# n5 = OpNode(Hotel, InvokeMethod("get_name"), assign_result_to="hotel_name", read_key_from="hotel_key") -# n6 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_body_compiled_1"), assign_result_to="get_nearby_body") - -# df.add_edge(Edge(n0, n1)) -# df.add_edge(Edge(n1, n2)) -# df.add_edge(Edge(n2, n3)) -# df.add_edge(Edge(n3, n4, if_conditional=True)) -# df.add_edge(Edge(n3, n7, if_conditional=False)) -# df.add_edge(Edge(n4, n5)) -# df.add_edge(Edge(n5, n6)) -# df.add_edge(Edge(n6, n7)) -# get_nearby_op.dataflow = df - -# @pytest.mark.integration -# def test_nearby_hotels(): -# runtime = FlinkRuntime("test_nearby_hotels") -# runtime.init() -# runtime.add_operator(hotel_op) -# runtime.add_stateless_operator(get_nearby_op) - -# # Create Hotels -# hotels = [] -# init_hotel = OpNode(Hotel, InitClass(), read_key_from="name") -# random.seed(42) -# for i in range(20): -# coord_x = random.randint(-10, 10) -# coord_y = random.randint(-10, 10) -# hotel = Hotel(f"h_{i}", Geo(coord_x, coord_y)) -# event = Event(init_hotel, {"name": hotel.name, "loc": hotel.loc}, None) -# runtime.send(event) -# hotels.append(hotel) - -# collected_iterator: CloseableIterator = runtime.run(run_async=True, output='collect') -# records = [] -# def wait_for_event_id(id: int) -> EventResult: -# for record in collected_iterator: -# records.append(record) -# print(f"Collected record: {record}") -# if record.event_id == id: -# return record - -# def wait_for_n_records(num: int) -> list[EventResult]: -# i = 0 -# n_records = [] -# for record in collected_iterator: -# i += 1 -# records.append(record) -# n_records.append(record) -# print(f"Collected record: {record}") -# if i == num: -# return n_records - -# print("creating hotels") -# # Wait for hotels to be created -# wait_for_n_records(20) -# time.sleep(10) # wait for all hotels to be registered - -# dist = 5 -# loc = Geo(0, 0) -# event = Event(n0, {"loc": loc, "dist": dist}, df) -# runtime.send(event, flush=True) - -# nearby = [] -# for hotel in hotels: -# if hotel.distance(loc) < dist: -# nearby.append(hotel.name) - -# event_result = wait_for_event_id(event._id) -# results = [r for r in event_result.result if r != None] -# print(nearby) -# assert set(results) == set(nearby) \ No newline at end of file diff --git a/tests/integration/flink-runtime/test_two_entities.py 
b/tests/integration/flink-runtime/test_two_entities.py deleted file mode 100644 index 722de21..0000000 --- a/tests/integration/flink-runtime/test_two_entities.py +++ /dev/null @@ -1,74 +0,0 @@ -"""A test script for dataflows with multiple operators""" - -from pyflink.datastream.data_stream import CloseableIterator -from cascade.dataflow.dataflow import Event, EventResult, InitClass, InvokeMethod, OpNode -from cascade.runtime.flink_runtime import FlinkOperator, FlinkRuntime -import pytest - -@pytest.mark.integration -def test_two_entities(): - runtime = FlinkRuntime("test_two_entities") - runtime.init() - runtime.add_operator(item_op) - runtime.add_operator(user_op) - - # Create a User object - foo_user = User("foo", 100) - init_user_node = OpNode(User, InitClass(), read_key_from="key") - event = Event(init_user_node, {"key": "foo", "balance": 100}, None) - runtime.send(event) - - # Create an Item object - fork_item = Item("fork", 5) - init_item_node = OpNode(Item, InitClass(), read_key_from="key") - event = Event(init_item_node, {"key": "fork", "price": 5}, None) - runtime.send(event) - - # Create an expensive Item - house_item = Item("house", 1000) - event = Event(init_item_node, {"key": "house", "price": 1000}, None) - runtime.send(event) - - # Have the User object buy the item - foo_user.buy_item(fork_item) - df = user_op.dataflows["buy_item"] - - # User with key "foo" buys item with key "fork" - user_buys_fork = Event(df.entry, {"user_key": "foo", "item_key": "fork"}, df) - runtime.send(user_buys_fork, flush=True) - - collected_iterator: CloseableIterator = runtime.run(run_async=True, output="collect") - records = [] - - def wait_for_event_id(id: int) -> EventResult: - for record in collected_iterator: - records.append(record) - print(f"Collected record: {record}") - if record.event_id == id: - return record - - # Check that we were able to buy the fork - buy_fork_result = wait_for_event_id(user_buys_fork._id) - assert buy_fork_result.result == True - - # Send an event to check if the balance was updated - user_get_balance_node = OpNode(User, InvokeMethod("get_balance"), read_key_from="key") - user_get_balance = Event(user_get_balance_node, {"key": "foo"}, None) - runtime.send(user_get_balance, flush=True) - - # See that the user's balance has gone down - get_balance = wait_for_event_id(user_get_balance._id) - assert get_balance.result == 95 - - # User with key "foo" buys item with key "house" - foo_user.buy_item(house_item) - user_buys_house = Event(df.entry, {"user_key": "foo", "item_key": "house"}, df) - runtime.send(user_buys_house, flush=True) - - # Balance becomes negative when house is bought - buy_house_result = wait_for_event_id(user_buys_house._id) - assert buy_house_result.result == False - - collected_iterator.close() - - print(records) \ No newline at end of file diff --git a/tests/integration/flink/__init__.py b/tests/integration/flink/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/flink/test_collect_operator.py b/tests/integration/flink/test_collect_operator.py new file mode 100644 index 0000000..a76bc97 --- /dev/null +++ b/tests/integration/flink/test_collect_operator.py @@ -0,0 +1,69 @@ +"""A test script for dataflows with merge operators""" + +from pyflink.datastream.data_stream import CloseableIterator +from cascade.dataflow.optimization.parallelization import parallelize + +import tests.integration.flink.utils as utils +from tests.integration.flink.utils import wait_for_event_id +import pytest + +import cascade +import logging 
+
+@pytest.mark.integration
+def test_collect_operator():
+    logger = logging.getLogger("cascade")
+    logger.setLevel("DEBUG")
+
+    utils.create_topics()
+
+    runtime, client = utils.init_flink_runtime("tests.integration.common")
+    collector = runtime.run(run_async=True, output="collect")
+    assert isinstance(collector, CloseableIterator)
+
+    try:
+        _test_collect_operator(client, collector)
+    finally:
+        collector.close()
+        client.close()
+
+
+def _test_collect_operator(client, collector):
+    user_op = cascade.core.operators["User"]
+    item_op = cascade.core.operators["Item"]
+    df = parallelize(user_op.dataflows["buy_2_items"])
+    df.name = "buy_2_parallel"
+    user_op.dataflows["buy_2_parallel"] = df
+    print(user_op.dataflows["buy_2_parallel"].to_dot())
+    print(user_op.dataflows)
+    assert len(user_op.dataflows["buy_2_parallel"].entry) == 2
+
+    event = user_op.dataflows["__init__"].generate_event({"key": "foo", "balance": 100}, key="foo")
+    client.send(event)
+
+    result = wait_for_event_id(event[0]._id, collector)
+    print(result.result.__dict__)
+
+    event = item_op.dataflows["__init__"].generate_event({"key": "fork", "price": 5}, key="fork")
+    client.send(event)
+
+    event = item_op.dataflows["__init__"].generate_event({"key": "spoon", "price": 3}, key="spoon")
+    client.send(event)
+
+    result = wait_for_event_id(event[0]._id, collector)
+    print(result.result.__dict__)
+
+    # Buy a fork and spoon
+    event = user_op.dataflows["buy_2_parallel"].generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo")
+    client.send(event)
+    result = wait_for_event_id(event[0]._id, collector)
+    assert result.result == True
+
+    # Check the balance
+    event = user_op.dataflows["get_balance"].generate_event({}, key="foo")
+    client.send(event)
+    result = wait_for_event_id(event[0]._id, collector)
+    assert result.result == (100 - 5 - 3)
diff --git a/tests/integration/flink/test_stateful_operators.py b/tests/integration/flink/test_stateful_operators.py
new file mode 100644
index 0000000..b7d8479
--- /dev/null
+++ b/tests/integration/flink/test_stateful_operators.py
@@ -0,0 +1,63 @@
+"""A test script for dataflows with merge operators"""
+
+from pyflink.datastream.data_stream import CloseableIterator
+import tests.integration.flink.utils as utils
+from tests.integration.flink.utils import wait_for_event_id
+
+import pytest
+
+import cascade
+import logging
+
+@pytest.mark.integration
+def test_stateful_operator():
+    logger = logging.getLogger("cascade")
+    logger.setLevel("DEBUG")
+
+    utils.create_topics()
+
+    runtime, client = utils.init_flink_runtime("tests.integration.common")
+    collector = runtime.run(run_async=True, output="collect")
+    assert isinstance(collector, CloseableIterator)
+
+    try:
+        _test_stateful_operator(client, collector)
+    finally:
+        collector.close()
+        client.close()
+
+
+def _test_stateful_operator(client, collector):
+
+    user_op = cascade.core.operators["User"]
+    item_op = cascade.core.operators["Item"]
+    event = user_op.dataflows["__init__"].generate_event({"key": "foo", "balance": 100}, key="foo")
+    client.send(event)
+
+    result = wait_for_event_id(event[0]._id, collector)
+    print(result.result.__dict__)
+
+    event = item_op.dataflows["__init__"].generate_event({"key": "fork", "price": 5}, key="fork")
+    client.send(event)
+
+    event = item_op.dataflows["__init__"].generate_event({"key": "spoon", "price": 3}, key="spoon")
+    client.send(event)
+
+    result = wait_for_event_id(event[0]._id, collector)
+    print(result.result.__dict__)
+
+    print(user_op.dataflows["buy_2_items"].to_dot())
+
+    # Buy a fork and spoon
+    event = user_op.dataflows["buy_2_items"].generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo")
+    client.send(event)
+    result = wait_for_event_id(event[0]._id, collector)
+    assert result.result == True
+
+    # Check the balance
+    event = user_op.dataflows["get_balance"].generate_event({}, key="foo")
+    client.send(event)
+    result = wait_for_event_id(event[0]._id, collector)
+    assert result.result == (100 - 5 - 3)
diff --git a/tests/integration/flink/utils.py b/tests/integration/flink/utils.py
new file mode 100644
index 0000000..b0e805b
--- /dev/null
+++ b/tests/integration/flink/utils.py
@@ -0,0 +1,66 @@
+import cascade
+from cascade.dataflow.dataflow import EventResult
+from cascade.dataflow.operator import StatefulOperator, StatelessOperator
+from cascade.runtime.flink_runtime import FlinkClientSync, FlinkRuntime
+from confluent_kafka.admin import AdminClient, NewTopic
+from pyflink.datastream.data_stream import CloseableIterator
+
+
+KAFKA_BROKER = "localhost:9092"
+
+IN_TOPIC = "input-topic"
+OUT_TOPIC = "output-topic"
+INTERNAL_TOPIC = "internal-topic"
+
+def wait_for_event_id(id: int, collector: CloseableIterator) -> EventResult:
+    for record in collector:
+        print(f"Collected record: {record}")
+        if record.event_id == id:
+            return record
+
+
+def init_flink_runtime(import_path: str) -> tuple[FlinkRuntime, FlinkClientSync]:
+    cascade.core.clear()
+    exec(f'import {import_path}')
+    cascade.core.init()
+    runtime = FlinkRuntime(IN_TOPIC, OUT_TOPIC, internal_topic=INTERNAL_TOPIC)
+
+    for op in cascade.core.operators.values():
+        if isinstance(op, StatefulOperator):
+            runtime.add_operator(op)
+        elif isinstance(op, StatelessOperator):
+            runtime.add_stateless_operator(op)
+
+    runtime.init(parallelism=4)
+    return runtime, FlinkClientSync()
+
+def create_topics(*required_topics):
+    if len(required_topics) == 0:
+        required_topics = (IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC)
+
+    conf = {
+        "bootstrap.servers": KAFKA_BROKER
+    }
+
+    admin_client = AdminClient(conf)
+
+    # Define new topics (default: 1 partition, replication factor 1)
+    new_topics = [NewTopic(topic, num_partitions=32, replication_factor=1) for topic in required_topics]
+
+    # Delete topics
+    futures = admin_client.delete_topics(list(required_topics))
+    for topic, future in futures.items():
+        try:
+            future.result()  # Block until the operation is complete
+            print(f"Topic '{topic}' deleted successfully")
+        except Exception as e:
+            print(f"Failed to delete topic '{topic}': {e}")
+
+    # Create topics
+    futures = admin_client.create_topics(new_topics)
+    for topic, future in futures.items():
+        try:
+            future.result()  # Block until the operation is complete
+            print(f"Topic '{topic}' recreated successfully")
+        except Exception as e:
+            print(f"Failed to create topic '{topic}': {e}")
\ No newline at end of file
diff --git a/tests/optimizations/test_parallelize.py b/tests/optimizations/test_parallelize.py
index e429610..941dfce 100644
--- a/tests/optimizations/test_parallelize.py
+++ b/tests/optimizations/test_parallelize.py
@@ -6,13 +6,8 @@
 # import cascade
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../src")))
-
+from cascade.dataflow.optimization.parallelization import parallelize
 from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime
-from dataclasses import dataclass
-from typing import Any
-from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, DataFlow, DataflowRef, Edge, Event, InitClass, InvokeMethod, Node, OpNode, StatelessOpNode
-from
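Note on the `assert len(user_op.dataflows["buy_2_parallel"].entry) == 2` check in test_collect_operator above: `parallelize` builds a write-to-read dependency graph over the dataflow's statements and schedules every node whose inputs are already available (in-degree 0) in the same round, which is why the two independent item calls can become two parallel entry nodes. Below is a minimal, self-contained sketch of that level-by-level idea using plain networkx; the statement strings are made up for illustration and this is not Cascade's `parallelize` implementation (the pre-refactor version of that code is visible in the removed tests/optimizations/test_parallelize.py hunk that follows).

    import networkx as nx

    # Edges point from the statement that writes a variable to the statement
    # that reads it (write -> read dependencies).
    g = nx.DiGraph()
    g.add_edge("x = item1.get_quantity()", "total = x + y")
    g.add_edge("y = item2.get_quantity()", "total = x + y")
    g.add_edge("total = x + y", "return total")

    # Level-by-level scheduling: all nodes with in-degree 0 can run in parallel;
    # remove that level and repeat.
    levels = []
    remaining = g.copy()
    while remaining.number_of_nodes() > 0:
        level = [n for n in remaining.nodes if remaining.in_degree(n) == 0]
        levels.append(level)
        remaining.remove_nodes_from(level)

    print(levels)
    # [['x = item1.get_quantity()', 'y = item2.get_quantity()'],
    #  ['total = x + y'], ['return total']]
    assert len(levels[0]) == 2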
cascade.dataflow.operator import Block, StatefulOperator, StatelessOperator - import cascade def test_parallelize(): @@ -62,90 +57,3 @@ def test_parallelize(): event = test_op.dataflows["get_total_parallel"].generate_event({"item1_0": "fork", "item2_0": "spoon"}) result = client.send(event) assert result == 30 - -@dataclass -class AnnotatedNode: - node: Node - reads: list[str] - writes: list[str] - -import networkx as nx -def parallelize(df: DataFlow): - # create the dependency graph - ans = [] - # since we use SSA, every variable has exactly one node that writes it - write_nodes = {} - graph = nx.DiGraph() - for node in df.nodes.values(): - if isinstance(node, CallEntity): - reads = list(node.variable_rename.values()) - writes = [result] if (result := node.assign_result_to) else [] - elif isinstance(node, CallLocal): - method = df.get_operator().methods[node.method.method_name] - reads = method.var_map_reads - writes = method.var_map_writes - else: - raise ValueError(f"unsupported node type: {type(node)}") - - write_nodes.update({var: node.id for var in writes}) - - ans.append(AnnotatedNode(node, reads, writes)) - graph.add_node(node.id) - - nodes_with_indegree_0 = set(graph.nodes) - n_map = df.nodes - for node in ans: - for read in node.reads: - print(read) - if read in write_nodes: - # "read" will not be in write nodes if it is part of the arguments - # a more thorough implementation would not need the if check, - # and add the arguments as writes to some function entry node - graph.add_edge(write_nodes[read], node.node.id) - try: - nodes_with_indegree_0.remove(node.node.id) - except KeyError: - pass - - updated = DataFlow(df.name, df.op_name) - updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0] - prev_node = None - print(nodes_with_indegree_0) - - while len(nodes_with_indegree_0) > 0: - # remove nodes from graph - children = [] - for node_id in nodes_with_indegree_0: - children.extend(graph.successors(node_id)) - graph.remove_node(node_id) - updated.add_node(n_map[node_id]) - - - # check for new indegree 0 nodes - next_nodes = set() - for child in children: - if graph.in_degree(child) == 0: - next_nodes.add(child) - - if len(nodes_with_indegree_0) > 1: - # TODO: maybe collect node should just infer from it's predecessors? 
- # like it can only have DataFlowNode predecessors - # TODO: rename DataflowNode to EntityCall - collect_node = CollectNode() - for node_id in nodes_with_indegree_0: - if prev_node: - updated.add_edge(Edge(prev_node, n_map[node_id])) - updated.add_edge(Edge(n_map[node_id], collect_node)) - prev_node = collect_node - else: - node_id = nodes_with_indegree_0.pop() - if prev_node: - updated.add_edge(Edge(prev_node, n_map[node_id])) - - prev_node = n_map[node_id] - - nodes_with_indegree_0 = next_nodes - - print(df.to_dot()) - print(updated.to_dot()) - return updated From d3c45fe70a5efcaca6ae0bf655da18e9ffb48201 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 3 Apr 2025 11:32:51 +0200 Subject: [PATCH 12/37] Remove OpNode --- src/cascade/dataflow/dataflow.py | 113 ++---------------- src/cascade/dataflow/operator.py | 3 +- .../dataflow/optimization/parallelization.py | 3 +- .../optimization/test_dead_node_elim.py | 6 +- .../frontend/generator/generate_dataflow.py | 2 +- src/cascade/runtime/flink_runtime.py | 4 +- src/cascade/runtime/python_runtime.py | 2 +- .../dataflow_analysis/test_split_functions.py | 25 +--- tests/integration/common.py | 3 - tests/integration/test_single_entity.py | 19 --- 10 files changed, 17 insertions(+), 163 deletions(-) delete mode 100644 tests/integration/test_single_entity.py diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index cae3356..4b6cb80 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Any, Callable, Iterable, List, Mapping, Optional, Type, Union +from typing import Any, Iterable, List, Mapping, Optional, Union from typing import TYPE_CHECKING import uuid @@ -8,8 +8,6 @@ if TYPE_CHECKING: # Prevent circular imports - from cascade.dataflow.operator import StatelessOperator - from cascade.dataflow.operator import StatefulOperator from cascade.dataflow.operator import Block @@ -58,97 +56,6 @@ def __post_init__(self): def propogate(self, event: 'Event', targets: list['Node'], result: Any, **kwargs) -> list['Event']: pass -@dataclass -class OpNode(Node): - """A node in a `Dataflow` corresponding to a method call of a `StatefulOperator`. - - A `Dataflow` may reference the same entity multiple times. 
- The `StatefulOperator` that this node belongs to is referenced by `entity`.""" - operator: 'StatefulOperator' - method_type: Union[InitClass, InvokeMethod] - read_key_from: str - """Which variable to take as the key for this StatefulOperator""" - - assign_result_to: Optional[str] = field(default=None) - """What variable to assign the result of this node to, if any.""" - is_conditional: bool = field(default=False) - """Whether or not the boolean result of this node dictates the following path.""" - collect_target: Optional['CollectTarget'] = field(default=None) - """Whether the result of this node should go to a CollectNode.""" - - def propogate(self, event: 'Event', targets: List[Node], result: Any) -> list['Event']: - return OpNode.propogate_opnode(self, event, targets, result) - - @staticmethod - def propogate_opnode(node: Union['OpNode', 'StatelessOpNode'], event: 'Event', targets: list[Node], - result: Any) -> list['Event']: - num_targets = 1 if node.is_conditional else len(targets) - - if event.collect_target is not None: - # Assign new collect targets - collect_targets = [ - event.collect_target for i in range(num_targets) - ] - else: - # Keep old collect targets - collect_targets = [node.collect_target for i in range(num_targets)] - - if node.is_conditional: - edges = event.dataflow.nodes[event.target.id].outgoing_edges - true_edges = [edge for edge in edges if edge.if_conditional] - false_edges = [edge for edge in edges if not edge.if_conditional] - if not (len(true_edges) == len(false_edges) == 1): - print(edges) - assert len(true_edges) == len(false_edges) == 1 - target_true = true_edges[0].to_node - target_false = false_edges[0].to_node - - assert len(collect_targets) == 1, "num targets should be 1" - ct = collect_targets[0] - - return [Event( - target_true if result else target_false, - event.variable_map, - event.dataflow, - _id=event._id, - collect_target=ct, - metadata=event.metadata) - ] - - else: - return [Event( - target, - event.variable_map, - event.dataflow, - _id=event._id, - collect_target=ct, - metadata=event.metadata) - - for target, ct in zip(targets, collect_targets)] - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self.entity.__name__}, {self.method_type})" - -@dataclass -class StatelessOpNode(Node): - """A node in a `Dataflow` corresponding to a method call of a `StatelessOperator`. - - A `Dataflow` may reference the same `StatefulOperator` multiple times. 
- The `StatefulOperator` that this node belongs to is referenced by `cls`.""" - operator: 'StatelessOperator' - method_type: InvokeMethod - """Which variable to take as the key for this StatefulOperator""" - - assign_result_to: Optional[str] = None - """What variable to assign the result of this node to, if any.""" - is_conditional: bool = False - """Whether or not the boolean result of this node dictates the following path.""" - collect_target: Optional['CollectTarget'] = None - """Whether the result of this node should go to a CollectNode.""" - - def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['Event']: - return OpNode.propogate_opnode(self, event, targets, result) - @dataclass class DataflowRef: operator_name: str @@ -286,14 +193,17 @@ class DataFlow: collect-- [item1_price, item2_price] -->user2; ``` """ - # TODO: op should not be optional - def __init__(self, name: str, op_name: str=None, args: list[str]=None): + + def __init__(self, name: str, op_name: str, args: Optional[list[str]]=None): self.name: str = name self.adjacency_list: dict[int, list[int]] = {} self.nodes: dict[int, Node] = {} self.entry: List[Node] = [] self.op_name = op_name - self.args = args + if args: + self.args: list[str] = args + else: + self.args = [] def get_operator(self) -> Operator: return cascade.core.operators[self.op_name] @@ -329,11 +239,6 @@ def remove_node(self, node: Node): return # Node doesn't exist in the graph - # if isinstance(node, OpNode) or isinstance(node, StatelessOpNode): - # assert not node.is_conditional, "there's no clear way to remove a conditional node" - # assert not node.assign_result_to, "can't delete node whose result is used" - # assert not node.collect_target, "can't delete node which has a collect_target" - # Find parents (nodes that have edges pointing to this node) parents = [parent_id for parent_id, children in self.adjacency_list.items() if node.id in children] @@ -364,7 +269,6 @@ def remove_node(self, node: Node): child_node = self.nodes[child_id] self.remove_edge(node, child_node) - # Remove the node from the adjacency list and nodes dictionary del self.adjacency_list[node.id] del self.nodes[node.id] @@ -413,8 +317,7 @@ def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None return local_events - - def __repr__(self) -> str: + def __str__(self) -> str: return f"{self.op_name}.{self.name}" @dataclass diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py index fa8074f..7385202 100644 --- a/src/cascade/dataflow/operator.py +++ b/src/cascade/dataflow/operator.py @@ -32,8 +32,7 @@ class Block(ABC): var_map_reads: list[str] name: str function_call: Union[MethodCall, 'StatelessMethodCall'] - # TODO: remove "None" - raw_method_string: str = None + raw_method_string: str def call(self, *args, **kwargs) -> Any: return self.function_call(*args, **kwargs) diff --git a/src/cascade/dataflow/optimization/parallelization.py b/src/cascade/dataflow/optimization/parallelization.py index 0dd2b12..0b26c6f 100644 --- a/src/cascade/dataflow/optimization/parallelization.py +++ b/src/cascade/dataflow/optimization/parallelization.py @@ -182,8 +182,7 @@ from dataclasses import dataclass from typing import Any -from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, DataFlow, DataflowRef, Edge, Event, InitClass, InvokeMethod, Node, OpNode, StatelessOpNode -from cascade.dataflow.operator import Block, StatefulOperator, StatelessOperator +from cascade.dataflow.dataflow import CallEntity, CallLocal, 
CollectNode, DataFlow, Edge, Node @dataclass diff --git a/src/cascade/dataflow/optimization/test_dead_node_elim.py b/src/cascade/dataflow/optimization/test_dead_node_elim.py index 87f03c4..18dbe5e 100644 --- a/src/cascade/dataflow/optimization/test_dead_node_elim.py +++ b/src/cascade/dataflow/optimization/test_dead_node_elim.py @@ -1,6 +1,6 @@ from typing import Any -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode +from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod from cascade.dataflow.operator import StatefulOperator from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination from cascade.dataflow.optimization.dead_node_elim import is_no_op @@ -66,8 +66,8 @@ def user_order_df(): df.entry = n0 return df -df = user_order_df() -user_op.dataflows[df.name] = df +# df = user_order_df() +# user_op.dataflows[df.name] = df def DEPRECATED_test_dead_node_elim(): print(user_op.dataflows[df.name].to_dot()) diff --git a/src/cascade/frontend/generator/generate_dataflow.py b/src/cascade/frontend/generator/generate_dataflow.py index 3bcd62d..a96189e 100644 --- a/src/cascade/frontend/generator/generate_dataflow.py +++ b/src/cascade/frontend/generator/generate_dataflow.py @@ -1,6 +1,6 @@ from cascade.dataflow.operator import Block from cascade.frontend.generator.split_function import SplitFunction -from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, OpNode, InvokeMethod, Edge +from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, InvokeMethod, Edge class GenerateDataflow: diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index e351b97..d674abc 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -13,7 +13,7 @@ from pyflink.datastream import ProcessFunction, StreamExecutionEnvironment from pyflink.datastream.output_tag import OutputTag import pickle -from cascade.dataflow.dataflow import CallLocal, CollectNode, CollectTarget, Event, EventResult, InitClass, InvokeMethod, Node, OpNode, StatelessOpNode +from cascade.dataflow.dataflow import CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod, Node from cascade.dataflow.operator import StatefulOperator, StatelessOperator from confluent_kafka import Producer, Consumer import logging @@ -100,8 +100,6 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): assert(key is not None) if isinstance(event.target.method, InitClass): - # TODO: compile __init__ with only kwargs, and pass the variable_map itself - # otherwise, order of variable_map matters for variable assignment result = self.operator.handle_init_class(**event.variable_map) # Register the created key in FlinkSelectAllOperator diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index 99c52e8..f419c25 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -1,7 +1,7 @@ import threading from typing import List, Union from cascade.dataflow.operator import StatefulOperator, StatelessOperator -from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod, OpNode, StatelessOpNode +from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod from queue import Empty, Queue import time diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py 
b/tests/frontend/dataflow_analysis/test_split_functions.py index b5e4a47..7afef23 100644 --- a/tests/frontend/dataflow_analysis/test_split_functions.py +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -47,27 +47,4 @@ def get_total(item1: Stock, item2: Stock, y: int): df, blocks = sf.build(dataflows, "Test") print(df.to_dot()) - print(blocks) - - - -# [ -# Statement(block_num=0, block=Function get_total in scope Class "Test" in scope Module, targets=[item1_0, item2_0], values=[item1_0, item2_0], remote_call=False, attribute=None), -# Statement(block_num=1, block=Assign: (a_0,) = 10, targets=[a_0], values=[], remote_call=False, attribute=None), -# Statement(block_num=2, block=Assign: (b_0,) = BinOp: a_0 + 3, targets=[b_0], values=[a_0], remote_call=False, attribute=None), -# Statement(block_num=3, block=Assign: (x_0,) = Call: item1_0.get_quantity(()), targets=[x_0], values=[item1_0], remote_call=True, attribute=item1_0.get_quantity), -# Statement(block_num=4, block=Assign: (y_0,) = Call: item2_0.get_quantity(()), targets=[y_0], values=[item2_0], remote_call=True, attribute=item2_0.get_quantity), -# Statement(block_num=5, block=Assign: (total_0,) = Call: Adder.add((x_0, y_0)), targets=[total_0], values=[Adder, x_0, y_0], remote_call=True, attribute=Adder.add), -# Statement(block_num=6, block=, targets=[total_1], values=[a_0, b_0], remote_call=False, attribute=None), -# Statement(block_num=7, block=, targets=[total_2], values=[], remote_call=False, attribute=None), -# Statement(block_num=8, block=, targets=[], values=[total_2], remote_call=False, attribute=None)] - -# [ -# (0, 3), -# (0, 4), -# (3, 5), -# (4, 5), -# (1, 2), -# (1, 6), -# (2, 6), -# (7, 8)] \ No newline at end of file + print(blocks) \ No newline at end of file diff --git a/tests/integration/common.py b/tests/integration/common.py index f7e887c..4afd1e6 100644 --- a/tests/integration/common.py +++ b/tests/integration/common.py @@ -1,6 +1,3 @@ -from typing import Any -from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, Edge, InvokeMethod, OpNode -from cascade.runtime.flink_runtime import StatefulOperator import cascade @cascade.cascade diff --git a/tests/integration/test_single_entity.py b/tests/integration/test_single_entity.py deleted file mode 100644 index d4e77d3..0000000 --- a/tests/integration/test_single_entity.py +++ /dev/null @@ -1,19 +0,0 @@ -# todo: annotate with @cascade.entity -class User: - def __init__(self, key: str, balance: int): - self.key = key - self.balance = balance - - def set_balance(self, balance: int): - self.balance = balance - - def get_balance(self) -> int: - return self.balance - - -def test_single_entity(): - user = User("user", 100) - assert user.get_balance() == 100 - - user.set_balance(10) - assert user.get_balance() == 10 \ No newline at end of file From fabdc18e89bd27b9a1338687fc9b1b0fe092e655 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 3 Apr 2025 14:02:08 +0200 Subject: [PATCH 13/37] Use __dict__ as state instead of class instance --- .../frontend/ast_visitors/replace_name.py | 56 ++++++++++++++++--- src/cascade/runtime/flink_runtime.py | 12 ++-- src/cascade/runtime/python_runtime.py | 2 +- .../frontend/ast_visitors/test_self_rename.py | 31 ++++++++++ .../ast_visitors/test_variable_getter.py | 2 - .../dataflow_analysis/test_entities.py | 9 +-- tests/programs/test_programs.py | 4 +- 7 files changed, 92 insertions(+), 24 deletions(-) create mode 100644 
 tests/frontend/ast_visitors/test_self_rename.py
diff --git a/src/cascade/frontend/ast_visitors/replace_name.py b/src/cascade/frontend/ast_visitors/replace_name.py
index c98bd85..c578c8c 100644
--- a/src/cascade/frontend/ast_visitors/replace_name.py
+++ b/src/cascade/frontend/ast_visitors/replace_name.py
@@ -1,3 +1,4 @@
+from typing import Union
 from klara.core.ssa_visitors import AstVisitor
 from klara.core import nodes
 
@@ -22,21 +23,60 @@ def replace_name(self, node: nodes.Name):
         node.id = self.new
         node.version = -1
 
+    def replace_node(self, parent: nodes.BaseNode, old_node: nodes.BaseNode, new_node: nodes.BaseNode):
+        # get node children
+        for field in parent._fields:
+            attr = getattr(parent, field)
+            if isinstance(attr, (tuple, list)):
+                to_change = None
+                for i, n in enumerate(attr):
+                    if n == old_node:
+                        to_change = i
+
+                if to_change is not None:
+                    if isinstance(attr[i], tuple):
+                        new_attr = list(attr)
+                        new_attr[i] = new_node
+                        attr = tuple(new_attr)
+                    else:
+                        attr[i] = new_node
+                    setattr(parent, field, attr)
+            else:
+                if attr is not None:
+                    if attr == old_node:
+                        setattr(parent, field, new_node)
+                else:
+                    continue
+
+
+    def replace_attribute(self, node: Union[nodes.Attribute, nodes.AssignAttribute]):
+        # change self -> state
+        node.value.id = self.new
+        node.value.version = -1
+
+        # change attribute to subscript
+        new_node = nodes.Subscript(node.lineno, None, node.parent, node.links, version=-1)
+        slice = nodes.Index(new_node.lineno, None, new_node)
+        slice.postinit(nodes.Const(node.attr, slice.lineno, slice.col_offset, slice))
+        new_node.postinit(node.value, slice, node.ctx)
+        assert isinstance(node.parent, nodes.BaseNode)
+        self.replace_node(node.parent, node, new_node)
+
+
     def visit_subscript(self, node: nodes.Subscript):
         # e.g. self_0.data["something"]_0 -> state.data["something"]
         if isinstance(node.value, nodes.Attribute):
-            name = node.value.value
-            if str(name) == self.target:
-                self.replace_name(name)
+            attr = node.value
+            if str(attr.value) == self.target:
+                self.replace_attribute(attr)
             node.version = -1
 
     def visit_assignattribute(self, node: nodes.AssignAttribute):
-        if str(node.value) == self.target :
-            self.replace_name(node.value)
-            node.version = -1
+        if str(node.value) == self.target:
+            self.replace_attribute(node)
 
     def visit_attribute(self, node: nodes.Attribute):
         if str(node.value) == self.target:
-            self.replace_name(node.value)
-            node.version = -1
+            self.replace_attribute(node)
+
diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py
index d674abc..e8f7922 100644
--- a/src/cascade/runtime/flink_runtime.py
+++ b/src/cascade/runtime/flink_runtime.py
@@ -113,7 +113,6 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context):
                 logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Registering key: {register_key_event}")
                 yield register_key_event
 
-            # self.state.update(pickle.dumps(result))
             self.state.update(pickle.dumps(result.__dict__))
 
         elif isinstance(event.target.method, InvokeMethod):
@@ -123,16 +122,12 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context):
                 raise KeyError
             state = pickle.loads(state)
 
-            # TODO: don't create a new class instance, instead use the __dict__ directly in self.state
-            # requires changes in compilation, i.e.
self.balance -> state["balance"] - state = self.operator.entity(**state) result = self.operator.handle_invoke_method(event.target.method, variable_map=event.variable_map, state=state) # TODO: check if state actually needs to be updated if state is not None: - # TODO: "state" should already be the __dict__ itself. - self.state.update(pickle.dumps(state.__dict__)) + self.state.update(pickle.dumps(state)) # Filter targets are used in cases of [hotel for hotel in Hotel.__all__() *if hotel....*] # elif isinstance(event.target.method_type, Filter): # state = pickle.loads(self.state.value()) @@ -579,7 +574,6 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka ) self.stateful_op_streams.append(op_stream) - self.stateless_op_streams = [] for flink_op in self.stateless_operators: tag = stateless_tags[flink_op.operator.name()] @@ -647,6 +641,10 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka else: logger.info("FlinkRuntime starting (sync)") self.env.execute("Cascade: Flink Runtime") + + def close(self): + assert self.env is not None, "FlinkRuntime must first be initialised with `init()`." + self.env.close() class FlinkClientSync: def __init__(self, input_topic="input-topic", output_topic="output-topic", kafka_url="localhost:9092", start_consumer_thread: bool = True): diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index f419c25..6efbd90 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -20,7 +20,7 @@ def process(self, event: Event): if isinstance(event.target.method, InitClass): result = self.operator.handle_init_class(*event.variable_map.values()) - self.states[key] = result + self.states[key] = result.__dict__ elif isinstance(event.target.method, InvokeMethod): state = self.states[key] diff --git a/tests/frontend/ast_visitors/test_self_rename.py b/tests/frontend/ast_visitors/test_self_rename.py new file mode 100644 index 0000000..d047d0c --- /dev/null +++ b/tests/frontend/ast_visitors/test_self_rename.py @@ -0,0 +1,31 @@ +from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState +from cascade.frontend.util import setup_cfg +from cascade.frontend.ast_visitors.variable_getter import VariableGetter +from klara.core import nodes + +def test_replace_self_with_state(): + code = "self.balance = self.balance + 10" + cfg = setup_cfg(code) + ssa_code = cfg.block_list[1].ssa_code + node, = ssa_code.code_list + ReplaceSelfWithState.replace(node) + + assert isinstance(node, nodes.Assign) + assert isinstance(node.targets, list) + assert isinstance(node.value, nodes.BinOp) + assert str(node.targets[0]) == "state['balance']" + assert str(node.value.left) == "state['balance']" + +def test_replace_self_with_state_dict(): + code = "self.data['b'] = self.data['a'] + self.balance" + cfg = setup_cfg(code) + ssa_code = cfg.block_list[1].ssa_code + node, = ssa_code.code_list + ReplaceSelfWithState.replace(node) + + assert isinstance(node, nodes.Assign) + assert isinstance(node.targets, list) + assert isinstance(node.value, nodes.BinOp) + assert str(node.targets[0]) == "state['data']['b']" + assert str(node.value.left) == "state['data']['a']" + assert str(node.value.right) == "state['balance']" \ No newline at end of file diff --git a/tests/frontend/ast_visitors/test_variable_getter.py b/tests/frontend/ast_visitors/test_variable_getter.py index d38553d..5ceaff3 100644 --- a/tests/frontend/ast_visitors/test_variable_getter.py +++ 
b/tests/frontend/ast_visitors/test_variable_getter.py @@ -1,8 +1,6 @@ from cascade.frontend.util import setup_cfg from cascade.frontend.ast_visitors.variable_getter import VariableGetter -from klara.core.tree_rewriter import AstBuilder - def test_variable_getter(): code = "item_price = item.get_price()" diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index 356f99d..ea9ed8d 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -99,7 +99,8 @@ def buy_item(self, item: 'Item') -> bool: df, blocks = sf.build(dataflows, "User") assert len(blocks) == 1 - + func = blocks[0].call + print(blocks[0].raw_method_string) @dataclass class User: @@ -109,10 +110,10 @@ class User: func = blocks[0].call user = User("a", 20) - func({"item_price_0": 10}, user) + func({"item_price_0": 10}, user.__dict__) assert user.balance == 10 - func({"item_price_0": 13}, user) + func({"item_price_0": 13}, user.__dict__) assert user.balance == -3 def test_dict_state(): @@ -149,5 +150,5 @@ class User: print(blocks[0].raw_method_string) user = User("req", {}) - func({"review_id_0": 123}, user) + func({"review_id_0": 123}, user.__dict__) assert user.review_data["review_id"] == 123 diff --git a/tests/programs/test_programs.py b/tests/programs/test_programs.py index d5f25a3..d7abc97 100644 --- a/tests/programs/test_programs.py +++ b/tests/programs/test_programs.py @@ -61,12 +61,12 @@ def test_checkout_item(): event = user_op.dataflows["buy_item"].generate_event({"item_0": "fork"}, key=user.__key__()) result = client.send(event) - assert runtime.statefuloperators["User"].states["test"].balance == 5 + assert runtime.statefuloperators["User"].states["test"]["balance"] == 5 assert result event = user_op.dataflows["buy_item"].generate_event({"item_0": "spoon"}, key=user.__key__()) result = client.send(event) - assert runtime.statefuloperators["User"].states["test"].balance == -15 + assert runtime.statefuloperators["User"].states["test"]["balance"] == -15 assert not result From 850c39c1e42f6f466e1ad90651879bae1d89dc29 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 3 Apr 2025 14:17:46 +0200 Subject: [PATCH 14/37] Test setting __dict__ variables in init method --- deathstar_movie_review/entities/entities.py | 4 ++-- .../frontend/dataflow_analysis/test_entities.py | 16 ++++++---------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/deathstar_movie_review/entities/entities.py b/deathstar_movie_review/entities/entities.py index 424c3f0..028f0f4 100644 --- a/deathstar_movie_review/entities/entities.py +++ b/deathstar_movie_review/entities/entities.py @@ -3,9 +3,9 @@ @cascade class ComposeReview: - def __init__(self, req_id: str, review_data: dict={}, **kwargs): # **args is a temporary hack to allow for creation of composereview on the fly + def __init__(self, req_id: str, **kwargs): # **args is a temporary hack to allow for creation of composereview on the fly self.req_id = req_id - self.review_data = review_data + self.review_data = {} def upload_unique_id(self, review_id: int): self.review_data["review_id"] = review_id diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index ea9ed8d..2495e3c 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -1,17 +1,13 @@ from dataclasses import dataclass from textwrap 
import dedent -import networkx as nx from klara.core.cfg import Cfg from klara.core import nodes from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef -from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder -from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions, GroupStatements -from cascade.frontend.generator.split_function import SplitFunction2, to_entity_call -from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph + +from cascade.frontend.generator.generate_split_functions import GroupStatements from cascade.frontend.util import setup_cfg def test_call_entity(): @@ -141,7 +137,7 @@ def upload_unique_id(self, review_id: int): @dataclass - class User: + class ComposeReview: req_id: str review_data: dict @@ -149,6 +145,6 @@ class User: print(blocks[0].raw_method_string) - user = User("req", {}) - func({"review_id_0": 123}, user.__dict__) - assert user.review_data["review_id"] == 123 + compose_review = ComposeReview("req", {}) + func({"review_id_0": 123}, compose_review.__dict__) + assert compose_review.review_data["review_id"] == 123 From c5b1dea92e3d8dd661db77edf16ac3c6cb88891c Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 3 Apr 2025 16:57:41 +0200 Subject: [PATCH 15/37] include self in writes when used as attribute --- .../frontend/ast_visitors/variable_getter.py | 4 ++++ tests/frontend/ast_visitors/test_variable_getter.py | 13 ++++++++++++- tests/programs/test_programs.py | 3 --- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/cascade/frontend/ast_visitors/variable_getter.py b/src/cascade/frontend/ast_visitors/variable_getter.py index 8a8300b..97b7b8c 100644 --- a/src/cascade/frontend/ast_visitors/variable_getter.py +++ b/src/cascade/frontend/ast_visitors/variable_getter.py @@ -1,4 +1,5 @@ from klara.core.ssa_visitors import AstVisitor +from klara.core import nodes class VariableGetter(AstVisitor): """get all variables (ast.name) from given node, separate by targets and values @@ -21,3 +22,6 @@ def visit_name(self, node): def visit_assignname(self, node): self.targets.append(node) + + def visit_assignattribute(self, node: nodes.AssignAttribute): + self.targets.append(node.value) diff --git a/tests/frontend/ast_visitors/test_variable_getter.py b/tests/frontend/ast_visitors/test_variable_getter.py index 5ceaff3..f59fa5d 100644 --- a/tests/frontend/ast_visitors/test_variable_getter.py +++ b/tests/frontend/ast_visitors/test_variable_getter.py @@ -12,4 +12,15 @@ def test_variable_getter(): values_as_string = [repr(v) for v in variable_getter.values] assert targets_as_string == ['item_price_0'] assert values_as_string == ['item'] - \ No newline at end of file + + +def test_variable_getter_attr(): + code = "self.balance = self.balance + 1" + cfg = setup_cfg(code) + ssa_code = cfg.block_list[1].ssa_code + node, = ssa_code.code_list + variable_getter = VariableGetter.get_variable(node) + targets_as_string = [repr(t) for t in variable_getter.targets] + values_as_string = [repr(v) for v in variable_getter.values] + assert targets_as_string == ['self'] + assert values_as_string == ['self'] \ No newline at end of file diff --git a/tests/programs/test_programs.py b/tests/programs/test_programs.py index d7abc97..53febcd 100644 --- a/tests/programs/test_programs.py +++ b/tests/programs/test_programs.py @@ -38,9 
+38,6 @@ def test_checkout_item(): cascade.core.init() assert cascade.core.registered_classes, "The Cascade module classes should be registered at this point." - for op in cascade.core.operators.values(): - print(op.methods) - runtime, client = init_python_runtime() item_op = cascade.core.operators["Item"] user_op = cascade.core.operators["User"] From 6339f4d00378fba977227a40219d503685328419 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 3 Apr 2025 17:10:00 +0200 Subject: [PATCH 16/37] move test_programs --- test_programs/__init__.py | 0 test_programs/target/__init__.py | 0 .../integration/pyruntime}/checkout_item.py | 0 .../pyruntime}/operator_chaining.py | 0 .../pyruntime}/test_programs.py | 63 ++----------------- .../pyruntime/utils.py} | 19 +++++- tests/programs/README.md | 2 - tests/programs/__init__.py | 0 8 files changed, 23 insertions(+), 61 deletions(-) delete mode 100644 test_programs/__init__.py delete mode 100644 test_programs/target/__init__.py rename {test_programs/target => tests/integration/pyruntime}/checkout_item.py (100%) rename {test_programs/target => tests/integration/pyruntime}/operator_chaining.py (100%) rename tests/{programs => integration/pyruntime}/test_programs.py (54%) rename tests/{programs/util.py => integration/pyruntime/utils.py} (81%) delete mode 100644 tests/programs/README.md delete mode 100644 tests/programs/__init__.py diff --git a/test_programs/__init__.py b/test_programs/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_programs/target/__init__.py b/test_programs/target/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_programs/target/checkout_item.py b/tests/integration/pyruntime/checkout_item.py similarity index 100% rename from test_programs/target/checkout_item.py rename to tests/integration/pyruntime/checkout_item.py diff --git a/test_programs/target/operator_chaining.py b/tests/integration/pyruntime/operator_chaining.py similarity index 100% rename from test_programs/target/operator_chaining.py rename to tests/integration/pyruntime/operator_chaining.py diff --git a/tests/programs/test_programs.py b/tests/integration/pyruntime/test_programs.py similarity index 54% rename from tests/programs/test_programs.py rename to tests/integration/pyruntime/test_programs.py index 53febcd..0c27723 100644 --- a/tests/programs/test_programs.py +++ b/tests/integration/pyruntime/test_programs.py @@ -1,46 +1,19 @@ -import os -import pytest import cascade import sys - -from cascade.dataflow.dataflow import Event from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime -from tests.programs.util import compare_targets_with_expected - - -target_program_relative_path: str = 'test_programs/target' -expected_program_relative_path: str = 'test_programs/expected' +from tests.integration.pyruntime.utils import init_python_runtime -def get_target_file_list(): - target_files: list[str] = os.listdir(target_program_relative_path) - return list(filter(lambda f: f.endswith('.py') and '__init__' not in f, target_files)) - -target_files: list[str] = get_target_file_list() - -# @pytest.mark.parametrize("file_name", target_files) def test_checkout_item(): file_name = "checkout_item.py" - for key in list(sys.modules.keys()): - if key.startswith("test_programs"): - del sys.modules[key] - - cascade.core.clear() # clear cascadeds registerd classes. 
- assert not cascade.core.registered_classes, "Registered classes should be empty before importing a Cascade \ - Module" - # import the module - import_module_name: str = f'test_programs.target.{file_name.strip(".py")}' - exec(f'import {import_module_name}') - - cascade.core.init() - assert cascade.core.registered_classes, "The Cascade module classes should be registered at this point." - runtime, client = init_python_runtime() + runtime, client = init_python_runtime(file_name) item_op = cascade.core.operators["Item"] user_op = cascade.core.operators["User"] + event = item_op.dataflows["__init__"].generate_event({"item_name": "fork", "price": 10}, key="fork") result = client.send(event) assert result.price == 10 @@ -65,26 +38,11 @@ def test_checkout_item(): result = client.send(event) assert runtime.statefuloperators["User"].states["test"]["balance"] == -15 assert not result - def test_operator_chaining(): file_name = "operator_chaining.py" - for key in list(sys.modules.keys()): - if key.startswith("test_programs"): - del sys.modules[key] - - cascade.core.clear() - import_module_name: str = f'test_programs.target.{file_name.strip(".py")}' - exec(f'import {import_module_name}') - cascade.core.init() - - for op in cascade.core.operators.values(): - print(op.methods) - - for df in cascade.core.dataflows.values(): - print(df.to_dot()) - - runtime, client = init_python_runtime() + + runtime, client = init_python_runtime(file_name) a_op = cascade.core.operators["A"] b_op = cascade.core.operators["B"] c_op = cascade.core.operators["C"] @@ -115,14 +73,3 @@ def test_operator_chaining(): event = a_op.dataflows["call_c_thru_b"].generate_event({"b_0": "bbb", "c_0": "ccc"}, key="aaa") result = client.send(event) assert result == 84 - -def init_python_runtime() -> tuple[PythonRuntime, PythonClientSync]: - runtime = PythonRuntime() - for op in cascade.core.operators.values(): - if isinstance(op, StatefulOperator): - runtime.add_operator(op) - elif isinstance(op, StatelessOperator): - runtime.add_stateless_operator(op) - - runtime.run() - return runtime, PythonClientSync(runtime) diff --git a/tests/programs/util.py b/tests/integration/pyruntime/utils.py similarity index 81% rename from tests/programs/util.py rename to tests/integration/pyruntime/utils.py index 0312d40..686a5f9 100644 --- a/tests/programs/util.py +++ b/tests/integration/pyruntime/utils.py @@ -3,8 +3,25 @@ import ast import difflib -import importlib +import cascade +from cascade.dataflow.operator import StatefulOperator, StatelessOperator +from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime +def init_python_runtime(file_name: str) -> tuple[PythonRuntime, PythonClientSync]: + cascade.core.clear() + import_module_name: str = f'tests.integration.pyruntime.{file_name.strip(".py")}' + exec(f'import {import_module_name}') + cascade.core.init() + + runtime = PythonRuntime() + for op in cascade.core.operators.values(): + if isinstance(op, StatefulOperator): + runtime.add_operator(op) + elif isinstance(op, StatelessOperator): + runtime.add_stateless_operator(op) + + runtime.run() + return runtime, PythonClientSync(runtime) # colors red = lambda text: f"\033[38;2;255;0;0m{text}\033[38;2;255;255;255m" diff --git a/tests/programs/README.md b/tests/programs/README.md deleted file mode 100644 index 7c63d04..0000000 --- a/tests/programs/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Test programs -`test_programs.py` scans files in ./target folder, compiles them and tests them to expected. 
\ No newline at end of file diff --git a/tests/programs/__init__.py b/tests/programs/__init__.py deleted file mode 100644 index e69de29..0000000 From 14446caa271b848c6b36861b2c4b27e18f36eec9 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Fri, 4 Apr 2025 11:51:34 +0200 Subject: [PATCH 17/37] Add branching to CFG --- notebooks/dataflow_example.ipynb | 8 +-- src/cascade/core.py | 4 +- src/cascade/descriptors/method_descriptor.py | 8 +-- .../dataflow_graph_builder.py | 60 +++++++++------- .../generator/generate_split_functions.py | 12 ++-- src/cascade/frontend/generator/unparser.py | 15 +++- .../intermediate_representation/__init__.py | 2 +- .../control_flow_graph.py | 71 +++++++++++++++++++ .../statement_level_dataflow_graph.py | 27 ------- .../test_dataflow_graph_builder.py | 58 ++++++++++++++- .../dataflow_analysis/test_split_functions.py | 4 +- .../integration/pyruntime/if_else_branches.py | 48 +++++++++++++ tests/integration/pyruntime/test_programs.py | 25 +++++++ 13 files changed, 264 insertions(+), 78 deletions(-) create mode 100644 src/cascade/frontend/intermediate_representation/control_flow_graph.py delete mode 100644 src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py create mode 100644 tests/integration/pyruntime/if_else_branches.py diff --git a/notebooks/dataflow_example.ipynb b/notebooks/dataflow_example.ipynb index 90b3472..7649417 100644 --- a/notebooks/dataflow_example.ipynb +++ b/notebooks/dataflow_example.ipynb @@ -42,7 +42,7 @@ "from cascade.frontend.dataflow_analysis.class_list_builder import ClassListBuilder\n", "from cascade.frontend.dataflow_analysis.class_wrapper import ClassWrapper\n", "from cascade.frontend.util import setup_cfg, plot_graph_with_color, plot_dataflow_graph\n", - "from cascade.frontend.intermediate_representation import StatementDataflowGraph, DataflowGraph" + "from cascade.frontend.intermediate_representation import ControlFlowGraph, DataflowGraph" ] }, { @@ -97,7 +97,7 @@ "cfg = setup_cfg(example)\n", "class_list: ClassList = ClassListBuilder.build(cfg)\n", "entity_1: ClassWrapper = class_list.get_class_by_name('User')\n", - "dataflow_graph: StatementDataflowGraph = entity_1.methods['checkout']\n", + "dataflow_graph: ControlFlowGraph = entity_1.methods['checkout']\n", "G = dataflow_graph.graph\n", "grouper: GroupDataflowNodes = GroupDataflowNodes(G)\n", "groups = grouper.group_nodes()\n", @@ -125,7 +125,7 @@ "\n", "class_list: ClassList = ClassListBuilder.build(cfg)\n", "entity_1: ClassWrapper = class_list.get_class_by_name('User')\n", - "dataflow_graph: StatementDataflowGraph = entity_1.methods['checkout']\n", + "dataflow_graph: ControlFlowGraph = entity_1.methods['checkout']\n", "G = dataflow_graph.graph\n", "grouper: GroupDataflowNodes = GroupDataflowNodes(G)\n", "groups = grouper.group_nodes()\n", @@ -440,7 +440,7 @@ " cfg = setup_cfg(example)\n", " class_list: ClassList = ClassListBuilder.build(cfg)\n", " entity_1: ClassWrapper = class_list.get_class_by_name('User')\n", - " dataflow_graph: StatementDataflowGraph = entity_1.methods['buy_item']\n", + " dataflow_graph: ControlFlowGraph = entity_1.methods['buy_item']\n", " return dataflow_graph" ] }, diff --git a/src/cascade/core.py b/src/cascade/core.py index 830ebad..461d17c 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -12,7 +12,7 @@ from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions, GroupStatements from cascade.frontend.generator.generate_dataflow import 
GenerateDataflow from cascade.dataflow.dataflow import CallLocal, DataFlow, DataflowRef, InitClass, Operator -from cascade.frontend.intermediate_representation import StatementDataflowGraph +from cascade.frontend.intermediate_representation import ControlFlowGraph from cascade.frontend.generator.build_compiled_method_string import BuildCompiledMethodsString from cascade.frontend.ast_visitors import ExtractTypeVisitor @@ -115,7 +115,7 @@ def get_compiled_methods() -> str: for method_desc in cls_desc.methods_dec: if method_desc.method_name == '__init__': continue - dataflow_graph: StatementDataflowGraph = method_desc.dataflow + dataflow_graph: ControlFlowGraph = method_desc.dataflow instance_type_map: dict[str, str] = ExtractTypeVisitor.extract(method_desc.method_node) split_functions = GenerateSplitFunctions.generate(dataflow_graph, cls_desc.class_name, entities, instance_type_map) df: DataFlow = GenerateDataflow.generate(split_functions, instance_type_map) diff --git a/src/cascade/descriptors/method_descriptor.py b/src/cascade/descriptors/method_descriptor.py index 9f4b4aa..b61df7d 100644 --- a/src/cascade/descriptors/method_descriptor.py +++ b/src/cascade/descriptors/method_descriptor.py @@ -1,7 +1,7 @@ from klara.core import nodes -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder -from cascade.frontend.intermediate_representation import StatementDataflowGraph +from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder +from cascade.frontend.intermediate_representation import ControlFlowGraph class MethodDescriptor: @@ -14,11 +14,11 @@ def __init__( ): self.method_name: str = method_name self.method_node: nodes.FunctionDef = method_node - self.dataflow: StatementDataflowGraph = None + self.dataflow: ControlFlowGraph = None def build_dataflow(self): statements = [self.method_node] + self.method_node.body - dataflow_graph: StatementDataflowGraph = DataflowGraphBuilder.build(statements) + dataflow_graph: ControlFlowGraph = ControlFlowGraphBuilder.build(statements) dataflow_graph.set_name(self.method_name) self.dataflow = dataflow_graph diff --git a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py b/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py index 332d204..e8c7262 100644 --- a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py +++ b/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py @@ -2,37 +2,50 @@ from klara.core.cfg import ModuleLabel, TempAssignBlock -from klara.core.nodes import Name, FunctionDef +from klara.core import nodes -from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph +from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph from cascade.frontend.ast_visitors import ContainsAttributeVisitor, VariableGetter -class DataflowGraphBuilder: +class ControlFlowGraphBuilder: def __init__(self, block_list: list): self.block_list: list = block_list - def extract_statment_list(self): - # TODO: This one should be extended with recursion to handle if/else branches - statements = [] - i = 0 - for b in self.block_list: + def make_cfg(self, blocks: list, i = 0) -> tuple[ControlFlowGraph, int]: + graph = ControlFlowGraph() + for b in blocks: if type(b) in [ModuleLabel, TempAssignBlock]: continue - elif type(b) == FunctionDef: - b: FunctionDef + elif type(b) == nodes.FunctionDef: statement = Statement(i, b) i += 1 args = b.args - function_vars = [Name.quick_build(f'{a.arg}_0') for a in 
args.args] + function_vars = [f'{a.arg}_0' for a in args.args] statement.extend_targets(function_vars) statement.extend_values(function_vars) - statements.append(statement) + graph.append_statement(statement) + elif type(b) == nodes.If: + + # Make subgraph of both branches + subgraph_body, i = self.make_cfg(b.body, i) + subgraph_orelse, i = self.make_cfg(b.orelse, i) + cond = Statement(i, b.test) + print(type(b.test)) + i += 1 + + # Add condition & branches to graph + graph.append_statement(cond) + graph.append_subgraph(cond, subgraph_body, type="True") + graph.append_subgraph(cond, subgraph_orelse, type="False") + + # The next node should connect to both subgraph + graph._last_node = subgraph_body._last_node + subgraph_orelse._last_node else: statement = Statement(i, b) i += 1 - statements.append(statement) + graph.append_statement(statement) variable_getter = VariableGetter.get_variable(b) targets, values = variable_getter.targets, variable_getter.values statement.targets = [t.__repr__() for t in targets] @@ -43,22 +56,15 @@ def extract_statment_list(self): statement.set_remote() statement.set_attribute(attribute) - return statements + + return graph, i - def construct_dataflow_graph(self) -> StatementDataflowGraph: - statements = self.extract_statment_list() - G = nx.DiGraph() - for b1 in statements: - G.add_node(b1) - for b2 in statements: - if b1.block_num != b2.block_num: - targets = set(repr(b) for b in b1.targets) - values = set(repr(b) for b in b2.values) - if targets.intersection(values): - G.add_edge(b1, b2) - return StatementDataflowGraph(G) + def construct_dataflow_graph(self) -> ControlFlowGraph: + graph, _ = self.make_cfg(self.block_list) + print(graph.to_dot()) + return graph @classmethod - def build(cls, block_list: list) -> StatementDataflowGraph: + def build(cls, block_list: list) -> ControlFlowGraph: dataflow_graph_builder = cls(block_list) return dataflow_graph_builder.construct_dataflow_graph() diff --git a/src/cascade/frontend/generator/generate_split_functions.py b/src/cascade/frontend/generator/generate_split_functions.py index eb2f94d..38943ae 100644 --- a/src/cascade/frontend/generator/generate_split_functions.py +++ b/src/cascade/frontend/generator/generate_split_functions.py @@ -6,8 +6,8 @@ from cascade.dataflow.dataflow import DataFlow, DataflowRef, Edge from cascade.dataflow.operator import Block from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder -from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph +from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder +from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph from cascade.frontend.generator.split_function import SplitFunction, SplitFunction2, to_entity_call @@ -15,8 +15,8 @@ class GenerateSplitFunctions: - def __init__(self, dataflow_graph: StatementDataflowGraph, class_name: str, entity_map: dict[str, str]): - self.dataflow_graph: StatementDataflowGraph = dataflow_graph + def __init__(self, dataflow_graph: ControlFlowGraph, class_name: str, entity_map: dict[str, str]): + self.dataflow_graph: ControlFlowGraph = dataflow_graph self.class_name: str = class_name self.entity_map: dict[str, str] = entity_map # {"instance_name": "EntityType"} self.dataflow_node_map = dict() @@ -96,7 +96,7 @@ def get_all_simple_paths(self, G: nx.DiGraph, source: Statement, target: Stateme return 
nx.all_simple_paths(G, source=source, target=target) @classmethod - def generate(cls, dataflow_graph: StatementDataflowGraph, class_name: str, entity_map: dict[str, str]): + def generate(cls, dataflow_graph: ControlFlowGraph, class_name: str, entity_map: dict[str, str]): c = cls(dataflow_graph, class_name, entity_map) c.generate_split_functions() return c.split_functions @@ -113,7 +113,7 @@ def __init__(self, function_def: nodes.FunctionDef): self.function_def = function_def def build_cfg(self): - cfg: StatementDataflowGraph = DataflowGraphBuilder.build([self.function_def] + self.function_def.body) + cfg: ControlFlowGraph = ControlFlowGraphBuilder.build([self.function_def] + self.function_def.body) self.type_map = ExtractTypeVisitor.extract(self.function_def) cfg.name = self.function_def.name diff --git a/src/cascade/frontend/generator/unparser.py b/src/cascade/frontend/generator/unparser.py index e0d0177..76f57f0 100644 --- a/src/cascade/frontend/generator/unparser.py +++ b/src/cascade/frontend/generator/unparser.py @@ -2,8 +2,6 @@ from klara.core import nodes -from cascade.frontend. intermediate_representation import Statement - def unparse(block: RawBasicBlock): match type(block): case nodes.Return: @@ -29,5 +27,18 @@ def unparse(block: RawBasicBlock): return repr(block) case nodes.BinOp: return f'{unparse(block.left)} {block.op} {unparse(block.right)}' + case nodes.Subscript: + return str(block) + case nodes.Const: + return str(block) + case nodes.Compare: + return str(block) + case nodes.Bool: + return repr(block) + case nodes.If: + print(block.test, block.body, block.orelse) + raise NotImplementedError(type(block)) + case nodes.FunctionDef: + return str(block).replace('"', "'") case _: return str(block) diff --git a/src/cascade/frontend/intermediate_representation/__init__.py b/src/cascade/frontend/intermediate_representation/__init__.py index ddb00f3..36d6352 100644 --- a/src/cascade/frontend/intermediate_representation/__init__.py +++ b/src/cascade/frontend/intermediate_representation/__init__.py @@ -1,2 +1,2 @@ from .statement import Statement -from .statement_level_dataflow_graph import StatementDataflowGraph \ No newline at end of file +from .control_flow_graph import ControlFlowGraph \ No newline at end of file diff --git a/src/cascade/frontend/intermediate_representation/control_flow_graph.py b/src/cascade/frontend/intermediate_representation/control_flow_graph.py new file mode 100644 index 0000000..ea05940 --- /dev/null +++ b/src/cascade/frontend/intermediate_representation/control_flow_graph.py @@ -0,0 +1,71 @@ +from dataclasses import dataclass +from typing import Iterable +import networkx as nx + +from cascade.frontend.generator.unparser import unparse +from cascade.frontend.intermediate_representation.statement import Statement + + +@dataclass +class ControlFlowGraph: + """Control Flow Graph represented as a directed graph. + + Nodes are Statements, and edges are either PO/True/False. 
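As a rough sketch (statement numbers are hypothetical), a small `if`/`else` yields one
`True` edge into the body, one `False` edge into the orelse, and plain program-order
(`po`) edges everywhere else; the statement after the branch is reached from both arms:

```
a = 1          # statement 0
if cond:       # statement 1 (the predicate)
    b = 2      # statement 2
else:
    c = 3      # statement 3
d = 4          # statement 4

# edges: 0 -> 1 [po], 1 -> 2 [True], 1 -> 3 [False], 2 -> 4 [po], 3 -> 4 [po]
```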
+ """ + graph: nx.DiGraph + instance_type_map: dict[str, str] = None # {"instance_name": "EntityType"} + method_name: str = None + _last_node: list[Statement] = None + _source_node: Statement = None + + def __init__(self): + self.graph = nx.DiGraph() + self._last_node = [] + + def set_name(self, name: str): + self.name = name + + def append_statement(self, node: Statement): + self.graph.add_node(node) + + if not self._source_node: + self._source_node = node + + for ln in self._last_node: + self.graph.add_edge(ln, node) + self._last_node = [node] + + + def append_subgraph(self, to_node: Statement, subgraph: 'ControlFlowGraph', **edge_attr): + if subgraph.graph.number_of_nodes == 0: + return + for node in subgraph.get_nodes(): + self.graph.add_node(node) + for edge in subgraph.get_edges(): + self.graph.add_edge(edge[0], edge[1]) + assert subgraph._source_node + self.graph.add_edge(to_node, subgraph._source_node, **edge_attr) + + + def get_nodes(self) -> Iterable[Statement]: + return self.graph.nodes + + def get_edges(self) -> Iterable[tuple[int, int]]: + return [(u.block_num, v.block_num) for u, v in self.graph.edges] + + def get_source_node(self) -> Statement: + return self._source_node + + def to_dot(self) -> str: + dot_string = "digraph CFG {\n" + + # Add nodes + for node in self.get_nodes(): + dot_string += f' {node.block_num} [label="{unparse(node.block)}"];\n' + + # Add edges + for source, target, type in self.graph.edges.data('type', default='po'): + dot_string += f' {source.block_num} -> {target.block_num} [label="{type}"];\n' + + dot_string += "}" + return dot_string diff --git a/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py b/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py deleted file mode 100644 index 54e2900..0000000 --- a/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py +++ /dev/null @@ -1,27 +0,0 @@ -from dataclasses import dataclass -from typing import Iterable -import networkx as nx - -from cascade.frontend.intermediate_representation.statement import Statement - - -@dataclass -class StatementDataflowGraph: - """ Statement level dataflow graph. Capturs statement level data dependencies in a nx.DiGraph. 
- The nodes of the graph are Statements - """ - graph: nx.DiGraph - instance_type_map: dict[str, str] = None # {"instance_name": "EntityType"} - method_name: str = None - - def set_name(self, name: str): - self.name = name - - def get_nodes(self) -> Iterable[Statement]: - return self.graph.nodes - - def get_edges(self) -> Iterable[tuple[int, int]]: - return [(u.block_num, v.block_num) for u, v in self.graph.edges] - - def get_source_node(self) -> Statement: - return next(iter(self.get_nodes())) diff --git a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py index 44cfa6f..0f9da5c 100644 --- a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py +++ b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py @@ -3,11 +3,36 @@ from klara.core.cfg import Cfg from klara.core import nodes -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder -from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph +from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder +from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph from cascade.frontend.util import setup_cfg +def test_linear_program(): + program: str = dedent(""" + class Test: + + def get_total(item1: Stock, item2: Stock): + q1 = item1.get_quantity() + q2 = item2.get_quantity() + total = Adder.add(q1, q2) + return total""") + + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + # TODO: check that the produced ssa code made variables for + # - item1.get_quantity() + # - item2.get_quantity() + df: ControlFlowGraph = ControlFlowGraphBuilder.build([get_total] + get_total.body) + for n in df.graph.nodes: + print(n) + for u, v in df.graph.edges: + print(u.block_num, v.block_num) + # print(df.graph.edges) + def test_ssa(): program: str = dedent(""" class Test: @@ -24,6 +49,33 @@ def get_total(item1: Stock, item2: Stock): # TODO: check that the produced ssa code made variables for # - item1.get_quantity() # - item2.get_quantity() - df: StatementDataflowGraph = DataflowGraphBuilder.build([get_total] + get_total.body) + df: ControlFlowGraph = ControlFlowGraphBuilder.build([get_total] + get_total.body) print(df.graph.nodes) print(df.graph.edges) + + +def test_if_else_branches(): + program: str = dedent(""" + class Test: + + def test_branches(item1: Stock, item2: Stock): + q = item1.get_quantity() + cond = q < 10 + if cond: + a = item2.get_quantity() + else: + a = 0 + return a""") + + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + print(blocks) + test_class: nodes.Block = blocks[2] + test: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + # TODO: check that the produced ssa code made variables for + # - item1.get_quantity() + # - item2.get_quantity() + df: ControlFlowGraph = ControlFlowGraphBuilder.build([test] + test.body) + # print(df.graph.nodes) + # print(df.graph.edges) \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py b/tests/frontend/dataflow_analysis/test_split_functions.py index 7afef23..dd62579 100644 --- a/tests/frontend/dataflow_analysis/test_split_functions.py +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -7,10 +7,10 @@ from cascade.dataflow.dataflow import DataFlow, DataflowRef from 
cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder +from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions, GroupStatements from cascade.frontend.generator.split_function import SplitFunction2, to_entity_call -from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph +from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph from cascade.frontend.util import setup_cfg def test_split_functions(): diff --git a/tests/integration/pyruntime/if_else_branches.py b/tests/integration/pyruntime/if_else_branches.py new file mode 100644 index 0000000..024a8b8 --- /dev/null +++ b/tests/integration/pyruntime/if_else_branches.py @@ -0,0 +1,48 @@ +import cascade + +@cascade.cascade +class User: + def __init__(self, username: str, balance: int): + self.username = username + self.balance = balance + + def buy_item_easy(self, item: 'Item') -> int: + item_price = item.get_price() + cond = self.balance - item_price >= 0 + if cond: + self.balance = self.balance - item_price + else: + x = 10 + return self.balance + + # def buy_item_pred(self, item: 'Item') -> int: + # item_price = item.get_price() + # if self.balance - item_price >= 0: + # self.balance = self.balance - item_price + # return self.balance + + # def buy_item_else(self, item: 'Item') -> str: + # item_price = item.get_price() + # if self.balance - item_price >= 0: + # item_price = item.get_price() + # self.balance = self.balance - item_price + # return "item bought" + # else: + # item_price = item.get_price() + # msg = str(item_price) + " is too expensive!" 
+ # return msg + + def __key__(self) -> str: + return self.username + +@cascade.cascade +class Item: + def __init__(self, item_name: str, price: int): + self.item_name = item_name + self.price = price + + def get_price(self) -> int: + return self.price + + def __key__(self) -> str: + return self.item_name diff --git a/tests/integration/pyruntime/test_programs.py b/tests/integration/pyruntime/test_programs.py index 0c27723..b6d4ed9 100644 --- a/tests/integration/pyruntime/test_programs.py +++ b/tests/integration/pyruntime/test_programs.py @@ -73,3 +73,28 @@ def test_operator_chaining(): event = a_op.dataflows["call_c_thru_b"].generate_event({"b_0": "bbb", "c_0": "ccc"}, key="aaa") result = client.send(event) assert result == 84 + +def test_branches(): + file_name = "if_else_branches.py" + + runtime, client = init_python_runtime(file_name) + item_op = cascade.core.operators["Item"] + user_op = cascade.core.operators["User"] + + for df in user_op.dataflows.values(): + print(df.to_dot()) + + event = item_op.dataflows["__init__"].generate_event({"item_name": "fork", "price": 10}, key="fork") + result = client.send(event) + assert result.price == 10 + assert result.item_name == "fork" + + event = item_op.dataflows["__init__"].generate_event({"item_name": "spoon", "price": 20}, key="spoon") + result = client.send(event) + assert result.price == 20 + assert result.__key__() == "spoon" + + event = user_op.dataflows["__init__"].generate_event({"username": "test", "balance": 15}, key="test") + user = client.send(event) + assert user.balance == 15 + assert user.__key__() == "test" \ No newline at end of file From 00220ac53abcb85eaee0859926cd962b237ba926 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Fri, 4 Apr 2025 16:47:54 +0200 Subject: [PATCH 18/37] Add blocked CFG --- deathstar_movie_review/entities/entities.py | 8 + src/cascade/core.py | 39 +-- src/cascade/dataflow/dataflow.py | 43 ++- src/cascade/dataflow/operator.py | 61 +++- .../dataflow_graph_builder.py | 8 +- .../frontend/generator/generate_dataflow.py | 101 +++--- .../generator/generate_split_functions.py | 323 +++++++++++++++--- .../frontend/generator/split_function.py | 46 ++- .../control_flow_graph.py | 44 ++- .../intermediate_representation/statement.py | 1 + .../dataflow_analysis/test_split_functions.py | 116 ++++++- 11 files changed, 569 insertions(+), 221 deletions(-) diff --git a/deathstar_movie_review/entities/entities.py b/deathstar_movie_review/entities/entities.py index 028f0f4..a09c8d7 100644 --- a/deathstar_movie_review/entities/entities.py +++ b/deathstar_movie_review/entities/entities.py @@ -61,6 +61,10 @@ def compose(review: ComposeReview, user: User, title: MovieId, rating: int, text title.upload_movie(review, rating) # text = text[:CHAR_LIMIT] # an operation like this could be reorderd for better efficiency! 
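        # NOTE: the truncation sketched above would read only `text` (nothing written by
        # the remote calls before it), so a reordering pass could hoist it, or fuse it
        # with the text upload below, without changing the result.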
Text.upload_text_2(review, text) + + # TODO: promise pipelining + # uuid = UniqueId.generate() + # review.upload_unique_id(uuid) @cascade class UniqueId(): @@ -71,6 +75,10 @@ def upload_unique_id_2(review: ComposeReview): review_id = 424242 review.upload_unique_id(review_id) + @staticmethod + def generate() -> int: + return 424242 + @cascade class Text(): @staticmethod diff --git a/src/cascade/core.py b/src/cascade/core.py index 461d17c..7294ae2 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -4,17 +4,13 @@ from klara.core import nodes from klara.core.tree_rewriter import AstBuilder from klara.core.cfg import Cfg -from klara.core.node_classes import Arguments -from cascade.dataflow.operator import Block, StatefulOperator, StatelessOperator +from cascade.dataflow.operator import StatefulOperator, StatelessOperator, Operator from cascade.wrappers import ClassWrapper -from cascade.descriptors import ClassDescriptor, MethodDescriptor -from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions, GroupStatements -from cascade.frontend.generator.generate_dataflow import GenerateDataflow -from cascade.dataflow.dataflow import CallLocal, DataFlow, DataflowRef, InitClass, Operator -from cascade.frontend.intermediate_representation import ControlFlowGraph -from cascade.frontend.generator.build_compiled_method_string import BuildCompiledMethodsString -from cascade.frontend.ast_visitors import ExtractTypeVisitor +from cascade.descriptors import ClassDescriptor +from cascade.frontend.generator.generate_split_functions import GroupStatements +from cascade.dataflow.dataflow import CallLocal, DataFlow, DataflowRef, InitClass + def setup_cfg(code: str) -> Cfg: as_tree = AstBuilder().string_build(code) @@ -98,31 +94,6 @@ def init(): for b in blocks: op.methods[b.name] = b - - - -def get_entity_names() -> str: - """Returns a list with the names of all registered entities""" - return [cls.class_desc.class_name for cls in registered_classes] - - -def get_compiled_methods() -> str: - """Returns a list with the compiled methods as string""" - compiled_methods: list[str] = [] - entities: list[str] = get_entity_names() - for cls in registered_classes: - cls_desc: ClassDescriptor = cls.class_desc - for method_desc in cls_desc.methods_dec: - if method_desc.method_name == '__init__': - continue - dataflow_graph: ControlFlowGraph = method_desc.dataflow - instance_type_map: dict[str, str] = ExtractTypeVisitor.extract(method_desc.method_node) - split_functions = GenerateSplitFunctions.generate(dataflow_graph, cls_desc.class_name, entities, instance_type_map) - df: DataFlow = GenerateDataflow.generate(split_functions, instance_type_map) - class_compiled_methods: str = BuildCompiledMethodsString.build(split_functions) - compiled_methods.append(class_compiled_methods) - - return '\n\n'.join(compiled_methods) def clear(): diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 4b6cb80..15ca0a7 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -4,24 +4,10 @@ from typing import TYPE_CHECKING import uuid -import cascade - if TYPE_CHECKING: - # Prevent circular imports - from cascade.dataflow.operator import Block - - -class Operator(ABC): - dataflows: dict[str, 'DataFlow'] - methods: Mapping[str, 'Block'] - - @abstractmethod - def name(self) -> str: - pass + from cascade.frontend.generator.split_function import LocalBlock + from cascade.dataflow.operator import Operator - def get_method_rw_set(self, method_name: str) -> 
tuple[list[str], list[str]]: - method = self.methods[method_name] - return method.var_map_reads, method.var_map_writes @dataclass class InitClass: @@ -56,13 +42,17 @@ def __post_init__(self): def propogate(self, event: 'Event', targets: list['Node'], result: Any, **kwargs) -> list['Event']: pass +class IfNode(Node): + def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: + return super().propogate(event, targets, result, **kwargs) + @dataclass class DataflowRef: operator_name: str dataflow_name: str - def get_dataflow(self) -> 'DataFlow': - return cascade.core.dataflows[self] + # def get_dataflow(self) -> 'DataFlow': + # return cascade_core.dataflows[self] def __repr__(self) -> str: return f"{self.operator_name}.{self.dataflow_name}" @@ -204,9 +194,10 @@ def __init__(self, name: str, op_name: str, args: Optional[list[str]]=None): self.args: list[str] = args else: self.args = [] + self.blocks: dict[str, 'LocalBlock'] = {} - def get_operator(self) -> Operator: - return cascade.core.operators[self.op_name] + # def get_operator(self) -> Operator: + # return cascade.core.operators[self.op_name] def add_node(self, node: Node): """Add a node to the Dataflow graph if it doesn't already exist.""" @@ -214,6 +205,9 @@ def add_node(self, node: Node): self.adjacency_list[node.id] = [] self.nodes[node.id] = node + def add_block(self, block: 'LocalBlock'): + self.blocks[block.get_method_name()] = block + def add_edge(self, edge: Edge): """Add an edge to the Dataflow graph. Nodes that don't exist will be added to the graph automatically.""" self.add_node(edge.from_node) @@ -222,6 +216,11 @@ def add_edge(self, edge: Edge): self.adjacency_list[edge.from_node.id].append(edge.to_node.id) edge.from_node.outgoing_edges.append(edge) + def add_edge_refs(self, u: int, v: int, if_conditional=None): + """Add an edge using node IDs""" + from_node = self.nodes[u] + to_node = self.nodes[v] + self.add_edge(Edge(from_node, to_node, if_conditional=if_conditional)) def remove_edge(self, from_node: Node, to_node: Node): """Remove an edge from the Dataflow graph.""" @@ -357,15 +356,13 @@ class Event(): """A mapping of variable identifiers to values. If `target` is an `OpNode` this map should include the variables needed for that method.""" - dataflow: DataFlow + dataflow: DataflowRef """The Dataflow that this event is a part of. If None, it won't propogate. This might be remove in the future in favour of a routing operator.""" _id: int = field(default=None) # type: ignore (will get updated in __post_init__ if unset) """Unique ID for this event. 
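(A minimal sketch of the id-based wiring added above, with a hypothetical `DataFlow` `df`
and hypothetical block names; `add_edge_refs` just looks both nodes up and delegates to
`add_edge`:)

```
n0 = CallLocal(InvokeMethod("buy_item_easy_0"))
n1 = IfNode()
n2 = CallLocal(InvokeMethod("buy_item_easy_1"))
for n in (n0, n1, n2):
    df.add_node(n)
df.add_edge_refs(n0.id, n1.id)                       # same as df.add_edge(Edge(n0, n1))
df.add_edge_refs(n1.id, n2.id, if_conditional=True)  # edge taken when the predicate is True
```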
Except in `propogate`, this `id` should not be set.""" - # collect_target: Optional[CollectTarget] = field(default=None) - # """Tells each mergenode (key) how many events to merge on""" call_stack: List[CallStackItem] = field(default_factory=list) """Target used when dataflow is done, used for recursive dataflows.""" diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py index 7385202..e9c0439 100644 --- a/src/cascade/dataflow/operator.py +++ b/src/cascade/dataflow/operator.py @@ -1,11 +1,24 @@ from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import Any, Generic, Mapping, Protocol, Type, TypeVar, Union -from cascade.dataflow.dataflow import CallLocal, DataFlow, InitClass, InvokeMethod, Operator +from typing import Any, Generic, Mapping, Protocol, Type, TypeVar, TYPE_CHECKING + +if TYPE_CHECKING: + from cascade.frontend.generator.split_function import LocalBlock + from cascade.dataflow.dataflow import DataFlow, InvokeMethod T = TypeVar('T') +class Operator(ABC): + dataflows: dict[str, 'DataFlow'] + methods: Mapping[str, 'LocalBlock'] + + @abstractmethod + def name(self) -> str: + pass + def get_method_rw_set(self, method_name: str) -> tuple[set[str], set[str]]: + method = self.methods[method_name] + return method.reads, method.writes + class MethodCall(Generic[T], Protocol): """A helper class for type-safety of method signature for compiled methods. @@ -26,16 +39,27 @@ def my_compiled_method(variable_map: dict[str, Any], state: T) -> Any def __call__(self, variable_map: dict[str, Any], state: T) -> Any: ... """@private""" -@dataclass -class Block(ABC): - var_map_writes: list[str] - var_map_reads: list[str] - name: str - function_call: Union[MethodCall, 'StatelessMethodCall'] - raw_method_string: str +# @dataclass +# class LocalBlock: +# var_map_writes: set[str] +# var_map_reads: set[str] +# name: str +# statements: +# function_call: Union[MethodCall, 'StatelessMethodCall'] +# raw_method_string: str + +# def call(self, *args, **kwargs) -> Any: +# return self.function_call(*args, **kwargs) + +# def merge_with(self, other: 'LocalBlock'): +# self.var_map_writes.update(other.var_map_writes) +# self.var_map_reads.update(other.var_map_reads) - def call(self, *args, **kwargs) -> Any: - return self.function_call(*args, **kwargs) +# local_scope = {} +# raw_str = self.to_string() +# exec(self.to_string(), {}, local_scope) +# method_name = self.get_method_name() +# fn = local_scope[method_name] class StatelessMethodCall(Protocol): @@ -55,7 +79,7 @@ class StatefulOperator(Generic[T], Operator): methods, instead reading and modifying the underlying class `T` through a state variable, see `handle_invoke_method`. """ - def __init__(self, entity: Type[T], methods: dict[str, Block], dataflows: dict[str, DataFlow]): + def __init__(self, entity: Type[T], methods: dict[str, 'LocalBlock'], dataflows: dict[str, 'DataFlow']): """Create the StatefulOperator from a class and its compiled methods. Typically, a class could be comprised of split and non-split methods. Take the following example: @@ -114,7 +138,7 @@ def handle_init_class(self, *args, **kwargs) -> T: """Create an instance of the underlying class. Equivalent to `T.__init__(*args, **kwargs)`.""" return self.entity(*args, **kwargs) - def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any], state: T): + def handle_invoke_method(self, method: 'InvokeMethod', variable_map: dict[str, Any], state: T): """Invoke the method of the underlying class. 
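(Roughly, for a hypothetical `StatefulOperator` over the `User` entity and a hypothetical
compiled block name, dispatch now goes through the block's `call_block`:)

```
user = user_op.handle_init_class("alice", 100)          # state of type T
result = user_op.handle_invoke_method(
    InvokeMethod("buy_item_easy_0"),                    # name of a compiled block
    variable_map={"item_price_0": 10},                  # hypothetical SSA variable names
    state=user,
)
# internally: self.methods["buy_item_easy_0"].call_block(variable_map=..., state=user)
```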
The `cascade.dataflow.dataflow.InvokeMethod` object must contain a method identifier @@ -122,7 +146,7 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any The state `T` is passed along to the function, and may be modified. """ - return self.methods[method.method_name].call(variable_map=variable_map, state=state) + return self.methods[method.method_name].call_block(variable_map=variable_map, state=state) def get_method_rw_set(self, method_name: str): return super().get_method_rw_set(method_name) @@ -135,12 +159,13 @@ def name(self): class StatelessOperator(Operator): """A StatelessOperator refers to a stateless function and therefore only has one dataflow.""" - def __init__(self, entity: Type, methods: dict[str, Block], dataflows: dict[str, DataFlow]): + def __init__(self, entity: Type, methods: dict[str, 'LocalBlock'], dataflows: dict[str, 'DataFlow']): self.entity = entity + # TODO: extract this from dataflows.blocks self.methods = methods self.dataflows = dataflows - def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any]): + def handle_invoke_method(self, method: 'InvokeMethod', variable_map: dict[str, Any]): """Invoke the method of the underlying class. The `cascade.dataflow.dataflow.InvokeMethod` object must contain a method identifier @@ -148,7 +173,7 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any The state `T` is passed along to the function, and may be modified. """ - return self.methods[method.method_name].call(variable_map=variable_map, state=None) + return self.methods[method.method_name].call_block(variable_map=variable_map, state=None) def get_method_rw_set(self, method_name: str): return super().get_method_rw_set(method_name) diff --git a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py b/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py index e8c7262..5cf3f22 100644 --- a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py +++ b/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py @@ -31,14 +31,13 @@ def make_cfg(self, blocks: list, i = 0) -> tuple[ControlFlowGraph, int]: # Make subgraph of both branches subgraph_body, i = self.make_cfg(b.body, i) subgraph_orelse, i = self.make_cfg(b.orelse, i) - cond = Statement(i, b.test) - print(type(b.test)) + cond = Statement(i, b.test, is_predicate=True) i += 1 # Add condition & branches to graph graph.append_statement(cond) - graph.append_subgraph(cond, subgraph_body, type="True") - graph.append_subgraph(cond, subgraph_orelse, type="False") + graph.append_subgraph(cond, subgraph_body, type=True) + graph.append_subgraph(cond, subgraph_orelse, type=False) # The next node should connect to both subgraph graph._last_node = subgraph_body._last_node + subgraph_orelse._last_node @@ -61,7 +60,6 @@ def make_cfg(self, blocks: list, i = 0) -> tuple[ControlFlowGraph, int]: def construct_dataflow_graph(self) -> ControlFlowGraph: graph, _ = self.make_cfg(self.block_list) - print(graph.to_dot()) return graph @classmethod diff --git a/src/cascade/frontend/generator/generate_dataflow.py b/src/cascade/frontend/generator/generate_dataflow.py index a96189e..1a30b72 100644 --- a/src/cascade/frontend/generator/generate_dataflow.py +++ b/src/cascade/frontend/generator/generate_dataflow.py @@ -1,60 +1,59 @@ -from cascade.dataflow.operator import Block -from cascade.frontend.generator.split_function import SplitFunction -from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, 
InvokeMethod, Edge +# from cascade.frontend.generator.split_function import LocalBlock, SplitFunction +# from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, InvokeMethod, Edge -class GenerateDataflow: - """ Generates dataflow - """ +# class GenerateDataflow: +# """ Generates dataflow +# """ - def __init__(self, split_functions: list[SplitFunction], instance_type_map: dict[str, str], method_name, op_name, args): - #TODO: add buildcontext that contains class name and target method - self.split_functions = split_functions - self.df = DataFlow(method_name, op_name, args) - self.blocks: list[Block] = [] - self.instance_type_map = instance_type_map +# def __init__(self, split_functions: list[SplitFunction], instance_type_map: dict[str, str], method_name, op_name, args): +# #TODO: add buildcontext that contains class name and target method +# self.split_functions = split_functions +# self.df = DataFlow(method_name, op_name, args) +# self.blocks: list[LocalBlock] = [] +# self.instance_type_map = instance_type_map - def generate_dataflow(self): - self.extract_remote_method_calls() - self.build_dataflow() +# def generate_dataflow(self): +# self.extract_remote_method_calls() +# self.build_dataflow() - def build_dataflow(self): - """ Every remote function invocation should add the node - """ - nodes = [] - for split in self.split_functions: - node = CallLocal(InvokeMethod(split.method_name)) - self.df.add_node(node) - nodes.append([node]) +# def build_dataflow(self): +# """ Every remote function invocation should add the node +# """ +# nodes = [] +# for split in self.split_functions: +# node = CallLocal(InvokeMethod(split.method_name)) +# self.df.add_node(node) +# nodes.append([node]) - if split.remote_calls: - # TODO: instance_name -> correct entity (maybe using buildcontext/ instance type map) - next_nodes = [] - for remote in split.remote_calls: - df = DataflowRef(self.instance_type_map[remote.instance_name], remote.attribute) - args = df.get_dataflow.args - # TODO: proper variable renaming - vars = {arg: arg for arg in args} - call = CallEntity(df, vars, assign_result_to=remote.target) - next_nodes.append(call) - nodes.append(next_nodes) +# if split.remote_calls: +# # TODO: instance_name -> correct entity (maybe using buildcontext/ instance type map) +# next_nodes = [] +# for remote in split.remote_calls: +# df = DataflowRef(self.instance_type_map[remote.instance_name], remote.attribute) +# args = df.get_dataflow.args +# # TODO: proper variable renaming +# vars = {arg: arg for arg in args} +# call = CallEntity(df, vars, assign_result_to=remote.target) +# next_nodes.append(call) +# nodes.append(next_nodes) - self.df.entry = nodes[0][0] - for i in range(len(nodes)-1): - # TODO: add merge nodes - prev_nodes = nodes[i] - next_nodes = nodes[i+1] - for n in prev_nodes: - for v in next_nodes: - # TODO: Add variable map (think that should be the aggregation of the targets) - self.df.add_edge(Edge(n, v)) +# self.df.entry = nodes[0][0] +# for i in range(len(nodes)-1): +# # TODO: add merge nodes +# prev_nodes = nodes[i] +# next_nodes = nodes[i+1] +# for n in prev_nodes: +# for v in next_nodes: +# # TODO: Add variable map (think that should be the aggregation of the targets) +# self.df.add_edge(Edge(n, v)) - def extract_remote_method_calls(self): - for split in self.split_functions: - split.extract_remote_method_calls() +# def extract_remote_method_calls(self): +# for split in self.split_functions: +# split.extract_remote_method_calls() - @classmethod - def generate(cls, 
split_functions: list[SplitFunction], instance_type_map: dict[str, str], method_name, op_name, args) -> tuple[DataFlow, list[Block]]: - c = cls(split_functions, instance_type_map, method_name, op_name, args) - c.generate_dataflow() - return c.df, c.blocks \ No newline at end of file +# @classmethod +# def generate(cls, split_functions: list[SplitFunction], instance_type_map: dict[str, str], method_name, op_name, args) -> tuple[DataFlow, list[LocalBlock]]: +# c = cls(split_functions, instance_type_map, method_name, op_name, args) +# c.generate_dataflow() +# return c.df, c.blocks \ No newline at end of file diff --git a/src/cascade/frontend/generator/generate_split_functions.py b/src/cascade/frontend/generator/generate_split_functions.py index 38943ae..a9a278e 100644 --- a/src/cascade/frontend/generator/generate_split_functions.py +++ b/src/cascade/frontend/generator/generate_split_functions.py @@ -3,12 +3,11 @@ import networkx as nx -from cascade.dataflow.dataflow import DataFlow, DataflowRef, Edge -from cascade.dataflow.operator import Block +from cascade.dataflow.dataflow import DataFlow, DataflowRef, Edge, IfNode, Node from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph -from cascade.frontend.generator.split_function import SplitFunction, SplitFunction2, to_entity_call +from cascade.frontend.generator.split_function import SplitFunction, LocalBlock, to_entity_call from klara.core import nodes @@ -102,6 +101,108 @@ def generate(cls, dataflow_graph: ControlFlowGraph, class_name: str, entity_map: return c.split_functions +def split_cfg(blocked_statement_graph: nx.DiGraph) -> nx.DiGraph: + pass + +def blocked_cfg(statement_graph: nx.DiGraph, entry: Statement) -> nx.DiGraph: + """Transform a cfg (digraph of Statements) into a blocked version, i.e. a + digraph of tuple(Statements). This pass blocks together the body and orelse + branches of if blocks, grouping them together. + This pass treats remote calls as any other statement. 
+ + For example, take the cfg of the following program: + + ``` + a = 10 + b = 20 + if x: + c = 30 + d = 20 + else: + e = 10 + f = 10 + ``` + + it will get split into the following blocks: + + ``` + block 1: + a = 10 + b = 20 + if x: + + block 2: + c = 30 + d = 20 + + block 3: + e = 10 + + block 4: + f = 10 + ``` + """ + + + grouped_statements = [entry] + + succ = list(statement_graph.successors(entry)) + while len(succ) == 1: + if len(list(statement_graph.predecessors(succ[0]))) > 1: + break + grouped_statements.append(succ[0]) + succ = list(statement_graph.successors(succ[0])) + + + graph = nx.DiGraph() + + if len(succ) == 0 or len(succ) == 1: + last_node = tuple(grouped_statements) + graph.add_node(last_node) + return graph + if len(succ) == 2: + if len(grouped_statements) > 1: + before_if, last_node = tuple(grouped_statements[:-1]), tuple([grouped_statements[-1]]) + graph.add_edge(before_if, last_node) + else: + last_node = tuple(grouped_statements) + graph.add_node(last_node) + # TODO: check that then corresponds to "true" path + first_then, first_orelse = succ + then_blocked_graph = blocked_cfg(statement_graph, first_then) + orelse_blocked_graph = blocked_cfg(statement_graph, first_orelse) + last_then = list(then_blocked_graph.nodes)[-1] + last_orelse = list(orelse_blocked_graph.nodes)[-1] + + # check the first node after completed + succ_then = list(statement_graph.successors(last_then[-1])) + succ_orelse = list(statement_graph.successors(last_orelse[-1])) + assert len(succ_then) == 1 + assert len(succ_orelse) == 1 + assert succ_orelse[0] == succ_then[0] + + first_finally = succ_orelse[0] + finally_graph = blocked_cfg(statement_graph, first_finally) + + graph.add_edges_from(then_blocked_graph.edges()) + graph.add_edges_from(orelse_blocked_graph.edges()) + graph.add_edges_from(finally_graph.edges()) + + + first_then = list(then_blocked_graph.nodes)[0] + first_orelse = list(orelse_blocked_graph.nodes)[0] + first_finally = list(finally_graph.nodes)[0] + + graph.add_edge(last_node, first_then) + graph.add_edge(last_node, first_orelse) + graph.add_edge(last_then, first_finally) + graph.add_edge(last_orelse, first_finally) + return graph + else: + raise ValueError(succ) + + + class GroupStatements: # todo: cfg should be control flow graph, statements should also be a graph @@ -116,83 +217,195 @@ def build_cfg(self): cfg: ControlFlowGraph = ControlFlowGraphBuilder.build([self.function_def] + self.function_def.body) self.type_map = ExtractTypeVisitor.extract(self.function_def) cfg.name = self.function_def.name - - statements = list(cfg.get_nodes()) - statements.sort(key=lambda s: s.block_num) - self.statements = statements # TODO: for more complex control flow, use CFG structure instead + for n in cfg.get_nodes(): + print(n) + # statements = list(cfg.get_nodes()) + # statements.sort(key=lambda s: s.block_num) + # self.statements = statements # TODO: for more complex control flow, use CFG structure instead self._grouped_statements: List[List[Statement]] = [] self.cfg = cfg + self.blocked_cfg = blocked_cfg(cfg.graph, cfg.get_single_source()) + + def build_df(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> DataFlow: + entry_node: Statement = self.cfg.get_source_nodes()[0] + assert type(entry_node.block) == nodes.FunctionDef + self.cfg.remove_node(entry_node) + + df_ref = DataflowRef(op_name, self.cfg.name) + df = dataflows[df_ref] + + node_id_map = {} + + block_num = 0 + for statement_block in self.blocked_cfg.nodes: + if len(statement_block) == 1 and 
statement_block[0].is_remote(): + node = to_entity_call(statement_block[0], self.type_map, dataflows) + elif len(statement_block) == 1 and statement_block[0].is_predicate: + node = IfNode() + else: + block = LocalBlock(list(statement_block), self.cfg.name, block_num, op_name) + block_num += 1 + node = block.to_node() + df.add_block(block) + node_id_map[statement_block] = node.id + df.add_node(node) + + for source, target, if_result in self.blocked_cfg.edges.data('type', default=None): + source_id = node_id_map[source] + target_id = node_id_map[target] + df.add_edge_refs(source_id, target_id, if_result) + + return df - def generate_grouped_statements(self) -> List[List[Statement]]: - entry_node: Statement = self.statements[0] + def build_df_old(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> DataFlow: + entry_node: Statement = self.cfg.get_source_nodes()[0] assert type(entry_node.block) == nodes.FunctionDef - grouped_statements = [] - continuation = self.statements[1:] - while len(continuation) > 0: - first_half, continuation = self.split_statements(continuation) - grouped_statements.append(first_half) + df_ref = DataflowRef(op_name, self.cfg.name) + df = dataflows[df_ref] + + last_node = None + block_num = 0 + + while len(self.cfg.graph) > 0: + print(df.to_dot()) + + source = self.cfg.get_single_source() + if source is not None and source.is_predicate: + node = IfNode() + self.cfg.remove_node(source) + else: + group = self.split_graph(self.cfg) + + if len(group) == 1 and group[0].is_remote(): + # Entity call + node = to_entity_call(group[0], self.type_map, dataflows) + else: + # Group statements together, into a block + block = LocalBlock(group, self.cfg.name, block_num, op_name) + block_num += 1 + node = block.to_node() + print(block.to_string()) + df.blocks[block.get_method_name()] = block + + if last_node == None: + last_node = node + df.add_node(node) + df.entry = [node] + else: + df.add_edge(Edge(last_node, node)) + last_node = node + + return df + + def split_graph(self, graph: ControlFlowGraph) -> list[Statement]: + if len(graph.graph) == 0: + return [] + + source = graph.get_source_nodes()[0] + if source.is_remote(): + graph.remove_node(source) + return [source] + + # find the next remote call + local_group = [source] + node = graph.get_single_successor(source) + graph.remove_node(source) - self._grouped_statements = grouped_statements - return grouped_statements + while node is not None and not node.is_remote() and not node.is_predicate: + if len(list(graph.graph.predecessors(node))) > 1: + break + local_group.append(node) + succ = graph.get_single_successor(node) + graph.remove_node(node) + node = succ + + return local_group - def split_statements(self, statements: list[Statement]) -> tuple[list[Statement], list[Statement]]: - """ + """ Split a list of statements, by grouping together statements that are not remote calls. + The graph becomes a subgraph, with the statments removed. As an example, suppose r and s are both statements, where r is a remote call and s is not. 
Here is how the list gets split: [r, s, s, r, s] -> [r] + [s, s, r, s] + [r, r, s, r, s] -> [r] + [r, s, r, s] [s, s, r, s, s] -> [s, s] + [r, s, s] + [s, r, r, s, s] -> [s] + [r, r, s, s] [s, s, s] -> [s, s, s] + [] """ - assert len(statements) > 0 - if statements[0].is_remote(): - return [statements[0]], statements[1:] + # def generate_grouped_statements(self) -> List[List[Statement]]: + # entry_node: Statement = self.statements[0] + # assert type(entry_node.block) == nodes.FunctionDef + + # grouped_statements = [] + # continuation = self.statements[1:] + # while len(continuation) > 0: + # first_half, continuation = self.split_statements(continuation) + # grouped_statements.append(first_half) + + # self._grouped_statements = grouped_statements + # return grouped_statements + + # def split_statements(self, statements: list[Statement]) -> tuple[list[Statement], list[Statement]]: + # """ + # Split a list of statements, by grouping together statements that are not remote calls. + + # As an example, suppose r and s are both statements, where r is a remote call and s is not. + + # Here is how the list gets split: + # [r, s, s, r, s] -> [r] + [s, s, r, s] + # [s, s, r, s, s] -> [s, s] + [r, s, s] + # [s, s, s] -> [s, s, s] + [] + # """ + # assert len(statements) > 0 + + # if statements[0].is_remote(): + # return [statements[0]], statements[1:] - # find the next remote call - i = 0 - first_half = [] - while i < len(statements) and not statements[i].is_remote(): - first_half.append(statements[i]) - i += 1 + # # find the next remote call + # i = 0 + # first_half = [] + # while i < len(statements) and not statements[i].is_remote(): + # first_half.append(statements[i]) + # i += 1 - continuation = statements[i:] - return first_half, continuation + # continuation = statements[i:] + # return first_half, continuation - def build(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> tuple[DataFlow, List[Block]]: - self.build_cfg() + # def build(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> tuple[DataFlow, List[LocalBlock]]: + # self.build_cfg() - self.generate_grouped_statements() + # self.generate_grouped_statements() - blocks = [] - block_num = 0 + # blocks = [] + # block_num = 0 - df_ref = DataflowRef(op_name, self.cfg.name) - df = dataflows[df_ref] + # df_ref = DataflowRef(op_name, self.cfg.name) + # df = dataflows[df_ref] - last_node = None - for split in self._grouped_statements: - if len(split) == 1 and split[0].is_remote(): - # Entity call - node = to_entity_call(split[0], self.type_map, dataflows) - else: - # Group statements together, into a block - s = SplitFunction2(split, self.cfg.name, block_num, op_name) - block_num += 1 - node, block = s.to_block() - blocks.append(block) + # last_node = None + # for split in self._grouped_statements: + # if len(split) == 1 and split[0].is_remote(): + # # Entity call + # node = to_entity_call(split[0], self.type_map, dataflows) + # else: + # # Group statements together, into a block + # block = LocalBlock(split, self.cfg.name, block_num, op_name) + # block_num += 1 + # node = block.to_node() + # blocks.append(block) - if last_node == None: - last_node = node - df.add_node(node) - df.entry = [node] - else: - df.add_edge(Edge(last_node, node)) - last_node = node + # if last_node == None: + # last_node = node + # df.add_node(node) + # df.entry = [node] + # else: + # df.add_edge(Edge(last_node, node)) + # last_node = node - return df, blocks + # return df, blocks diff --git a/src/cascade/frontend/generator/split_function.py 
b/src/cascade/frontend/generator/split_function.py index e0618d5..e10587d 100644 --- a/src/cascade/frontend/generator/split_function.py +++ b/src/cascade/frontend/generator/split_function.py @@ -1,19 +1,20 @@ from textwrap import indent from dataclasses import dataclass, field -from typing import Union +from typing import Any, Union, TYPE_CHECKING -from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, InvokeMethod -from cascade.dataflow.operator import Block from cascade.frontend.util import to_camel_case from cascade.frontend.intermediate_representation import Statement from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState from cascade.frontend.generator.unparser import unparse from cascade.frontend.generator.remote_call import RemoteCall +from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, InvokeMethod from klara.core.cfg import RawBasicBlock from klara.core import nodes -from klara.core.node_classes import Name + +if TYPE_CHECKING: + from cascade.dataflow.operator import MethodCall, StatelessMethodCall @dataclass class SplitFunction: @@ -111,16 +112,16 @@ def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: di return CallEntity(dataflow, {a: b for a, b in zip(df_args, args, strict=True)}, assign_result_to=assign,keyby=key) -class SplitFunction2: +class LocalBlock: def __init__(self, statements: list[Statement], method_base_name: str, block_num: int, class_name: str): assert len(statements) > 0 # A block of statements should have no remote calls assert all([not s.is_remote() for s in statements]) - self.statements = statements - self.method_base_name = method_base_name - self.class_name = class_name - self.block_num = block_num + self.statements: list[Statement] = statements + self.method_base_name: str = method_base_name + self.block_num: int = block_num + self.class_name: str = class_name writes, reads = set(), set() for s in statements: @@ -142,17 +143,28 @@ def __init__(self, statements: list[Statement], method_base_name: str, block_num # writes.update - self.reads = reads - self.writes = writes - + self.reads: set[str] = reads + self.writes: set[str] = writes + self.function: Union['MethodCall', 'StatelessMethodCall'] = None + self.compile_function() - def to_block(self) -> tuple[CallLocal, Block]: + def call_block(self, *args, **kwargs) -> Any: + assert self.function is not None + return self.function(*args, **kwargs) + + def compile_function(self): local_scope = {} - raw_str = self.to_string() exec(self.to_string(), {}, local_scope) method_name = self.get_method_name() - fn = local_scope[method_name] - return CallLocal(InvokeMethod(method_name)), Block(list(self.writes), list(self.reads), method_name, fn, raw_str) + self.function = local_scope[method_name] + + def merge_with(self, other: 'LocalBlock'): + self.reads.update(other.reads) + self.writes.update(other.writes) + self.compile_function() + + def to_node(self) -> CallLocal: + return CallLocal(InvokeMethod(self.get_method_name())) def get_method_name(self): return f"{self.method_base_name}_{self.block_num}" @@ -180,6 +192,8 @@ def body_to_string(self) -> str: block: RawBasicBlock = statement.block if type(block) == nodes.FunctionDef: continue + + # TODO: do this in preprocessing ReplaceSelfWithState.replace(block) body.append(unparse(block)) diff --git a/src/cascade/frontend/intermediate_representation/control_flow_graph.py b/src/cascade/frontend/intermediate_representation/control_flow_graph.py index ea05940..296e258 
100644 --- a/src/cascade/frontend/intermediate_representation/control_flow_graph.py +++ b/src/cascade/frontend/intermediate_representation/control_flow_graph.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Iterable +from typing import Iterable, Optional import networkx as nx from cascade.frontend.generator.unparser import unparse @@ -16,10 +16,11 @@ class ControlFlowGraph: instance_type_map: dict[str, str] = None # {"instance_name": "EntityType"} method_name: str = None _last_node: list[Statement] = None - _source_node: Statement = None + _sources: list[Statement] = None def __init__(self): self.graph = nx.DiGraph() + self._sources = [] self._last_node = [] def set_name(self, name: str): @@ -28,8 +29,8 @@ def set_name(self, name: str): def append_statement(self, node: Statement): self.graph.add_node(node) - if not self._source_node: - self._source_node = node + if len(self._sources) == 0: + self._sources = [node] for ln in self._last_node: self.graph.add_edge(ln, node) @@ -41,11 +42,36 @@ def append_subgraph(self, to_node: Statement, subgraph: 'ControlFlowGraph', **ed return for node in subgraph.get_nodes(): self.graph.add_node(node) - for edge in subgraph.get_edges(): + for edge in subgraph.graph.edges: self.graph.add_edge(edge[0], edge[1]) - assert subgraph._source_node - self.graph.add_edge(to_node, subgraph._source_node, **edge_attr) + assert len((s:=subgraph.get_source_nodes())) == 1 + self.graph.add_edge(to_node, s[0], **edge_attr) + def remove_node(self, node: Statement): + """Remove a node and it's adjacent edges""" + if node == self.get_single_source(): + succ = list(self.graph.successors(node)) + # assert len(succ) <= 1, "Can't remove node with more than one successor" + self._sources = succ + if node == self._last_node: + raise NotImplementedError("Update last node") + + self.graph.remove_node(node) + + def get_single_source(self,) -> Optional[Statement]: + """Get the source of this CFG. Returns None if there are 0 or 2+ sources.""" + if len(self._sources) == 1: + return self._sources[0] + else: + return None + + def get_single_successor(self, node: Statement) -> Optional[Statement]: + """Get the successor of this node. 
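(These helpers exist mainly so the splitter in `generate_split_functions.py` can peel a
linear run of statements off the front of the graph; a rough sketch of that loop:)

```
node = cfg.get_single_source()
group = []
while node is not None and not node.is_remote() and not node.is_predicate:
    group.append(node)
    nxt = cfg.get_single_successor(node)
    cfg.remove_node(node)
    node = nxt
# `group` now holds a maximal chain of local, non-branching statements
```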
Returns None if there are 0 or 2+ successors.""" + succ = list(self.graph.successors(node)) + if len(succ) == 1: + return succ[0] + else: + return None def get_nodes(self) -> Iterable[Statement]: return self.graph.nodes @@ -53,8 +79,8 @@ def get_nodes(self) -> Iterable[Statement]: def get_edges(self) -> Iterable[tuple[int, int]]: return [(u.block_num, v.block_num) for u, v in self.graph.edges] - def get_source_node(self) -> Statement: - return self._source_node + def get_source_nodes(self) -> list[Statement]: + return self._sources def to_dot(self) -> str: dot_string = "digraph CFG {\n" diff --git a/src/cascade/frontend/intermediate_representation/statement.py b/src/cascade/frontend/intermediate_representation/statement.py index e20db42..a8e4783 100644 --- a/src/cascade/frontend/intermediate_representation/statement.py +++ b/src/cascade/frontend/intermediate_representation/statement.py @@ -10,6 +10,7 @@ class Statement: targets: list[str] = field(default_factory=list) values: list[str] = field(default_factory=list) remote_call: bool = False + is_predicate: bool = False attribute: Attribute = None def extend_targets(self, new_targets: list[str]): diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py b/tests/frontend/dataflow_analysis/test_split_functions.py index dd62579..ad5659d 100644 --- a/tests/frontend/dataflow_analysis/test_split_functions.py +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -6,14 +6,11 @@ from klara.core import nodes from cascade.dataflow.dataflow import DataFlow, DataflowRef -from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder -from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions, GroupStatements -from cascade.frontend.generator.split_function import SplitFunction2, to_entity_call -from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph +from cascade.frontend.generator.generate_split_functions import GroupStatements, blocked_cfg +from cascade.frontend.intermediate_representation.control_flow_graph import ControlFlowGraph from cascade.frontend.util import setup_cfg -def test_split_functions(): +def test_entity_calls(): program: str = dedent(""" class Test: @@ -42,9 +39,108 @@ def get_total(item1: Stock, item2: Stock, y: int): } - # TODO: Check - statements = sf.generate_grouped_statements() + # TODO: Check # entity calls, # of blocks, # of local calls - df, blocks = sf.build(dataflows, "Test") + df = sf.build_df(dataflows, "Test") print(df.to_dot()) - print(blocks) \ No newline at end of file + for block in df.blocks.values(): + print(block.to_string()) + +def test_branching(): + program: str = dedent(""" + class Test: + def test_branching(self) -> int: + pre = 10 + if True: + then = 20 + and_then = 10 + else: + orelse = 30 + orelser = 30 + post = 40 + return 50""") + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = GroupStatements(get_total) + sf.build_cfg() + print(sf.cfg.to_dot()) + new = blocked_cfg(sf.cfg.graph, sf.cfg.get_single_source()) + for node in new.nodes: + for s in node: + print(s.block_num, end=" ") + print() + for edge in new.edges: + for s in edge[0]: + print(s.block_num, end=" ") + print("->", end= " ") + for s in edge[1]: + print(s.block_num, end=" ") + print() + + dataflows = { + DataflowRef("Test", 
"test_branching"): DataFlow("test_branching", "Test", []) + } + + + df = sf.build_df(dataflows, "Test") + print(df.to_dot()) + for block in df.blocks.values(): + print(block.to_string()) + assert len(df.blocks) == 4 + assert len(df.nodes) == 5 + +def test_branching_with_entity_calls(): + program: str = dedent(""" + class Test: + def test_branching(self) -> int: + pre = 10 + if True: + then = 10 + and_then = 10 + else: + orelse = 30 + y = 10 + orelser = Entity.call() + orelserer = 40 + x = 10 + post = 40 + return 50""") + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = GroupStatements(get_total) + sf.build_cfg() + print(sf.cfg.to_dot()) + new = blocked_cfg(sf.cfg.graph, sf.cfg.get_single_source()) + for node in new.nodes: + for s in node: + print(s.block_num, end=" ") + print() + for edge in new.edges: + for s in edge[0]: + print(s.block_num, end=" ") + print("->", end= " ") + for s in edge[1]: + print(s.block_num, end=" ") + print() + + dataflows = { + DataflowRef("Test", "test_branching"): DataFlow("test_branching", "Test", []), + DataflowRef("Entity", "call"): DataFlow("call", "Entity", []) + } + + + # TODO: Check # entity calls, # of blocks, # of local calls + + df = sf.build_df(dataflows, "Test") + print(df.to_dot()) + for block in df.blocks.values(): + print(block.to_string()) + +def test_block_merging(): + raise NotImplementedError() \ No newline at end of file From 74e3bc846482161b81d0724e617b31ecef0524ec Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Mon, 7 Apr 2025 12:17:19 +0200 Subject: [PATCH 19/37] Implement split function on entities for new IR --- src/cascade/core.py | 16 +- src/cascade/dataflow/dataflow.py | 4 +- .../dataflow/optimization/parallelization.py | 17 +- .../dataflow_graph_builder.py | 3 - .../generator/build_compiled_method_string.py | 19 - .../frontend/generator/generate_dataflow.py | 59 ---- .../generator/generate_split_functions.py | 334 ++++-------------- .../dataflow_analysis/test_entities.py | 26 +- .../dataflow_analysis/test_split_functions.py | 53 +-- 9 files changed, 141 insertions(+), 390 deletions(-) delete mode 100644 src/cascade/frontend/generator/build_compiled_method_string.py delete mode 100644 src/cascade/frontend/generator/generate_dataflow.py diff --git a/src/cascade/core.py b/src/cascade/core.py index 7294ae2..98e34ad 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -67,9 +67,11 @@ def init(): for method in cls.class_desc.methods_dec: df_ref = DataflowRef(op_name, method.method_name) # Add version number manually + print(df_ref) args = [f"{str(arg)}_0" for arg in method.method_node.args.args] + print(args) # TODO: cleaner solution that checks if the function is stateful or not - if args[0] == "self_0": + if len(args) > 0 and args[0] == "self_0": args = args[1:] dataflows[df_ref] = DataFlow(method.method_name, op_name, args) @@ -88,13 +90,19 @@ def init(): df.entry = [n0] blocks = [] else: - df, blocks = GroupStatements(method.method_node).build(dataflows, op_name) + df = GroupStatements(method.method_node).build(dataflows, op_name) op.dataflows[df.name] = df - for b in blocks: - op.methods[b.name] = b + for name, b in df.blocks.items(): + op.methods[name] = b +def get_operator(op_name: str): + return operators[op_name] + +def get_dataflow(ref: DataflowRef): + return dataflows[ref] + def clear(): registered_classes.clear() diff --git 
a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 15ca0a7..92c5b41 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -4,6 +4,8 @@ from typing import TYPE_CHECKING import uuid +import cascade + if TYPE_CHECKING: from cascade.frontend.generator.split_function import LocalBlock from cascade.dataflow.operator import Operator @@ -83,7 +85,7 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['E new_key = event.variable_map[self.keyby] else: new_key = None - df = self.dataflow.get_dataflow() + df = cascade.core.get_dataflow(self.dataflow) new_targets = df.entry if not isinstance(new_targets, list): new_targets = [new_targets] diff --git a/src/cascade/dataflow/optimization/parallelization.py b/src/cascade/dataflow/optimization/parallelization.py index 0b26c6f..444fc63 100644 --- a/src/cascade/dataflow/optimization/parallelization.py +++ b/src/cascade/dataflow/optimization/parallelization.py @@ -183,13 +183,13 @@ from dataclasses import dataclass from typing import Any from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, DataFlow, Edge, Node - +import cascade @dataclass class AnnotatedNode: node: Node - reads: list[str] - writes: list[str] + reads: set[str] + writes: set[str] import networkx as nx @@ -201,12 +201,13 @@ def parallelize(df: DataFlow): graph = nx.DiGraph() for node in df.nodes.values(): if isinstance(node, CallEntity): - reads = list(node.variable_rename.values()) - writes = [result] if (result := node.assign_result_to) else [] + reads = set(node.variable_rename.values()) + writes = {result} if (result := node.assign_result_to) else set() elif isinstance(node, CallLocal): - method = df.get_operator().methods[node.method.method_name] - reads = method.var_map_reads - writes = method.var_map_writes + operator = cascade.core.operators[df.op_name] + method = df.blocks[node.method.method_name] + reads = method.reads + writes = method.writes else: raise ValueError(f"unsupported node type: {type(node)}") diff --git a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py b/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py index 5cf3f22..fd4d8d2 100644 --- a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py +++ b/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py @@ -1,6 +1,3 @@ -import networkx as nx - - from klara.core.cfg import ModuleLabel, TempAssignBlock from klara.core import nodes diff --git a/src/cascade/frontend/generator/build_compiled_method_string.py b/src/cascade/frontend/generator/build_compiled_method_string.py deleted file mode 100644 index 2cd709f..0000000 --- a/src/cascade/frontend/generator/build_compiled_method_string.py +++ /dev/null @@ -1,19 +0,0 @@ -from cascade.frontend.generator.split_function import SplitFunction - - -class BuildCompiledMethodsString: - - def __init__(self, splits: list[SplitFunction]): - self.splits: list[SplitFunction] = splits - - def make_splitfunctions(self) -> list[str]: - bodies = [] - for split in self.splits: - body = split.to_string() - bodies.append(body) - return '\n\n'.join(bodies) - - @classmethod - def build(cls, splits: list[SplitFunction]): - cls = cls(splits) - return cls.make_splitfunctions() diff --git a/src/cascade/frontend/generator/generate_dataflow.py b/src/cascade/frontend/generator/generate_dataflow.py deleted file mode 100644 index 1a30b72..0000000 --- a/src/cascade/frontend/generator/generate_dataflow.py +++ /dev/null @@ -1,59 +0,0 @@ -# from 
cascade.frontend.generator.split_function import LocalBlock, SplitFunction -# from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, InvokeMethod, Edge - - -# class GenerateDataflow: -# """ Generates dataflow -# """ - -# def __init__(self, split_functions: list[SplitFunction], instance_type_map: dict[str, str], method_name, op_name, args): -# #TODO: add buildcontext that contains class name and target method -# self.split_functions = split_functions -# self.df = DataFlow(method_name, op_name, args) -# self.blocks: list[LocalBlock] = [] -# self.instance_type_map = instance_type_map - -# def generate_dataflow(self): -# self.extract_remote_method_calls() -# self.build_dataflow() - -# def build_dataflow(self): -# """ Every remote function invocation should add the node -# """ -# nodes = [] -# for split in self.split_functions: -# node = CallLocal(InvokeMethod(split.method_name)) -# self.df.add_node(node) -# nodes.append([node]) - -# if split.remote_calls: -# # TODO: instance_name -> correct entity (maybe using buildcontext/ instance type map) -# next_nodes = [] -# for remote in split.remote_calls: -# df = DataflowRef(self.instance_type_map[remote.instance_name], remote.attribute) -# args = df.get_dataflow.args -# # TODO: proper variable renaming -# vars = {arg: arg for arg in args} -# call = CallEntity(df, vars, assign_result_to=remote.target) -# next_nodes.append(call) -# nodes.append(next_nodes) - -# self.df.entry = nodes[0][0] -# for i in range(len(nodes)-1): -# # TODO: add merge nodes -# prev_nodes = nodes[i] -# next_nodes = nodes[i+1] -# for n in prev_nodes: -# for v in next_nodes: -# # TODO: Add variable map (think that should be the aggregation of the targets) -# self.df.add_edge(Edge(n, v)) - -# def extract_remote_method_calls(self): -# for split in self.split_functions: -# split.extract_remote_method_calls() - -# @classmethod -# def generate(cls, split_functions: list[SplitFunction], instance_type_map: dict[str, str], method_name, op_name, args) -> tuple[DataFlow, list[LocalBlock]]: -# c = cls(split_functions, instance_type_map, method_name, op_name, args) -# c.generate_dataflow() -# return c.df, c.blocks \ No newline at end of file diff --git a/src/cascade/frontend/generator/generate_split_functions.py b/src/cascade/frontend/generator/generate_split_functions.py index a9a278e..2146d5d 100644 --- a/src/cascade/frontend/generator/generate_split_functions.py +++ b/src/cascade/frontend/generator/generate_split_functions.py @@ -1,108 +1,74 @@ -from itertools import count -from typing import List, Type - import networkx as nx -from cascade.dataflow.dataflow import DataFlow, DataflowRef, Edge, IfNode, Node +from cascade.dataflow.dataflow import DataFlow, DataflowRef, IfNode from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph -from cascade.frontend.generator.split_function import SplitFunction, LocalBlock, to_entity_call +from cascade.frontend.generator.split_function import LocalBlock, to_entity_call from klara.core import nodes -class GenerateSplitFunctions: +def split_statements_once(statements: list[Statement]) -> tuple[list[Statement], list[Statement]]: + """ + Split a list of statements, by grouping together statements that are not remote calls. 
- def __init__(self, dataflow_graph: ControlFlowGraph, class_name: str, entity_map: dict[str, str]): - self.dataflow_graph: ControlFlowGraph = dataflow_graph - self.class_name: str = class_name - self.entity_map: dict[str, str] = entity_map # {"instance_name": "EntityType"} - self.dataflow_node_map = dict() - self.counter = count() - self.split_functions = [] - - def generate_split_functions(self): - G = self.dataflow_graph.graph - entry_node: Statement = next(iter(G.nodes)) - assert type(entry_node.block) == nodes.FunctionDef - # targets = copy.copy(entry_node.targets) - continuation = list(G.nodes) - while self.invokes_remote_entity(continuation): - first_half, continuation = self.split_function(G) - self.add_split_function(first_half) - G = G.subgraph(continuation) - # TODO: Add a new source node to continuation - self.add_split_function(continuation) - - def add_split_function(self, statements: list[Statement]): - targets, values = set(), set() - for s in statements: - targets.update(repr(v) for v in s.targets) - if s.is_remote() or type(s.block) != nodes.FunctionDef: - values.update(repr(v) for v in s.values if not self.value_is_entity(v)) - i: int = next(self.counter) - method_name = f'{self.dataflow_graph.name}_{i}' - split_f: SplitFunction = SplitFunction(i, method_name, statements, targets=targets, values=values, class_name=self.class_name) - self.split_functions.append(split_f) - - def value_is_entity(self, value: nodes.Name) -> bool: - return value.id in self.entity_map - - def invokes_remote_entity(self, statments: list[Statement]) -> bool: - """Returns whether statements contains a remote invocation""" - return any(s.is_remote() for s in statments) - - def split_function(self, G: nx.DiGraph) -> tuple[list[Statement], list[Statement]]: - """ Produces split functions. Assumes that the runtime will always return to initial function call. - Therefore functions containing a remote function call (one to a remote entity) will be split into two functions: - one function adding the keys to the stack of the remote entities to call. And the continuation which the - function returns to. This way the entity invoking the method does not know anything about - - Assumes needs split. i.e. there is a remote entity invoked. - - Every node on the path to a node included should be included. (because these are the data dependencies) - - And also the nodes that the nodes listed above are data dependend on. - - Should also contain a liveness analyses to determine which variables should be passed on to the continuation. - """ - source: Statement = self.dataflow_graph.get_source_node() - first_half = [] # A set of nodes that are in the first half of the split function. - for n in G.nodes: - n: Statement - if n == source or not n.is_remote(): - continue - elif self.no_remote_dependencies_on_path(G, source, n): - self.add_nodes_path_to_first_half(G, source, n, first_half) - fh_set = set(first_half) - continuation = [] - for node in G.nodes: - if node not in fh_set: - continuation.append(node) - return first_half, continuation + As an example, suppose r and s are both statements, where r is a remote call and s is not. 
+ Here is how the list gets split: + [r, s, s, r, s] -> [r] + [s, s, r, s] + [s, s, r, s, s] -> [s, s] + [r, s, s] + [s, s, s] -> [s, s, s] + [] + """ + assert len(statements) > 0 + + if statements[0].is_remote(): + return [statements[0]], statements[1:] - def no_remote_dependencies_on_path(self, G: nx.DiGraph, source: Statement, target: Statement) -> bool: - for path in self.get_all_simple_paths(G, source, target): - for n in path: - if n not in [source, target] and n.is_remote(): - return False - return True - - def add_nodes_path_to_first_half(self, G: nx.DiGraph, source: Statement, statement: Statement, split: list[Statement]): - for path in self.get_all_simple_paths(G, source, statement): - for n in path: - split.append(n) - - def get_all_simple_paths(self, G: nx.DiGraph, source: Statement, target: Statement): - return nx.all_simple_paths(G, source=source, target=target) + # find the next remote call + i = 0 + first_half = [] + while i < len(statements) and not statements[i].is_remote(): + first_half.append(statements[i]) + i += 1 + + continuation = statements[i:] + return first_half, continuation - @classmethod - def generate(cls, dataflow_graph: ControlFlowGraph, class_name: str, entity_map: dict[str, str]): - c = cls(dataflow_graph, class_name, entity_map) - c.generate_split_functions() - return c.split_functions +def split_statements(statements: list[Statement]) -> list[tuple[Statement,...]]: + grouped_statements = [] + continuation = statements + while len(continuation) > 0: + first_half, continuation = split_statements_once(continuation) + grouped_statements.append(tuple(first_half)) + return grouped_statements def split_cfg(blocked_statement_graph: nx.DiGraph) -> nx.DiGraph: - pass + split_graph: nx.DiGraph = blocked_statement_graph.copy() + for node in list(split_graph.nodes): + in_nodes = split_graph.predecessors(node) + out_nodes = split_graph.successors(node) + + # create the new nodes + new_nodes = split_statements(list(node)) + split_graph.remove_node(node) + split_graph.add_nodes_from(new_nodes) + + # connect the inner edges + u = new_nodes[0] + for v in new_nodes[1:]: + split_graph.add_edge(u, v) + u = v + + # connect the outer edges + for u in in_nodes: + split_graph.add_edge(u, new_nodes[0]) + for v in out_nodes: + split_graph.add_edge(new_nodes[-1], v) + + return split_graph + def blocked_cfg(statement_graph: nx.DiGraph, entry: Statement) -> nx.DiGraph: """Transform a cfg (digraph of Statements) into a blocked version, i.e. 
a @@ -160,7 +126,7 @@ def blocked_cfg(statement_graph: nx.DiGraph, entry: Statement) -> nx.DiGraph: last_node = tuple(grouped_statements) graph.add_node(last_node) return graph - if len(succ) == 2: + elif len(succ) == 2: if len(grouped_statements) > 1: before_if, last_node = tuple(grouped_statements[:-1]), tuple([grouped_statements[-1]]) graph.add_edge(before_if, last_node) @@ -199,57 +165,53 @@ def blocked_cfg(statement_graph: nx.DiGraph, entry: Statement) -> nx.DiGraph: graph.add_edge(last_orelse, first_finally) return graph else: - raise ValueError(succ) + raise ValueError(f"We expect a CFG node to have max 2 successors, got {succ}") class GroupStatements: - - # todo: cfg should be control flow graph, statements should also be a graph - # list only works for functions with no control flow - # instead, generate_grouped should take a list of nodes, where each node is a stament, - # and create a graph of nodes where each node is a list of statments - # thus statements are grouped if they are all local and in the same block of control flow def __init__(self, function_def: nodes.FunctionDef): self.function_def = function_def + self.name = self.function_def.name + def build_cfg(self): cfg: ControlFlowGraph = ControlFlowGraphBuilder.build([self.function_def] + self.function_def.body) self.type_map = ExtractTypeVisitor.extract(self.function_def) cfg.name = self.function_def.name - for n in cfg.get_nodes(): - print(n) - # statements = list(cfg.get_nodes()) - # statements.sort(key=lambda s: s.block_num) - # self.statements = statements # TODO: for more complex control flow, use CFG structure instead - self._grouped_statements: List[List[Statement]] = [] + + entry_node: Statement = cfg.get_source_nodes()[0] + assert type(entry_node.block) == nodes.FunctionDef + cfg.remove_node(entry_node) self.cfg = cfg - self.blocked_cfg = blocked_cfg(cfg.graph, cfg.get_single_source()) + + self.blocked_cfg = split_cfg(blocked_cfg(cfg.graph, cfg.get_single_source())) def build_df(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> DataFlow: - entry_node: Statement = self.cfg.get_source_nodes()[0] - assert type(entry_node.block) == nodes.FunctionDef - self.cfg.remove_node(entry_node) - - df_ref = DataflowRef(op_name, self.cfg.name) + df_ref = DataflowRef(op_name, self.name) df = dataflows[df_ref] node_id_map = {} block_num = 0 + is_entry = True for statement_block in self.blocked_cfg.nodes: if len(statement_block) == 1 and statement_block[0].is_remote(): node = to_entity_call(statement_block[0], self.type_map, dataflows) elif len(statement_block) == 1 and statement_block[0].is_predicate: node = IfNode() else: - block = LocalBlock(list(statement_block), self.cfg.name, block_num, op_name) + block = LocalBlock(list(statement_block), self.name, block_num, op_name) block_num += 1 node = block.to_node() df.add_block(block) node_id_map[statement_block] = node.id df.add_node(node) + if is_entry: + df.entry = [node] + is_entry = False + for source, target, if_result in self.blocked_cfg.edges.data('type', default=None): source_id = node_id_map[source] target_id = node_id_map[target] @@ -257,155 +219,9 @@ def build_df(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> Data return df - def build_df_old(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> DataFlow: - entry_node: Statement = self.cfg.get_source_nodes()[0] - assert type(entry_node.block) == nodes.FunctionDef - - df_ref = DataflowRef(op_name, self.cfg.name) - df = dataflows[df_ref] - - last_node = None - block_num = 0 - - while 
len(self.cfg.graph) > 0: - print(df.to_dot()) - - source = self.cfg.get_single_source() - if source is not None and source.is_predicate: - node = IfNode() - self.cfg.remove_node(source) - else: - group = self.split_graph(self.cfg) - - if len(group) == 1 and group[0].is_remote(): - # Entity call - node = to_entity_call(group[0], self.type_map, dataflows) - else: - # Group statements together, into a block - block = LocalBlock(group, self.cfg.name, block_num, op_name) - block_num += 1 - node = block.to_node() - print(block.to_string()) - df.blocks[block.get_method_name()] = block - - if last_node == None: - last_node = node - df.add_node(node) - df.entry = [node] - else: - df.add_edge(Edge(last_node, node)) - last_node = node - - return df - - def split_graph(self, graph: ControlFlowGraph) -> list[Statement]: - if len(graph.graph) == 0: - return [] - - source = graph.get_source_nodes()[0] - if source.is_remote(): - graph.remove_node(source) - return [source] - - # find the next remote call - local_group = [source] - node = graph.get_single_successor(source) - graph.remove_node(source) - - while node is not None and not node.is_remote() and not node.is_predicate: - if len(list(graph.graph.predecessors(node))) > 1: - break - local_group.append(node) - succ = graph.get_single_successor(node) - graph.remove_node(node) - node = succ - - return local_group - - """ - Split a list of statements, by grouping together statements that are not remote calls. - The graph becomes a subgraph, with the statments removed. - - As an example, suppose r and s are both statements, where r is a remote call and s is not. - - Here is how the list gets split: - [r, s, s, r, s] -> [r] + [s, s, r, s] - [r, r, s, r, s] -> [r] + [r, s, r, s] - [s, s, r, s, s] -> [s, s] + [r, s, s] - [s, r, r, s, s] -> [s] + [r, r, s, s] - [s, s, s] -> [s, s, s] + [] - """ - - # def generate_grouped_statements(self) -> List[List[Statement]]: - # entry_node: Statement = self.statements[0] - # assert type(entry_node.block) == nodes.FunctionDef - - # grouped_statements = [] - # continuation = self.statements[1:] - # while len(continuation) > 0: - # first_half, continuation = self.split_statements(continuation) - # grouped_statements.append(first_half) - - # self._grouped_statements = grouped_statements - # return grouped_statements - - # def split_statements(self, statements: list[Statement]) -> tuple[list[Statement], list[Statement]]: - # """ - # Split a list of statements, by grouping together statements that are not remote calls. - - # As an example, suppose r and s are both statements, where r is a remote call and s is not. 
- - # Here is how the list gets split: - # [r, s, s, r, s] -> [r] + [s, s, r, s] - # [s, s, r, s, s] -> [s, s] + [r, s, s] - # [s, s, s] -> [s, s, s] + [] - # """ - # assert len(statements) > 0 - - # if statements[0].is_remote(): - # return [statements[0]], statements[1:] - - # # find the next remote call - # i = 0 - # first_half = [] - # while i < len(statements) and not statements[i].is_remote(): - # first_half.append(statements[i]) - # i += 1 - - # continuation = statements[i:] - # return first_half, continuation - # def build(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> tuple[DataFlow, List[LocalBlock]]: - # self.build_cfg() + def build(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> DataFlow: + self.build_cfg() - # self.generate_grouped_statements() - - # blocks = [] - # block_num = 0 - - # df_ref = DataflowRef(op_name, self.cfg.name) - # df = dataflows[df_ref] - - # last_node = None - # for split in self._grouped_statements: - # if len(split) == 1 and split[0].is_remote(): - # # Entity call - # node = to_entity_call(split[0], self.type_map, dataflows) - # else: - # # Group statements together, into a block - # block = LocalBlock(split, self.cfg.name, block_num, op_name) - # block_num += 1 - # node = block.to_node() - # blocks.append(block) - - - # if last_node == None: - # last_node = node - # df.add_node(node) - # df.entry = [node] - # else: - # df.add_edge(Edge(last_node, node)) - # last_node = node - - # return df, blocks + return self.build_df(dataflows, op_name) diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index 2495e3c..a8f84cf 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -31,7 +31,7 @@ def get_total(item1: Stock, item2: Stock): DataflowRef("Stock", "get_quantity"): DataFlow("get_quantity", "Stock", []) } - df, blocks = sf.build(dataflows, "Test") + df = sf.build(dataflows, "Test") ## TODO: check blocks/df assert len(df.nodes) == 3 @@ -65,10 +65,10 @@ def add(x: int, y: int): DataflowRef("Test", "add"): DataFlow("get_total", "Test", ["x", "y"]), } - df, blocks = sf.build(dataflows, "Test") + df = sf.build(dataflows, "Test") - assert len(blocks) == 1 - assert blocks[0].call({"x_0": 3, "y_0":5 }, None) == 8 + assert len(df.blocks) == 1 + assert list(df.blocks.values())[0].call_block({"x_0": 3, "y_0":5 }, None) == 8 def test_state(): @@ -92,19 +92,19 @@ def buy_item(self, item: 'Item') -> bool: DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), } - df, blocks = sf.build(dataflows, "User") + df = sf.build(dataflows, "User") + + blocks = list(df.blocks.values()) assert len(blocks) == 1 - func = blocks[0].call - print(blocks[0].raw_method_string) + func = blocks[0].call_block + print(blocks[0].to_string()) @dataclass class User: username: str balance: int - func = blocks[0].call - user = User("a", 20) func({"item_price_0": 10}, user.__dict__) assert user.balance == 10 @@ -131,8 +131,10 @@ def upload_unique_id(self, review_id: int): DataflowRef("ComposeReview", "__init__"): DataFlow("__init__", "ComposeReview", ["req_id"]), } - df, blocks = sf.build(dataflows, "ComposeReview") + df = sf.build(dataflows, "ComposeReview") + + blocks = list(df.blocks.values()) assert len(blocks) == 1 @@ -141,9 +143,9 @@ class ComposeReview: req_id: str review_data: dict - func = blocks[0].call + func = blocks[0].call_block - print(blocks[0].raw_method_string) + print(blocks[0].to_string()) compose_review = 
ComposeReview("req", {}) func({"review_id_0": 123}, compose_review.__dict__) diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py b/tests/frontend/dataflow_analysis/test_split_functions.py index ad5659d..0255d1c 100644 --- a/tests/frontend/dataflow_analysis/test_split_functions.py +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -6,7 +6,7 @@ from klara.core import nodes from cascade.dataflow.dataflow import DataFlow, DataflowRef -from cascade.frontend.generator.generate_split_functions import GroupStatements, blocked_cfg +from cascade.frontend.generator.generate_split_functions import GroupStatements, blocked_cfg, split_cfg from cascade.frontend.intermediate_representation.control_flow_graph import ControlFlowGraph from cascade.frontend.util import setup_cfg @@ -39,13 +39,16 @@ def get_total(item1: Stock, item2: Stock, y: int): } - # TODO: Check # entity calls, # of blocks, # of local calls df = sf.build_df(dataflows, "Test") print(df.to_dot()) for block in df.blocks.values(): print(block.to_string()) + # TODO: Check # entity calls, # of local calls + assert len(df.nodes) == 5 + assert len(df.blocks) == 2 + def test_branching(): program: str = dedent(""" class Test: @@ -68,17 +71,8 @@ def test_branching(self) -> int: sf.build_cfg() print(sf.cfg.to_dot()) new = blocked_cfg(sf.cfg.graph, sf.cfg.get_single_source()) - for node in new.nodes: - for s in node: - print(s.block_num, end=" ") - print() - for edge in new.edges: - for s in edge[0]: - print(s.block_num, end=" ") - print("->", end= " ") - for s in edge[1]: - print(s.block_num, end=" ") - print() + + assert len(new.nodes) == 5 dataflows = { DataflowRef("Test", "test_branching"): DataFlow("test_branching", "Test", []) @@ -89,8 +83,21 @@ def test_branching(self) -> int: print(df.to_dot()) for block in df.blocks.values(): print(block.to_string()) - assert len(df.blocks) == 4 assert len(df.nodes) == 5 + assert len(df.blocks) == 4 + +def print_digraph(graph): + for node in graph.nodes: + for s in node: + print(s.block_num, end=" ") + print() + for edge in graph.edges: + for s in edge[0]: + print(s.block_num, end=" ") + print("->", end= " ") + for s in edge[1]: + print(s.block_num, end=" ") + print() def test_branching_with_entity_calls(): program: str = dedent(""" @@ -117,17 +124,10 @@ def test_branching(self) -> int: sf.build_cfg() print(sf.cfg.to_dot()) new = blocked_cfg(sf.cfg.graph, sf.cfg.get_single_source()) - for node in new.nodes: - for s in node: - print(s.block_num, end=" ") - print() - for edge in new.edges: - for s in edge[0]: - print(s.block_num, end=" ") - print("->", end= " ") - for s in edge[1]: - print(s.block_num, end=" ") - print() + + assert len(list(new.nodes)) == 5 + new_split = split_cfg(new) + assert len(list(new_split.nodes)) == 7 dataflows = { DataflowRef("Test", "test_branching"): DataFlow("test_branching", "Test", []), @@ -142,5 +142,8 @@ def test_branching(self) -> int: for block in df.blocks.values(): print(block.to_string()) + assert len(df.nodes) == 7 + assert len(df.blocks) == 5 + def test_block_merging(): raise NotImplementedError() \ No newline at end of file From 5542b4c96cfddc0d36ad348d5a163e6c2629b299 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Mon, 7 Apr 2025 12:36:06 +0200 Subject: [PATCH 20/37] Renaming --- notebooks/dataflow_example.ipynb | 4 +- src/cascade/core.py | 4 +- src/cascade/dataflow/dataflow.py | 11 +-- src/cascade/dataflow/operator.py | 24 +------ src/cascade/descriptors/method_descriptor.py | 4 +- 
src/cascade/frontend/cfg/__init__.py | 2 + .../cfg_builder.py} | 2 +- .../control_flow_graph.py | 2 +- .../statement.py | 0 .../frontend/dataflow_analysis/__init__.py | 0 ...split_functions.py => dataflow_builder.py} | 8 +-- .../{split_function.py => local_block.py} | 70 +------------------ src/cascade/frontend/generator/remote_call.py | 7 -- .../intermediate_representation/__init__.py | 2 - .../test_dataflow_graph_builder.py | 4 +- .../dataflow_analysis/test_entities.py | 10 +-- .../dataflow_analysis/test_split_functions.py | 10 +-- 17 files changed, 29 insertions(+), 135 deletions(-) create mode 100644 src/cascade/frontend/cfg/__init__.py rename src/cascade/frontend/{dataflow_analysis/dataflow_graph_builder.py => cfg/cfg_builder.py} (96%) rename src/cascade/frontend/{intermediate_representation => cfg}/control_flow_graph.py (97%) rename src/cascade/frontend/{intermediate_representation => cfg}/statement.py (100%) delete mode 100644 src/cascade/frontend/dataflow_analysis/__init__.py rename src/cascade/frontend/generator/{generate_split_functions.py => dataflow_builder.py} (96%) rename src/cascade/frontend/generator/{split_function.py => local_block.py} (62%) delete mode 100644 src/cascade/frontend/generator/remote_call.py delete mode 100644 src/cascade/frontend/intermediate_representation/__init__.py diff --git a/notebooks/dataflow_example.ipynb b/notebooks/dataflow_example.ipynb index 7649417..1fd2525 100644 --- a/notebooks/dataflow_example.ipynb +++ b/notebooks/dataflow_example.ipynb @@ -42,7 +42,7 @@ "from cascade.frontend.dataflow_analysis.class_list_builder import ClassListBuilder\n", "from cascade.frontend.dataflow_analysis.class_wrapper import ClassWrapper\n", "from cascade.frontend.util import setup_cfg, plot_graph_with_color, plot_dataflow_graph\n", - "from cascade.frontend.intermediate_representation import ControlFlowGraph, DataflowGraph" + "from cascade.frontend.cfg import ControlFlowGraph, DataflowGraph" ] }, { @@ -319,7 +319,7 @@ "source": [ "from textwrap import indent\n", "from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions\n", - "from cascade.frontend.intermediate_representation import Block\n", + "from cascade.frontend.cfg import Block\n", "\n", "compiled_functions, df = GenerateSplittFunctions.generate_split_function_string(block_level_dataflow_graph)" ] diff --git a/src/cascade/core.py b/src/cascade/core.py index 98e34ad..2d98076 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -8,7 +8,7 @@ from cascade.dataflow.operator import StatefulOperator, StatelessOperator, Operator from cascade.wrappers import ClassWrapper from cascade.descriptors import ClassDescriptor -from cascade.frontend.generator.generate_split_functions import GroupStatements +from cascade.frontend.generator.dataflow_builder import DataflowBuilder from cascade.dataflow.dataflow import CallLocal, DataFlow, DataflowRef, InitClass @@ -90,7 +90,7 @@ def init(): df.entry = [n0] blocks = [] else: - df = GroupStatements(method.method_node).build(dataflows, op_name) + df = DataflowBuilder(method.method_node).build(dataflows, op_name) op.dataflows[df.name] = df for name, b in df.blocks.items(): diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 92c5b41..3737336 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -7,7 +7,7 @@ import cascade if TYPE_CHECKING: - from cascade.frontend.generator.split_function import LocalBlock + from cascade.frontend.generator.local_block import LocalBlock from 
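A minimal usage sketch of the renamed builder, mirroring the call site in init() below and the updated tests; method_node, op_name, dataflows and operator here are placeholders for values the caller already has in scope:

    from cascade.frontend.generator.dataflow_builder import DataflowBuilder

    builder = DataflowBuilder(method_node)      # method_node: a klara nodes.FunctionDef
    df = builder.build(dataflows, op_name)      # runs build_cfg() and then build_df()
    for name, block in df.blocks.items():       # LocalBlocks generated for this method
        operator.methods[name] = block
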
cascade.dataflow.operator import Operator @@ -320,15 +320,6 @@ def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None def __str__(self) -> str: return f"{self.op_name}.{self.name}" - -@dataclass -class CollectTarget: - target_node: CollectNode - """Target node""" - total_items: int - """How many items the merge node needs to wait on (including this one).""" - result_idx: int - """The index this result should be in the collected array.""" def metadata_dict() -> dict: return { diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py index e9c0439..ecbe7fb 100644 --- a/src/cascade/dataflow/operator.py +++ b/src/cascade/dataflow/operator.py @@ -2,7 +2,7 @@ from typing import Any, Generic, Mapping, Protocol, Type, TypeVar, TYPE_CHECKING if TYPE_CHECKING: - from cascade.frontend.generator.split_function import LocalBlock + from cascade.frontend.generator.local_block import LocalBlock from cascade.dataflow.dataflow import DataFlow, InvokeMethod T = TypeVar('T') @@ -39,28 +39,6 @@ def my_compiled_method(variable_map: dict[str, Any], state: T) -> Any def __call__(self, variable_map: dict[str, Any], state: T) -> Any: ... """@private""" -# @dataclass -# class LocalBlock: -# var_map_writes: set[str] -# var_map_reads: set[str] -# name: str -# statements: -# function_call: Union[MethodCall, 'StatelessMethodCall'] -# raw_method_string: str - -# def call(self, *args, **kwargs) -> Any: -# return self.function_call(*args, **kwargs) - -# def merge_with(self, other: 'LocalBlock'): -# self.var_map_writes.update(other.var_map_writes) -# self.var_map_reads.update(other.var_map_reads) - -# local_scope = {} -# raw_str = self.to_string() -# exec(self.to_string(), {}, local_scope) -# method_name = self.get_method_name() -# fn = local_scope[method_name] - class StatelessMethodCall(Protocol): def __call__(self, variable_map: dict[str, Any]) -> Any: ... 
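For reference, a generated local block is an ordinary function over the event's variable map plus the entity state, matching the MethodCall protocol above; a minimal sketch for a source method like def add(x, y): return x + y, where the name add_0 and the SSA-style x_0/y_0 keys are illustrative:

    def add_0(variable_map, state):
        x_0 = variable_map['x_0']
        y_0 = variable_map['y_0']
        return x_0 + y_0

    # The operator resolves the block by method name and calls it with the
    # event's variable map; state is None for stateless operators.
    assert add_0({'x_0': 3, 'y_0': 5}, None) == 8
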
diff --git a/src/cascade/descriptors/method_descriptor.py b/src/cascade/descriptors/method_descriptor.py index b61df7d..9c367f5 100644 --- a/src/cascade/descriptors/method_descriptor.py +++ b/src/cascade/descriptors/method_descriptor.py @@ -1,7 +1,7 @@ from klara.core import nodes -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder -from cascade.frontend.intermediate_representation import ControlFlowGraph +from cascade.frontend.cfg.cfg_builder import ControlFlowGraphBuilder +from cascade.frontend.cfg import ControlFlowGraph class MethodDescriptor: diff --git a/src/cascade/frontend/cfg/__init__.py b/src/cascade/frontend/cfg/__init__.py new file mode 100644 index 0000000..5da7d36 --- /dev/null +++ b/src/cascade/frontend/cfg/__init__.py @@ -0,0 +1,2 @@ +from .control_flow_graph import ControlFlowGraph +from .statement import Statement \ No newline at end of file diff --git a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py b/src/cascade/frontend/cfg/cfg_builder.py similarity index 96% rename from src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py rename to src/cascade/frontend/cfg/cfg_builder.py index fd4d8d2..390abd0 100644 --- a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py +++ b/src/cascade/frontend/cfg/cfg_builder.py @@ -1,7 +1,7 @@ from klara.core.cfg import ModuleLabel, TempAssignBlock from klara.core import nodes -from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph +from cascade.frontend.cfg import Statement, ControlFlowGraph from cascade.frontend.ast_visitors import ContainsAttributeVisitor, VariableGetter diff --git a/src/cascade/frontend/intermediate_representation/control_flow_graph.py b/src/cascade/frontend/cfg/control_flow_graph.py similarity index 97% rename from src/cascade/frontend/intermediate_representation/control_flow_graph.py rename to src/cascade/frontend/cfg/control_flow_graph.py index 296e258..060831c 100644 --- a/src/cascade/frontend/intermediate_representation/control_flow_graph.py +++ b/src/cascade/frontend/cfg/control_flow_graph.py @@ -3,7 +3,7 @@ import networkx as nx from cascade.frontend.generator.unparser import unparse -from cascade.frontend.intermediate_representation.statement import Statement +from cascade.frontend.cfg.statement import Statement @dataclass diff --git a/src/cascade/frontend/intermediate_representation/statement.py b/src/cascade/frontend/cfg/statement.py similarity index 100% rename from src/cascade/frontend/intermediate_representation/statement.py rename to src/cascade/frontend/cfg/statement.py diff --git a/src/cascade/frontend/dataflow_analysis/__init__.py b/src/cascade/frontend/dataflow_analysis/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/cascade/frontend/generator/generate_split_functions.py b/src/cascade/frontend/generator/dataflow_builder.py similarity index 96% rename from src/cascade/frontend/generator/generate_split_functions.py rename to src/cascade/frontend/generator/dataflow_builder.py index 2146d5d..894606e 100644 --- a/src/cascade/frontend/generator/generate_split_functions.py +++ b/src/cascade/frontend/generator/dataflow_builder.py @@ -2,9 +2,9 @@ from cascade.dataflow.dataflow import DataFlow, DataflowRef, IfNode from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder -from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph 
-from cascade.frontend.generator.split_function import LocalBlock, to_entity_call +from cascade.frontend.cfg.cfg_builder import ControlFlowGraphBuilder +from cascade.frontend.cfg import Statement, ControlFlowGraph +from cascade.frontend.generator.local_block import LocalBlock, to_entity_call from klara.core import nodes @@ -169,7 +169,7 @@ def blocked_cfg(statement_graph: nx.DiGraph, entry: Statement) -> nx.DiGraph: -class GroupStatements: +class DataflowBuilder: def __init__(self, function_def: nodes.FunctionDef): self.function_def = function_def self.name = self.function_def.name diff --git a/src/cascade/frontend/generator/split_function.py b/src/cascade/frontend/generator/local_block.py similarity index 62% rename from src/cascade/frontend/generator/split_function.py rename to src/cascade/frontend/generator/local_block.py index e10587d..921726d 100644 --- a/src/cascade/frontend/generator/split_function.py +++ b/src/cascade/frontend/generator/local_block.py @@ -1,13 +1,10 @@ from textwrap import indent -from dataclasses import dataclass, field from typing import Any, Union, TYPE_CHECKING -from cascade.frontend.util import to_camel_case -from cascade.frontend.intermediate_representation import Statement +from cascade.frontend.cfg import Statement from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState from cascade.frontend.generator.unparser import unparse -from cascade.frontend.generator.remote_call import RemoteCall from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, InvokeMethod from klara.core.cfg import RawBasicBlock @@ -16,71 +13,6 @@ if TYPE_CHECKING: from cascade.dataflow.operator import MethodCall, StatelessMethodCall -@dataclass -class SplitFunction: - method_number: int - method_name: str - method_body: list[Statement] - targets: set[str] = None - values: set[str] = None - class_name: str = None - remote_calls: list[RemoteCall] = field(default_factory=list) # {'assign_result_to_var': 'method_to_call'} - - def set_class_name(self, name: str): - self.class_name = name - - def to_string(self) -> str: - indent_prefix: str = ' ' * 4 # indent usting 4 spaces. 
- body: str = indent(self.body_to_string(), indent_prefix) - method_signature: str = self.get_method_signature() - compiled_method_as_string: str = f'def {self.method_name}_compiled({method_signature}) -> Any:\n{body}' - return compiled_method_as_string - - def get_method_signature(self) -> str: - return f'variable_map: dict[str, Any], state: {self.class_name}, key_stack: list[str]' - - def body_to_string(self) -> str: - body = [] - for v in sorted(self.values - self.targets): - if not (v in [ 'self_0','self']): - body.append(f'{v} = variable_map[\'{v}\']') - - for statement in self.method_body: - if statement.remote_call: - assert statement.attribute - attribute: nodes.Attribute = statement.attribute - value: nodes.Name = attribute.value - instance_name: str = value.id - res = f'key_stack.append(variable_map[\'{instance_name}_key\'])' - body.append(res) - else: - block: RawBasicBlock = statement.block - if type(block) == nodes.FunctionDef: - continue - ReplaceSelfWithState.replace(block) - - if type(block) == nodes.Return: - body.insert(0,'key_stack.pop()') - body.append(unparse(block)) - - if 'return' not in body[-1]: - body.append('return None') - return "\n".join(body) - - def extract_remote_method_calls(self): - for statement in self.method_body: - if statement.remote_call: - self.add_statement_to_remote_call_set(statement) - - def add_statement_to_remote_call_set(self, statement: Statement): - assert statement.attribute, "A remote call should have an attribute name to call" - attribute = statement.attribute - if len(statement.targets) > 1: - assert False, "A remote method invocation that returns multiple items is not supported yet..." - target, = statement.targets - remote_call: RemoteCall = RemoteCall(attribute.value.id, attribute.attr, target) - self.remote_calls.append(remote_call) - def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: dict[DataflowRef, DataFlow]) -> CallEntity: """Transform a remote statement to an entity call.""" diff --git a/src/cascade/frontend/generator/remote_call.py b/src/cascade/frontend/generator/remote_call.py deleted file mode 100644 index 63c7601..0000000 --- a/src/cascade/frontend/generator/remote_call.py +++ /dev/null @@ -1,7 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class RemoteCall: - instance_name: str - attribute: str - target: str \ No newline at end of file diff --git a/src/cascade/frontend/intermediate_representation/__init__.py b/src/cascade/frontend/intermediate_representation/__init__.py deleted file mode 100644 index 36d6352..0000000 --- a/src/cascade/frontend/intermediate_representation/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .statement import Statement -from .control_flow_graph import ControlFlowGraph \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py index 0f9da5c..2c49883 100644 --- a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py +++ b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py @@ -3,8 +3,8 @@ from klara.core.cfg import Cfg from klara.core import nodes -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import ControlFlowGraphBuilder -from cascade.frontend.intermediate_representation import Statement, ControlFlowGraph +from cascade.frontend.cfg.cfg_builder import ControlFlowGraphBuilder +from cascade.frontend.cfg import Statement, ControlFlowGraph from cascade.frontend.util import setup_cfg diff --git 
a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index a8f84cf..4fbf785 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -7,7 +7,7 @@ from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef -from cascade.frontend.generator.generate_split_functions import GroupStatements +from cascade.frontend.generator.dataflow_builder import DataflowBuilder from cascade.frontend.util import setup_cfg def test_call_entity(): @@ -23,7 +23,7 @@ def get_total(item1: Stock, item2: Stock): test_class: nodes.Block = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] - sf = GroupStatements(get_total) + sf = DataflowBuilder(get_total) sf.build_cfg() dataflows = { @@ -59,7 +59,7 @@ def add(x: int, y: int): test_class: nodes.Block = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] - sf = GroupStatements(get_total) + sf = DataflowBuilder(get_total) dataflows = { DataflowRef("Test", "add"): DataFlow("get_total", "Test", ["x", "y"]), @@ -85,7 +85,7 @@ def buy_item(self, item: 'Item') -> bool: user_class: nodes.Block = blocks[2] buy_item: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] - sf = GroupStatements(buy_item) + sf = DataflowBuilder(buy_item) dataflows = { DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), @@ -124,7 +124,7 @@ def upload_unique_id(self, review_id: int): user_class: nodes.Block = blocks[2] upload_unique: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] - sf = GroupStatements(upload_unique) + sf = DataflowBuilder(upload_unique) dataflows = { DataflowRef("ComposeReview", "upload_unique_id"): DataFlow("upload_unique_id", "ComposeReview", ["review_id"]), diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py b/tests/frontend/dataflow_analysis/test_split_functions.py index 0255d1c..a41c489 100644 --- a/tests/frontend/dataflow_analysis/test_split_functions.py +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -6,8 +6,8 @@ from klara.core import nodes from cascade.dataflow.dataflow import DataFlow, DataflowRef -from cascade.frontend.generator.generate_split_functions import GroupStatements, blocked_cfg, split_cfg -from cascade.frontend.intermediate_representation.control_flow_graph import ControlFlowGraph +from cascade.frontend.generator.dataflow_builder import DataflowBuilder, blocked_cfg, split_cfg +from cascade.frontend.cfg.control_flow_graph import ControlFlowGraph from cascade.frontend.util import setup_cfg def test_entity_calls(): @@ -29,7 +29,7 @@ def get_total(item1: Stock, item2: Stock, y: int): get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] - sf = GroupStatements(get_total) + sf = DataflowBuilder(get_total) sf.build_cfg() dataflows = { @@ -67,7 +67,7 @@ def test_branching(self) -> int: test_class: nodes.Block = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] - sf = GroupStatements(get_total) + sf = DataflowBuilder(get_total) sf.build_cfg() print(sf.cfg.to_dot()) new = blocked_cfg(sf.cfg.graph, sf.cfg.get_single_source()) @@ -120,7 +120,7 @@ def test_branching(self) -> int: test_class: nodes.Block = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] - sf = GroupStatements(get_total) + sf = DataflowBuilder(get_total) sf.build_cfg() print(sf.cfg.to_dot()) new = blocked_cfg(sf.cfg.graph, 
sf.cfg.get_single_source()) From 446ccd62caaf4ef1e202cedf66eacfffc0725cf4 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Mon, 7 Apr 2025 13:05:09 +0200 Subject: [PATCH 21/37] Use dataflow ref instead of dataflow on Event objects --- deathstar_movie_review/test_movie_review_demo.py | 2 +- src/cascade/dataflow/dataflow.py | 15 ++++++++------- src/cascade/runtime/python_runtime.py | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index 0bfc7fb..dc5ea92 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -152,7 +152,7 @@ def deathstar_movie_demo(client): event = frontend_op.dataflows["compose_parallel"].generate_event(r_data) result = client.send(event, block=True) print(result) - print("review composed") + print("review composed (parallel)") event = compose_op.dataflows["get_data"].generate_event({"req_id": req_id}, req_id) result = client.send(event, block=True) diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 3737336..2516ad7 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -53,8 +53,8 @@ class DataflowRef: operator_name: str dataflow_name: str - # def get_dataflow(self) -> 'DataFlow': - # return cascade_core.dataflows[self] + def get_dataflow(self) -> 'DataFlow': + return cascade.core.dataflows[self] def __repr__(self) -> str: return f"{self.operator_name}.{self.dataflow_name}" @@ -104,7 +104,7 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['E return [Event( target, new_var_map, - df, + self.dataflow, _id=event._id, metadata=event.metadata, call_stack=event.call_stack, @@ -192,6 +192,7 @@ def __init__(self, name: str, op_name: str, args: Optional[list[str]]=None): self.nodes: dict[int, Node] = {} self.entry: List[Node] = [] self.op_name = op_name + self.ref = DataflowRef(op_name, name) if args: self.args: list[str] = args else: @@ -303,9 +304,9 @@ def to_dot(self) -> str: def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None) -> list['Event']: assert len(self.entry) != 0 # give all the events the same id - first_event = Event(self.entry[0], variable_map, self, key=key) + first_event = Event(self.entry[0], variable_map, self.ref, key=key) id = first_event._id - events = [first_event] + [Event(entry, variable_map, self, _id=id, key=key) for entry in self.entry[1:]] + events = [first_event] + [Event(entry, variable_map, self.ref, _id=id, key=key) for entry in self.entry[1:]] # TODO: propogate at "compile time" instead of doing this every time local_events = [] @@ -330,7 +331,7 @@ def metadata_dict() -> dict: @dataclass class CallStackItem: - dataflow: DataFlow + dataflow: DataflowRef assign_result_to: Optional[str] var_map: dict[str, str] """Variables are saved in the call stack""" @@ -373,7 +374,7 @@ def __post_init__(self): def propogate(self, result: Any) -> Iterable[Union['EventResult', 'Event']]: """Propogate this event through the Dataflow.""" - targets = self.dataflow.get_neighbors(self.target) + targets = self.dataflow.get_dataflow().get_neighbors(self.target) events = [] diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index 6efbd90..0e48199 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -112,10 +112,10 @@ def 
_run(self): self.running = True def consume_event(event: Event): if isinstance(event.target, CallLocal): - if event.dataflow.op_name in self.statefuloperators: - yield from self.statefuloperators[event.dataflow.op_name].process(event) + if event.dataflow.operator_name in self.statefuloperators: + yield from self.statefuloperators[event.dataflow.operator_name].process(event) else: - yield from self.statelessoperators[event.dataflow.op_name].process(event) + yield from self.statelessoperators[event.dataflow.operator_name].process(event) elif isinstance(event.target, CallEntity): new_events = event.propogate(None) if isinstance(new_events, EventResult): From 1bf33696e0283e934329a92885b3991e6590811d Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Mon, 7 Apr 2025 16:07:49 +0200 Subject: [PATCH 22/37] Fix integration tests --- .../test_movie_review_demo.py | 35 ++++--- src/cascade/core.py | 7 +- src/cascade/dataflow/dataflow.py | 7 +- src/cascade/dataflow/operator.py | 12 ++- src/cascade/frontend/generator/local_block.py | 45 ++++++--- src/cascade/frontend/generator/unparser.py | 2 +- src/cascade/runtime/flink_runtime.py | 10 +- src/cascade/runtime/python_runtime.py | 2 +- .../dataflow_analysis/test_entities.py | 6 +- .../flink/test_collect_operator.py | 35 ++++--- tests/integration/flink/test_operators.py | 97 +++++++++++++++++++ .../flink/test_stateful_operators.py | 63 ------------ tests/integration/pyruntime/test_programs.py | 44 ++++++--- tests/integration/stateless.py | 8 ++ tests/optimizations/test_parallelize.py | 22 +++-- 15 files changed, 242 insertions(+), 153 deletions(-) create mode 100644 tests/integration/flink/test_operators.py delete mode 100644 tests/integration/flink/test_stateful_operators.py create mode 100644 tests/integration/stateless.py diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index dc5ea92..b38ab6f 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -5,6 +5,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) +from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.parallelization import parallelize from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime @@ -55,13 +56,9 @@ def deathstar_movie_demo(client): compose_op = cascade.core.operators["ComposeReview"] movie_op = cascade.core.operators["MovieId"] frontend_op = cascade.core.operators["Frontend"] - df = parallelize(frontend_op.dataflows["compose"]) - df.name = "compose_parallel" - frontend_op.dataflows["compose_parallel"] = df - print(frontend_op.dataflows["compose_parallel"].to_dot()) - print(frontend_op.dataflows) - assert len(frontend_op.dataflows["compose_parallel"].entry) == 4 + compose_df = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + for df in cascade.core.dataflows.values(): print(df.to_dot()) @@ -77,7 +74,7 @@ def deathstar_movie_demo(client): print("testing user create") - event = user_op.dataflows["__init__"].generate_event({"username": username, "user_data": user_data}, username) + event = cascade.core.dataflows[DataflowRef("User", "__init__")].generate_event({"username": username, "user_data": user_data}, username) result = client.send(event, block=True) print(result) assert result.username == username @@ -88,13 +85,12 @@ def 
deathstar_movie_demo(client): movie_id = 1 # make the review - event = compose_op.dataflows["__init__"].generate_event({"req_id": req_id}, req_id) + event = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")].generate_event({"req_id": req_id}, req_id) result = client.send(event, block=True) print("review made") # # make the movie - # init_movie = OpNode(MovieId, InitClass(), read_key_from="title") - event = movie_op.dataflows["__init__"].generate_event({"title": movie_title, "movie_id": movie_id}, movie_title) + event = cascade.core.dataflows[DataflowRef("MovieId", "__init__")].generate_event({"title": movie_title, "movie_id": movie_id}, movie_title) result = client.send(event, block=True) print("movie made") @@ -109,13 +105,13 @@ def deathstar_movie_demo(client): r_data = {r+"_0": v for r, v in review_data.items()} - event = frontend_op.dataflows["compose"].generate_event(r_data) + event = compose_df.generate_event(r_data) result = client.send(event, block=True) print(result) print("review composed") - event = compose_op.dataflows["get_data"].generate_event({"req_id": req_id}, req_id) + event = cascade.core.dataflows[DataflowRef("ComposeReview", "get_data")].generate_event({"req_id": req_id}, req_id) result = client.send(event, block=True) print(result) @@ -132,11 +128,18 @@ def deathstar_movie_demo(client): ## NOW DO IT PARALLEL! + df_parallel = parallelize(compose_df) + df_parallel.name = "compose_parallel" + cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 4 + + # make the review new_req_id = "43" - event = compose_op.dataflows["__init__"].generate_event({"req_id": new_req_id}, new_req_id) + event = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")].generate_event({"req_id": new_req_id}, new_req_id) result = client.send(event, block=True) - print("review made") + print("review made (parallel)") # compose the review review_data = { @@ -149,12 +152,12 @@ def deathstar_movie_demo(client): r_data = {r+"_0": v for r, v in review_data.items()} - event = frontend_op.dataflows["compose_parallel"].generate_event(r_data) + event = df_parallel.generate_event(r_data) result = client.send(event, block=True) print(result) print("review composed (parallel)") - event = compose_op.dataflows["get_data"].generate_event({"req_id": req_id}, req_id) + event = cascade.core.dataflows[DataflowRef("ComposeReview", "get_data")].generate_event({"req_id": req_id}, req_id) result = client.send(event, block=True) print(result) diff --git a/src/cascade/core.py b/src/cascade/core.py index 2d98076..4e09ad3 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -67,9 +67,7 @@ def init(): for method in cls.class_desc.methods_dec: df_ref = DataflowRef(op_name, method.method_name) # Add version number manually - print(df_ref) args = [f"{str(arg)}_0" for arg in method.method_node.args.args] - print(args) # TODO: cleaner solution that checks if the function is stateful or not if len(args) > 0 and args[0] == "self_0": args = args[1:] @@ -92,9 +90,10 @@ def init(): else: df = DataflowBuilder(method.method_node).build(dataflows, op_name) - op.dataflows[df.name] = df + dataflows[df.ref()] = df + # op.dataflows[df.name] = df for name, b in df.blocks.items(): - op.methods[name] = b + op.methods[name] = b.compile() def get_operator(op_name: str): diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 2516ad7..bdcc9d5 100644 --- a/src/cascade/dataflow/dataflow.py +++ 
b/src/cascade/dataflow/dataflow.py @@ -192,13 +192,14 @@ def __init__(self, name: str, op_name: str, args: Optional[list[str]]=None): self.nodes: dict[int, Node] = {} self.entry: List[Node] = [] self.op_name = op_name - self.ref = DataflowRef(op_name, name) if args: self.args: list[str] = args else: self.args = [] self.blocks: dict[str, 'LocalBlock'] = {} + def ref(self) -> DataflowRef: + return DataflowRef(self.op_name, self.name) # def get_operator(self) -> Operator: # return cascade.core.operators[self.op_name] @@ -304,9 +305,9 @@ def to_dot(self) -> str: def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None) -> list['Event']: assert len(self.entry) != 0 # give all the events the same id - first_event = Event(self.entry[0], variable_map, self.ref, key=key) + first_event = Event(self.entry[0], variable_map, self.ref(), key=key) id = first_event._id - events = [first_event] + [Event(entry, variable_map, self.ref, _id=id, key=key) for entry in self.entry[1:]] + events = [first_event] + [Event(entry, variable_map, self.ref(), _id=id, key=key) for entry in self.entry[1:]] # TODO: propogate at "compile time" instead of doing this every time local_events = [] diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py index ecbe7fb..f6c8580 100644 --- a/src/cascade/dataflow/operator.py +++ b/src/cascade/dataflow/operator.py @@ -2,14 +2,14 @@ from typing import Any, Generic, Mapping, Protocol, Type, TypeVar, TYPE_CHECKING if TYPE_CHECKING: - from cascade.frontend.generator.local_block import LocalBlock + from cascade.frontend.generator.local_block import CompiledLocalBlock from cascade.dataflow.dataflow import DataFlow, InvokeMethod T = TypeVar('T') class Operator(ABC): dataflows: dict[str, 'DataFlow'] - methods: Mapping[str, 'LocalBlock'] + methods: Mapping[str, 'CompiledLocalBlock'] @abstractmethod def name(self) -> str: @@ -57,7 +57,7 @@ class StatefulOperator(Generic[T], Operator): methods, instead reading and modifying the underlying class `T` through a state variable, see `handle_invoke_method`. """ - def __init__(self, entity: Type[T], methods: dict[str, 'LocalBlock'], dataflows: dict[str, 'DataFlow']): + def __init__(self, entity: Type[T], methods: dict[str, 'CompiledLocalBlock'], dataflows: dict[str, 'DataFlow']): """Create the StatefulOperator from a class and its compiled methods. Typically, a class could be comprised of split and non-split methods. Take the following example: @@ -137,11 +137,12 @@ def name(self): class StatelessOperator(Operator): """A StatelessOperator refers to a stateless function and therefore only has one dataflow.""" - def __init__(self, entity: Type, methods: dict[str, 'LocalBlock'], dataflows: dict[str, 'DataFlow']): + def __init__(self, entity: Type, methods: dict[str, 'CompiledLocalBlock'], dataflows: dict[str, 'DataFlow']): self.entity = entity # TODO: extract this from dataflows.blocks self.methods = methods - self.dataflows = dataflows + # self.dataflows = dataflows + pass def handle_invoke_method(self, method: 'InvokeMethod', variable_map: dict[str, Any]): """Invoke the method of the underlying class. 
@@ -157,6 +158,7 @@ def get_method_rw_set(self, method_name: str): return super().get_method_rw_set(method_name) def name(self) -> str: + # return "SomeStatelessOp" return self.entity.__name__ diff --git a/src/cascade/frontend/generator/local_block.py b/src/cascade/frontend/generator/local_block.py index 921726d..d8617b1 100644 --- a/src/cascade/frontend/generator/local_block.py +++ b/src/cascade/frontend/generator/local_block.py @@ -1,5 +1,5 @@ from textwrap import indent -from typing import Any, Union, TYPE_CHECKING +from typing import Any, Callable, Union, TYPE_CHECKING from cascade.frontend.cfg import Statement @@ -77,23 +77,15 @@ def __init__(self, statements: list[Statement], method_base_name: str, block_num self.reads: set[str] = reads self.writes: set[str] = writes - self.function: Union['MethodCall', 'StatelessMethodCall'] = None - self.compile_function() - def call_block(self, *args, **kwargs) -> Any: - assert self.function is not None - return self.function(*args, **kwargs) + def compile(self) -> 'CompiledLocalBlock': + return CompiledLocalBlock(self) - def compile_function(self): + def compile_function(self) -> Callable: local_scope = {} exec(self.to_string(), {}, local_scope) method_name = self.get_method_name() - self.function = local_scope[method_name] - - def merge_with(self, other: 'LocalBlock'): - self.reads.update(other.reads) - self.writes.update(other.writes) - self.compile_function() + return local_scope[method_name] def to_node(self) -> CallLocal: return CallLocal(InvokeMethod(self.get_method_name())) @@ -136,4 +128,29 @@ def body_to_string(self) -> str: if not (v in [ 'self_0','self']): body.append(f'variable_map[\'{v}\'] = {v}') body.append('return None') - return "\n".join(body) \ No newline at end of file + return "\n".join(body) + + +class CompiledLocalBlock: + def __init__(self, block: LocalBlock): + self.method_base_name: str = block.method_base_name + self.block_num: int = block.block_num + self.class_name: str = block.class_name + + self.reads = block.reads + self.writes = block.writes + self.function_string = block.to_string() + self.function: Union['MethodCall', 'StatelessMethodCall'] = block.compile_function() + + def call_block(self, *args, **kwargs) -> Any: + return self.function(*args, **kwargs) + + + # def to_node(self) -> CallLocal: + # return CallLocal(InvokeMethod(self.get_method_name())) + + # def get_method_name(self): + # return f"{self.method_base_name}_{self.block_num}" + + # def get_method_signature(self) -> str: + # return f'variable_map, state' diff --git a/src/cascade/frontend/generator/unparser.py b/src/cascade/frontend/generator/unparser.py index 76f57f0..06bafc9 100644 --- a/src/cascade/frontend/generator/unparser.py +++ b/src/cascade/frontend/generator/unparser.py @@ -37,7 +37,7 @@ def unparse(block: RawBasicBlock): return repr(block) case nodes.If: print(block.test, block.body, block.orelse) - raise NotImplementedError(type(block)) + raise NotImplementedError(type(block), "Should have been removed in previous CFG pass") case nodes.FunctionDef: return str(block).replace('"', "'") case _: diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index e8f7922..7aadea1 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -63,10 +63,10 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): if isinstance(event.target, CallLocal): logger.debug(event) - if event.dataflow.op_name in self.stateful_ops: - tag = 
self.stateful_ops[event.dataflow.op_name] + if event.dataflow.operator_name in self.stateful_ops: + tag = self.stateful_ops[event.dataflow.operator_name] else: - tag = self.stateless_ops[event.dataflow.op_name] + tag = self.stateless_ops[event.dataflow.operator_name] else: logger.error(f"FanOut: Wrong target: {event}") @@ -95,7 +95,7 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): assert(isinstance(event.target, CallLocal)) logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Processing: {event.target.method}") - assert(event.dataflow.op_name == self.operator.name()) + assert(event.dataflow.operator_name == self.operator.name()) key = ctx.get_current_key() assert(key is not None) @@ -236,7 +236,7 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): var_map_num_items = self.var_map.value() logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Processing: {event}") - total_events = len(event.dataflow.get_predecessors(event.target)) + total_events = len(event.dataflow.get_dataflow().get_predecessors(event.target)) # Add to the map if var_map_num_items == None: diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index 0e48199..c7f48e1 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -71,7 +71,7 @@ def process(self, event: Event): else: self.state[key].append(event) - n = len(event.dataflow.get_predecessors(event.target)) + n = len(event.dataflow.get_dataflow().get_predecessors(event.target)) print(f"PythonCollectOperator: collected {len(self.state[key])}/{n} for event {event._id}") if len(self.state[key]) == n: diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index 4fbf785..565391a 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -68,7 +68,7 @@ def add(x: int, y: int): df = sf.build(dataflows, "Test") assert len(df.blocks) == 1 - assert list(df.blocks.values())[0].call_block({"x_0": 3, "y_0":5 }, None) == 8 + assert list(df.blocks.values())[0].compile().call_block({"x_0": 3, "y_0":5 }, None) == 8 def test_state(): @@ -97,7 +97,7 @@ def buy_item(self, item: 'Item') -> bool: blocks = list(df.blocks.values()) assert len(blocks) == 1 - func = blocks[0].call_block + func = blocks[0].compile().call_block print(blocks[0].to_string()) @dataclass @@ -143,7 +143,7 @@ class ComposeReview: req_id: str review_data: dict - func = blocks[0].call_block + func = blocks[0].compile().call_block print(blocks[0].to_string()) diff --git a/tests/integration/flink/test_collect_operator.py b/tests/integration/flink/test_collect_operator.py index a76bc97..113ab66 100644 --- a/tests/integration/flink/test_collect_operator.py +++ b/tests/integration/flink/test_collect_operator.py @@ -1,6 +1,7 @@ """A test script for dataflows with merge operators""" from pyflink.datastream.data_stream import CloseableIterator +from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.parallelization import parallelize import tests.integration.flink.utils as utils @@ -31,39 +32,47 @@ def test_collect_operator(): def _test_collect_operator(client, collector): user_op = cascade.core.operators["User"] item_op = cascade.core.operators["Item"] - df = parallelize(user_op.dataflows["buy_2_items"]) - df.name = "buy_2_parallel" - user_op.dataflows["buy_2_parallel"] = df - 
print(user_op.dataflows["buy_2_parallel"].to_dot()) - print(user_op.dataflows) - assert len(user_op.dataflows["buy_2_parallel"].entry) == 2 + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + user_get_balance = cascade.core.dataflows[DataflowRef("User", "get_balance")] - event = user_op.dataflows["__init__"].generate_event({"key": "foo", "balance": 100}, key="foo") + df_parallel = parallelize(user_buy_2) + df_parallel.name = "buy_2_parallel" + cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] = df_parallel + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 2 + + + event = user_init.generate_event({"key": "foo", "balance": 100}, key="foo") client.send(event) result = wait_for_event_id(event[0]._id, collector) - print(result.result.__dict__) - event = item_op.dataflows["__init__"].generate_event({"key": "fork", "price": 5}, key="fork") + event = item_init.generate_event({"key": "fork", "price": 5}, key="fork") client.send(event) - event = item_op.dataflows["__init__"].generate_event({"key": "spoon", "price": 3}, key="spoon") + event = item_init.generate_event({"key": "spoon", "price": 3}, key="spoon") client.send(event) result = wait_for_event_id(event[0]._id, collector) - print(result.result.__dict__) # Buy a fork and spoon - event = user_op.dataflows["buy_2_parallel"].generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") + print("sending buy 2") + print(df_parallel.to_dot()) + event = df_parallel.generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") + print(event) client.send(event) result = wait_for_event_id(event[0]._id, collector) assert result.result == True # Check the balance - event = user_op.dataflows["get_balance"].generate_event({}, key="foo") + event = user_get_balance.generate_event({}, key="foo") client.send(event) result = wait_for_event_id(event[0]._id, collector) assert result.result == (100 - 5 - 3) + + diff --git a/tests/integration/flink/test_operators.py b/tests/integration/flink/test_operators.py new file mode 100644 index 0000000..8c92616 --- /dev/null +++ b/tests/integration/flink/test_operators.py @@ -0,0 +1,97 @@ +"""A test script for dataflows with merge operators""" + +from pyflink.datastream.data_stream import CloseableIterator +from cascade.dataflow.dataflow import DataflowRef, Event +import tests.integration.flink.utils as utils +from tests.integration.flink.utils import wait_for_event_id + +import pytest + +import cascade +import logging + +@pytest.mark.integration +def test_stateful_operator(): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + runtime, client = utils.init_flink_runtime("tests.integration.common") + collector = runtime.run(run_async=True, output="collect") + assert isinstance(collector, CloseableIterator) + + try: + _test_stateful_operator(client, collector) + finally: + collector.close() + client.close() + + +def _test_stateful_operator(client, collector): + + user_op = cascade.core.operators["User"] + item_op = cascade.core.operators["Item"] + + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + user_get_balance = cascade.core.dataflows[DataflowRef("User", "get_balance")] + + event = user_init.generate_event({"key": "foo", 
"balance": 100}, key="foo") + client.send(event) + + result = wait_for_event_id(event[0]._id, collector) + print(result.result.__dict__) + + event = item_init.generate_event({"key": "fork", "price": 5}, key="fork") + client.send(event) + + event = item_init.generate_event({"key": "spoon", "price": 3}, key="spoon") + client.send(event) + + result = wait_for_event_id(event[0]._id, collector) + print(result.result.__dict__) + + + print(user_buy_2.to_dot()) + + # Buy a fork and spoon + event = user_buy_2.generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == True + + + # Check the balance + event = user_get_balance.generate_event({}, key="foo") + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == (100 - 5 - 3) + + +@pytest.mark.integration +def test_stateless_operator(): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + runtime, client = utils.init_flink_runtime("tests.integration.stateless") + collector = runtime.run(run_async=True, output="collect") + assert isinstance(collector, CloseableIterator) + + try: + _test_stateless_operator(client, collector) + finally: + collector.close() + client.close() + + +def _test_stateless_operator(client, collector): + user_op = cascade.core.operators["SomeStatelessOp"] + event = cascade.core.dataflows[DataflowRef("SomeStatelessOp", "get")].generate_event({}) + client.send(event) + + result = wait_for_event_id(event[0]._id, collector) + assert result.result == 42 \ No newline at end of file diff --git a/tests/integration/flink/test_stateful_operators.py b/tests/integration/flink/test_stateful_operators.py deleted file mode 100644 index b7d8479..0000000 --- a/tests/integration/flink/test_stateful_operators.py +++ /dev/null @@ -1,63 +0,0 @@ -"""A test script for dataflows with merge operators""" - -from pyflink.datastream.data_stream import CloseableIterator -import tests.integration.flink.utils as utils -from tests.integration.flink.utils import wait_for_event_id - -import pytest - -import cascade -import logging - -@pytest.mark.integration -def test_stateful_operator(): - logger = logging.getLogger("cascade") - logger.setLevel("DEBUG") - - utils.create_topics() - - runtime, client = utils.init_flink_runtime("tests.integration.common") - collector = runtime.run(run_async=True, output="collect") - assert isinstance(collector, CloseableIterator) - - try: - _test_stateful_operator(client, collector) - finally: - collector.close() - client.close() - - -def _test_stateful_operator(client, collector): - - user_op = cascade.core.operators["User"] - item_op = cascade.core.operators["Item"] - event = user_op.dataflows["__init__"].generate_event({"key": "foo", "balance": 100}, key="foo") - client.send(event) - - result = wait_for_event_id(event[0]._id, collector) - print(result.result.__dict__) - - event = item_op.dataflows["__init__"].generate_event({"key": "fork", "price": 5}, key="fork") - client.send(event) - - event = item_op.dataflows["__init__"].generate_event({"key": "spoon", "price": 3}, key="spoon") - client.send(event) - - result = wait_for_event_id(event[0]._id, collector) - print(result.result.__dict__) - - - print(user_op.dataflows["buy_2_items"].to_dot()) - - # Buy a fork and spoon - event = user_op.dataflows["buy_2_items"].generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") - client.send(event) - result = 
wait_for_event_id(event[0]._id, collector) - assert result.result == True - - - # Check the balance - event = user_op.dataflows["get_balance"].generate_event({}, key="foo") - client.send(event) - result = wait_for_event_id(event[0]._id, collector) - assert result.result == (100 - 5 - 3) diff --git a/tests/integration/pyruntime/test_programs.py b/tests/integration/pyruntime/test_programs.py index b6d4ed9..e2307a6 100644 --- a/tests/integration/pyruntime/test_programs.py +++ b/tests/integration/pyruntime/test_programs.py @@ -2,6 +2,7 @@ import cascade import sys +from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime from tests.integration.pyruntime.utils import init_python_runtime @@ -13,28 +14,31 @@ def test_checkout_item(): runtime, client = init_python_runtime(file_name) item_op = cascade.core.operators["Item"] user_op = cascade.core.operators["User"] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + user_buy_item = cascade.core.dataflows[DataflowRef("User", "buy_item")] + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] - event = item_op.dataflows["__init__"].generate_event({"item_name": "fork", "price": 10}, key="fork") + event = item_init.generate_event({"item_name": "fork", "price": 10}, key="fork") result = client.send(event) assert result.price == 10 assert result.item_name == "fork" - event = item_op.dataflows["__init__"].generate_event({"item_name": "spoon", "price": 20}, key="spoon") + event = item_init.generate_event({"item_name": "spoon", "price": 20}, key="spoon") result = client.send(event) assert result.price == 20 assert result.__key__() == "spoon" - event = user_op.dataflows["__init__"].generate_event({"username": "test", "balance": 15}, key="test") + event = user_init.generate_event({"username": "test", "balance": 15}, key="test") user = client.send(event) assert user.balance == 15 assert user.__key__() == "test" - event = user_op.dataflows["buy_item"].generate_event({"item_0": "fork"}, key=user.__key__()) + event = user_buy_item.generate_event({"item_0": "fork"}, key=user.__key__()) result = client.send(event) assert runtime.statefuloperators["User"].states["test"]["balance"] == 5 assert result - event = user_op.dataflows["buy_item"].generate_event({"item_0": "spoon"}, key=user.__key__()) + event = user_buy_item.generate_event({"item_0": "spoon"}, key=user.__key__()) result = client.send(event) assert runtime.statefuloperators["User"].states["test"]["balance"] == -15 assert not result @@ -46,55 +50,65 @@ def test_operator_chaining(): a_op = cascade.core.operators["A"] b_op = cascade.core.operators["B"] c_op = cascade.core.operators["C"] + a_init = cascade.core.dataflows[DataflowRef("A", "__init__")] + b_init = cascade.core.dataflows[DataflowRef("B", "__init__")] + c_init = cascade.core.dataflows[DataflowRef("C", "__init__")] + c_get = cascade.core.dataflows[DataflowRef("C", "get")] + b_call_c = cascade.core.dataflows[DataflowRef("B", "call_c")] + a_call_c = cascade.core.dataflows[DataflowRef("A", "call_c_thru_b")] - event = a_op.dataflows["__init__"].generate_event({"key": "aaa"}, key="aaa") + event = a_init.generate_event({"key": "aaa"}, key="aaa") result = client.send(event) assert result.key == "aaa" - event = b_op.dataflows["__init__"].generate_event({"key": "bbb"}, key="bbb") + event = b_init.generate_event({"key": "bbb"}, key="bbb") result = client.send(event) assert result.key == 
"bbb" - event = c_op.dataflows["__init__"].generate_event({"key": "ccc"}, key="ccc") + event = c_init.generate_event({"key": "ccc"}, key="ccc") result = client.send(event) assert result.key == "ccc" - event = c_op.dataflows["get"].generate_event({"y_0": 0}, key="ccc") + event = c_get.generate_event({"y_0": 0}, key="ccc") result = client.send(event) assert result == 42 print("Call C") - event = b_op.dataflows["call_c"].generate_event({ "c_0": "ccc"}, key="bbb") + event = b_call_c.generate_event({ "c_0": "ccc"}, key="bbb") print(event) result = client.send(event) assert result == 42 print("call C thru B") - event = a_op.dataflows["call_c_thru_b"].generate_event({"b_0": "bbb", "c_0": "ccc"}, key="aaa") + event = a_call_c.generate_event({"b_0": "bbb", "c_0": "ccc"}, key="aaa") result = client.send(event) assert result == 84 def test_branches(): file_name = "if_else_branches.py" + raise NotImplementedError("finish if else branhces test") runtime, client = init_python_runtime(file_name) item_op = cascade.core.operators["Item"] user_op = cascade.core.operators["User"] + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] for df in user_op.dataflows.values(): print(df.to_dot()) - event = item_op.dataflows["__init__"].generate_event({"item_name": "fork", "price": 10}, key="fork") + event = item_init.generate_event({"item_name": "fork", "price": 10}, key="fork") result = client.send(event) assert result.price == 10 assert result.item_name == "fork" - event = item_op.dataflows["__init__"].generate_event({"item_name": "spoon", "price": 20}, key="spoon") + event = item_init.generate_event({"item_name": "spoon", "price": 20}, key="spoon") result = client.send(event) assert result.price == 20 assert result.__key__() == "spoon" - event = user_op.dataflows["__init__"].generate_event({"username": "test", "balance": 15}, key="test") + event = user_init.generate_event({"username": "test", "balance": 15}, key="test") user = client.send(event) assert user.balance == 15 - assert user.__key__() == "test" \ No newline at end of file + assert user.__key__() == "test" + diff --git a/tests/integration/stateless.py b/tests/integration/stateless.py new file mode 100644 index 0000000..2af9507 --- /dev/null +++ b/tests/integration/stateless.py @@ -0,0 +1,8 @@ +import cascade + + +@cascade.cascade +class SomeStatelessOp: + @staticmethod + def get() -> int: + return 42 \ No newline at end of file diff --git a/tests/optimizations/test_parallelize.py b/tests/optimizations/test_parallelize.py index 941dfce..2e557bc 100644 --- a/tests/optimizations/test_parallelize.py +++ b/tests/optimizations/test_parallelize.py @@ -6,6 +6,7 @@ # import cascade sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../src"))) +from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.parallelization import parallelize from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime import cascade @@ -24,16 +25,17 @@ def test_parallelize(): test_op = cascade.core.operators["Test"] adder_op = cascade.core.operators["Adder"] stock_op = cascade.core.operators["Stock"] - df = test_op.dataflows["get_total"] + stock_init = cascade.core.dataflows[DataflowRef("Stock", "__init__")] + df = cascade.core.dataflows[DataflowRef("Test", "get_total")] print(df) print(df.nodes) - df = parallelize(test_op.dataflows[df.name]) - df.name = "get_total_parallel" - test_op.dataflows[df.name] = df + df_parallel = 
parallelize(df) + df_parallel.name = "get_total_parallel" + cascade.core.dataflows[DataflowRef("Test", "get_total_parallel")] = df_parallel - assert len(test_op.dataflows["get_total_parallel"].entry) == 2 - assert len(test_op.dataflows["get_total"].entry) == 1 + assert len(df_parallel.entry) == 2 + assert len(df.entry) == 1 runtime = PythonRuntime() runtime.add_stateless_operator(test_op) @@ -43,17 +45,17 @@ def test_parallelize(): client = PythonClientSync(runtime) - event = stock_op.dataflows["__init__"].generate_event({"item": "fork", "quantity": 10}, key="fork") + event = stock_init.generate_event({"item": "fork", "quantity": 10}, key="fork") result = client.send(event) - event = stock_op.dataflows["__init__"].generate_event({"item": "spoon", "quantity": 20}, key="spoon") + event = stock_init.generate_event({"item": "spoon", "quantity": 20}, key="spoon") result = client.send(event) - event = test_op.dataflows["get_total"].generate_event({"item1_0": "fork", "item2_0": "spoon"}) + event = df.generate_event({"item1_0": "fork", "item2_0": "spoon"}) result = client.send(event) assert result == 30 - event = test_op.dataflows["get_total_parallel"].generate_event({"item1_0": "fork", "item2_0": "spoon"}) + event = df_parallel.generate_event({"item1_0": "fork", "item2_0": "spoon"}) result = client.send(event) assert result == 30 From 0cdc9a72ad6eabbe448c5d2d72f412849f903228 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Mon, 7 Apr 2025 17:35:26 +0200 Subject: [PATCH 23/37] Improve if/else test coverage --- src/cascade/frontend/cfg/cfg_builder.py | 3 + .../frontend/cfg/control_flow_graph.py | 2 +- .../frontend/generator/dataflow_builder.py | 34 ++-- src/cascade/runtime/flink_runtime.py | 2 +- .../dataflow_analysis/test_branches.py | 167 ++++++++++++++++++ .../dataflow_analysis/test_entities.py | 8 +- tests/frontend/test_frontend.py | 5 - .../integration/pyruntime/if_else_branches.py | 48 ----- tests/integration/pyruntime/test_programs.py | 29 --- .../{test_ops.py => entities.py} | 0 tests/optimizations/test_parallelize.py | 2 +- 11 files changed, 200 insertions(+), 100 deletions(-) create mode 100644 tests/frontend/dataflow_analysis/test_branches.py delete mode 100644 tests/frontend/test_frontend.py delete mode 100644 tests/integration/pyruntime/if_else_branches.py rename tests/optimizations/{test_ops.py => entities.py} (100%) diff --git a/src/cascade/frontend/cfg/cfg_builder.py b/src/cascade/frontend/cfg/cfg_builder.py index 390abd0..28c3c24 100644 --- a/src/cascade/frontend/cfg/cfg_builder.py +++ b/src/cascade/frontend/cfg/cfg_builder.py @@ -36,6 +36,9 @@ def make_cfg(self, blocks: list, i = 0) -> tuple[ControlFlowGraph, int]: graph.append_subgraph(cond, subgraph_body, type=True) graph.append_subgraph(cond, subgraph_orelse, type=False) + if subgraph_orelse.graph.number_of_nodes() == 0: + raise NotImplementedError("dataflow structure for if without else is not correct yet") + # The next node should connect to both subgraph graph._last_node = subgraph_body._last_node + subgraph_orelse._last_node else: diff --git a/src/cascade/frontend/cfg/control_flow_graph.py b/src/cascade/frontend/cfg/control_flow_graph.py index 060831c..af26c69 100644 --- a/src/cascade/frontend/cfg/control_flow_graph.py +++ b/src/cascade/frontend/cfg/control_flow_graph.py @@ -38,7 +38,7 @@ def append_statement(self, node: Statement): def append_subgraph(self, to_node: Statement, subgraph: 'ControlFlowGraph', **edge_attr): - if subgraph.graph.number_of_nodes == 0: + if 
subgraph.graph.number_of_nodes() == 0: return for node in subgraph.get_nodes(): self.graph.add_node(node) diff --git a/src/cascade/frontend/generator/dataflow_builder.py b/src/cascade/frontend/generator/dataflow_builder.py index 894606e..6875a09 100644 --- a/src/cascade/frontend/generator/dataflow_builder.py +++ b/src/cascade/frontend/generator/dataflow_builder.py @@ -143,26 +143,38 @@ def blocked_cfg(statement_graph: nx.DiGraph, entry: Statement) -> nx.DiGraph: # check the first node after completed succ_then = list(statement_graph.successors(last_then[-1])) succ_orelse = list(statement_graph.successors(last_orelse[-1])) - assert len(succ_then) == 1 - assert len(succ_orelse) == 1 - assert succ_orelse[0] == succ_then[0] + + if len(succ_then) == 1 and len(succ_orelse) == 1: + assert succ_orelse[0] == succ_then[0] + + assert len(succ_then) <= 1 + assert len(succ_orelse) <= 1 - first_finally = succ_orelse[0] - finally_graph = blocked_cfg(statement_graph, first_finally) + + # add then and orelse blocks graph.add_edges_from(then_blocked_graph.edges()) graph.add_edges_from(orelse_blocked_graph.edges()) - graph.add_edges_from(finally_graph.edges()) - + # connect them to this node first_then = list(then_blocked_graph.nodes)[0] first_orelse = list(orelse_blocked_graph.nodes)[0] - first_finally = list(finally_graph.nodes)[0] - graph.add_edge(last_node, first_then) graph.add_edge(last_node, first_orelse) - graph.add_edge(last_then, first_finally) - graph.add_edge(last_orelse, first_finally) + + # connect the rest of the graph at the end (recursively) + if len(succ_then) == 1 or len(succ_orelse) == 1: + try: + first_finally = succ_orelse[0] + except IndexError: + first_finally = succ_then[0] + finally_graph = blocked_cfg(statement_graph, first_finally) + graph.add_edges_from(finally_graph.edges()) + first_finally = list(finally_graph.nodes)[0] + + graph.add_edge(last_then, first_finally) + graph.add_edge(last_orelse, first_finally) + return graph else: raise ValueError(f"We expect a CFG node to have max 2 successors, got {succ}") diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index 7aadea1..e49093e 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -165,7 +165,7 @@ def process_element(self, event: Event, ctx: ProcessFunction.Context): if isinstance(event.target.method, InvokeMethod): result = self.operator.handle_invoke_method(event.target.method, variable_map=event.variable_map) else: - raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method_type}") + raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method}") new_events = list(event.propogate(result)) diff --git a/tests/frontend/dataflow_analysis/test_branches.py b/tests/frontend/dataflow_analysis/test_branches.py new file mode 100644 index 0000000..a5f54cc --- /dev/null +++ b/tests/frontend/dataflow_analysis/test_branches.py @@ -0,0 +1,167 @@ +from textwrap import dedent + +from cascade.dataflow.dataflow import DataFlow, DataflowRef, IfNode +from cascade.frontend.generator.dataflow_builder import DataflowBuilder +from cascade.frontend.util import setup_cfg +from klara.core import nodes + + +def test_easy_branching(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + cond = self.balance - item_price >= 0 + if cond: + self.balance = self.balance - item_price + else: + x = 10 + return self.balance""") + cfg = setup_cfg(program) + blocks = 
cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + assert len(df.nodes) == 6 + ifnode = None + for node in df.nodes.values(): + if isinstance(node, IfNode): + assert ifnode is None + ifnode = node + + assert ifnode is not None + assert len(ifnode.outgoing_edges) == 2 + + +def test_complex_predicate(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + if self.balance >= item_price: + self.balance = self.balance - item_price + else: + x = 10 + return self.balance""") + cfg = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + assert len(df.nodes) == 6, "complex predicate should create a temp variable assignment" + + +def test_multiple_return(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + cond = self.balance - item_price >= 0 + if cond: + item_price = item.get_price() + self.balance = self.balance - item_price + return "item bought" + else: + item_price = item.get_price() + msg = str(item_price) + " is too expensive!" 
+ return msg""") + cfg = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + +def test_no_else(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + cond1 = self.balance - item_price >= 0 + if cond1: + item_price = item.get_price() + self.balance = self.balance - item_price + x = 0 + return item_price""") + cfg = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + assert len(df.nodes) == 6 + +def test_nested(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + cond1 = self.balance - item_price >= 0 + if cond1: + item_price = item.get_price() + if True: + x = 20 + self.balance = self.balance - item_price + return "item bought" + else: + if True: + x = 20 + else: + x = 30 + item_price = item.get_price() + msg = "item is too expensive!" + return msg""") + cfg = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + assert len(df.nodes) == 12 \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index 565391a..1bc3df6 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -20,7 +20,7 @@ def get_total(item1: Stock, item2: Stock): return a+b""") cfg: Cfg = setup_cfg(program) blocks = cfg.block_list - test_class: nodes.Block = blocks[2] + test_class = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] sf = DataflowBuilder(get_total) @@ -56,7 +56,7 @@ def add(x: int, y: int): return x+y""") cfg: Cfg = setup_cfg(program) blocks = cfg.block_list - test_class: nodes.Block = blocks[2] + test_class = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] sf = DataflowBuilder(get_total) @@ -82,7 +82,7 @@ def buy_item(self, item: 'Item') -> bool: cfg: Cfg = setup_cfg(program) blocks = cfg.block_list - user_class: nodes.Block = blocks[2] + user_class = blocks[2] buy_item: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] sf = DataflowBuilder(buy_item) @@ -121,7 +121,7 @@ def upload_unique_id(self, review_id: int): cfg: Cfg = 
setup_cfg(program) blocks = cfg.block_list - user_class: nodes.Block = blocks[2] + user_class = blocks[2] upload_unique: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] sf = DataflowBuilder(upload_unique) diff --git a/tests/frontend/test_frontend.py b/tests/frontend/test_frontend.py deleted file mode 100644 index b5f16f7..0000000 --- a/tests/frontend/test_frontend.py +++ /dev/null @@ -1,5 +0,0 @@ -def test_ifs(): - pass - -def test_whiles(): - pass \ No newline at end of file diff --git a/tests/integration/pyruntime/if_else_branches.py b/tests/integration/pyruntime/if_else_branches.py deleted file mode 100644 index 024a8b8..0000000 --- a/tests/integration/pyruntime/if_else_branches.py +++ /dev/null @@ -1,48 +0,0 @@ -import cascade - -@cascade.cascade -class User: - def __init__(self, username: str, balance: int): - self.username = username - self.balance = balance - - def buy_item_easy(self, item: 'Item') -> int: - item_price = item.get_price() - cond = self.balance - item_price >= 0 - if cond: - self.balance = self.balance - item_price - else: - x = 10 - return self.balance - - # def buy_item_pred(self, item: 'Item') -> int: - # item_price = item.get_price() - # if self.balance - item_price >= 0: - # self.balance = self.balance - item_price - # return self.balance - - # def buy_item_else(self, item: 'Item') -> str: - # item_price = item.get_price() - # if self.balance - item_price >= 0: - # item_price = item.get_price() - # self.balance = self.balance - item_price - # return "item bought" - # else: - # item_price = item.get_price() - # msg = str(item_price) + " is too expensive!" - # return msg - - def __key__(self) -> str: - return self.username - -@cascade.cascade -class Item: - def __init__(self, item_name: str, price: int): - self.item_name = item_name - self.price = price - - def get_price(self) -> int: - return self.price - - def __key__(self) -> str: - return self.item_name diff --git a/tests/integration/pyruntime/test_programs.py b/tests/integration/pyruntime/test_programs.py index e2307a6..785bb80 100644 --- a/tests/integration/pyruntime/test_programs.py +++ b/tests/integration/pyruntime/test_programs.py @@ -83,32 +83,3 @@ def test_operator_chaining(): event = a_call_c.generate_event({"b_0": "bbb", "c_0": "ccc"}, key="aaa") result = client.send(event) assert result == 84 - -def test_branches(): - file_name = "if_else_branches.py" - raise NotImplementedError("finish if else branhces test") - - runtime, client = init_python_runtime(file_name) - item_op = cascade.core.operators["Item"] - user_op = cascade.core.operators["User"] - item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] - user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] - - for df in user_op.dataflows.values(): - print(df.to_dot()) - - event = item_init.generate_event({"item_name": "fork", "price": 10}, key="fork") - result = client.send(event) - assert result.price == 10 - assert result.item_name == "fork" - - event = item_init.generate_event({"item_name": "spoon", "price": 20}, key="spoon") - result = client.send(event) - assert result.price == 20 - assert result.__key__() == "spoon" - - event = user_init.generate_event({"username": "test", "balance": 15}, key="test") - user = client.send(event) - assert user.balance == 15 - assert user.__key__() == "test" - diff --git a/tests/optimizations/test_ops.py b/tests/optimizations/entities.py similarity index 100% rename from tests/optimizations/test_ops.py rename to tests/optimizations/entities.py diff --git 
a/tests/optimizations/test_parallelize.py b/tests/optimizations/test_parallelize.py index 2e557bc..7ac51bf 100644 --- a/tests/optimizations/test_parallelize.py +++ b/tests/optimizations/test_parallelize.py @@ -16,7 +16,7 @@ def test_parallelize(): assert not cascade.core.registered_classes, "Registered classes should be empty before importing a Cascade \ Module" # import the module - import_module_name: str = 'test_ops' + import_module_name: str = 'entities' exec(f'import tests.optimizations.{import_module_name}') cascade.core.init() From 243d3dd4159339966dcdbf15b8c0e7fae6573382 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 8 Apr 2025 12:04:33 +0200 Subject: [PATCH 24/37] Add ifnode to integration to pyruntime --- src/cascade/dataflow/dataflow.py | 41 ++++++++++++++++--- .../frontend/generator/dataflow_builder.py | 18 ++++---- src/cascade/runtime/python_runtime.py | 9 ++-- .../dataflow_analysis/test_split_functions.py | 16 +++++--- tests/integration/pyruntime/branching.py | 24 +++++++++++ tests/integration/pyruntime/test_programs.py | 16 ++++++++ 6 files changed, 100 insertions(+), 24 deletions(-) create mode 100644 tests/integration/pyruntime/branching.py diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index bdcc9d5..2c4b57c 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -44,9 +44,35 @@ def __post_init__(self): def propogate(self, event: 'Event', targets: list['Node'], result: Any, **kwargs) -> list['Event']: pass +@dataclass class IfNode(Node): + predicate_var: str + def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: - return super().propogate(event, targets, result, **kwargs) + + if_cond = event.variable_map[self.predicate_var] + print(self.predicate_var) + print(if_cond) + targets = [] + for edge in event.target.outgoing_edges: + assert edge.if_conditional is not None + if edge.if_conditional == if_cond: + targets.append(edge.to_node) + + + events = [] + for target in targets: + ev = Event( + target, + event.variable_map, + event.dataflow, + call_stack=event.call_stack, + _id=event._id, + metadata=event.metadata, + key=event.key) + + events.append(ev) + return events @dataclass class DataflowRef: @@ -295,9 +321,14 @@ def to_dot(self) -> str: lines.append(f' {node.id} [label="{node}"];') # Add edges - for from_id, to_ids in self.adjacency_list.items(): - for to_id in to_ids: - lines.append(f" {from_id} -> {to_id};") + for node in self.nodes.values(): + for edge in node.outgoing_edges: + + line = f" {edge.from_node.id} -> {edge.to_node.id}" + if edge.if_conditional is not None: + line += f' [label="{edge.if_conditional}"]' + line += ";" + lines.append(line) lines.append("}") return "\n".join(lines) @@ -411,7 +442,7 @@ def propogate(self, result: Any) -> Iterable[Union['EventResult', 'Event']]: events = current_node.propogate(self, targets, result) for event in events: - if isinstance(event.target, CallEntity): + if isinstance(event.target, CallEntity) or isinstance(event.target, IfNode): # recursively propogate CallEntity events yield from event.propogate(None) else: diff --git a/src/cascade/frontend/generator/dataflow_builder.py b/src/cascade/frontend/generator/dataflow_builder.py index 6875a09..a2cc510 100644 --- a/src/cascade/frontend/generator/dataflow_builder.py +++ b/src/cascade/frontend/generator/dataflow_builder.py @@ -48,6 +48,8 @@ def split_cfg(blocked_statement_graph: nx.DiGraph) -> nx.DiGraph: 
split_graph: nx.DiGraph = blocked_statement_graph.copy() for node in list(split_graph.nodes): in_nodes = split_graph.predecessors(node) + in_edges = list(split_graph.in_edges(node, data=True)) + out_edges = list(split_graph.out_edges(node, data=True)) out_nodes = split_graph.successors(node) # create the new nodes @@ -62,10 +64,10 @@ def split_cfg(blocked_statement_graph: nx.DiGraph) -> nx.DiGraph: u = v # connect the outer edges - for u in in_nodes: - split_graph.add_edge(u, new_nodes[0]) - for v in out_nodes: - split_graph.add_edge(new_nodes[-1], v) + for u, v, ddict in in_edges: + split_graph.add_edge(u, new_nodes[0], **ddict) + for u, v, ddict in out_edges: + split_graph.add_edge(new_nodes[-1], v, **ddict) return split_graph @@ -159,8 +161,8 @@ def blocked_cfg(statement_graph: nx.DiGraph, entry: Statement) -> nx.DiGraph: # connect them to this node first_then = list(then_blocked_graph.nodes)[0] first_orelse = list(orelse_blocked_graph.nodes)[0] - graph.add_edge(last_node, first_then) - graph.add_edge(last_node, first_orelse) + graph.add_edge(last_node, first_then, type=True) + graph.add_edge(last_node, first_orelse, type=False) # connect the rest of the graph at the end (recursively) if len(succ_then) == 1 or len(succ_orelse) == 1: @@ -211,7 +213,9 @@ def build_df(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> Data if len(statement_block) == 1 and statement_block[0].is_remote(): node = to_entity_call(statement_block[0], self.type_map, dataflows) elif len(statement_block) == 1 and statement_block[0].is_predicate: - node = IfNode() + rawblock = statement_block[0].block + assert isinstance(rawblock, nodes.Bool), type(rawblock) + node = IfNode(repr(rawblock.value)) else: block = LocalBlock(list(statement_block), self.name, block_num, op_name) block_num += 1 diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index c7f48e1..85126e2 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -116,15 +116,12 @@ def consume_event(event: Event): yield from self.statefuloperators[event.dataflow.operator_name].process(event) else: yield from self.statelessoperators[event.dataflow.operator_name].process(event) - elif isinstance(event.target, CallEntity): - new_events = event.propogate(None) - if isinstance(new_events, EventResult): - yield new_events - else: - yield from new_events elif isinstance(event.target, CollectNode): yield from self.collect.process(event) + + else: + raise ValueError(f"Event target type can only be CallLocal or CollectNode, not {event.target}") events = [] diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py b/tests/frontend/dataflow_analysis/test_split_functions.py index a41c489..893ec81 100644 --- a/tests/frontend/dataflow_analysis/test_split_functions.py +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -72,6 +72,10 @@ def test_branching(self) -> int: print(sf.cfg.to_dot()) new = blocked_cfg(sf.cfg.graph, sf.cfg.get_single_source()) + print_digraph(new) + + print_digraph(split_cfg(new)) + assert len(new.nodes) == 5 dataflows = { @@ -86,17 +90,17 @@ def test_branching(self) -> int: assert len(df.nodes) == 5 assert len(df.blocks) == 4 -def print_digraph(graph): +def print_digraph(graph: nx.DiGraph): for node in graph.nodes: for s in node: print(s.block_num, end=" ") print() - for edge in graph.edges: - for s in edge[0]: - print(s.block_num, end=" ") + for u, v, c in graph.edges.data('type', default=None): + print(u[0].block_num, end=" ") 
print("->", end= " ") - for s in edge[1]: - print(s.block_num, end=" ") + print(v[0].block_num, end=" ") + if c is not None: + print(f' [label="{c}"]', end=" ") print() def test_branching_with_entity_calls(): diff --git a/tests/integration/pyruntime/branching.py b/tests/integration/pyruntime/branching.py new file mode 100644 index 0000000..642c755 --- /dev/null +++ b/tests/integration/pyruntime/branching.py @@ -0,0 +1,24 @@ +import cascade + +@cascade.cascade +class Brancher: + @staticmethod + def branch(cond: bool) -> int: + x = 10 + if cond: + r = Remote.get() + return r + else: + return 42 + + +@cascade.cascade +class Remote: + @staticmethod + def get() -> int: + return 33 + + + + + \ No newline at end of file diff --git a/tests/integration/pyruntime/test_programs.py b/tests/integration/pyruntime/test_programs.py index 785bb80..49a5145 100644 --- a/tests/integration/pyruntime/test_programs.py +++ b/tests/integration/pyruntime/test_programs.py @@ -83,3 +83,19 @@ def test_operator_chaining(): event = a_call_c.generate_event({"b_0": "bbb", "c_0": "ccc"}, key="aaa") result = client.send(event) assert result == 84 + + +def test_branching_integration(): + file_name = "branching.py" + + runtime, client = init_python_runtime(file_name) + branch = cascade.core.dataflows[DataflowRef("Brancher", "branch")] + print(branch.to_dot()) + + event = branch.generate_event({"cond_0": True}) + result = client.send(event) + assert result == 33 + + event = branch.generate_event({"cond_0": False}) + result = client.send(event) + assert result == 42 \ No newline at end of file From 0ae5708a32d8b8497422e903aebe5b197a23df0b Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 8 Apr 2025 12:19:01 +0200 Subject: [PATCH 25/37] Add branching test to pyflink integration tests --- .../integration/{pyruntime => }/branching.py | 0 tests/integration/flink/test_branching.py | 44 +++++++++++++++++++ tests/integration/pyruntime/test_programs.py | 6 +-- tests/integration/pyruntime/utils.py | 3 +- 4 files changed, 48 insertions(+), 5 deletions(-) rename tests/integration/{pyruntime => }/branching.py (100%) create mode 100644 tests/integration/flink/test_branching.py diff --git a/tests/integration/pyruntime/branching.py b/tests/integration/branching.py similarity index 100% rename from tests/integration/pyruntime/branching.py rename to tests/integration/branching.py diff --git a/tests/integration/flink/test_branching.py b/tests/integration/flink/test_branching.py new file mode 100644 index 0000000..ccffb16 --- /dev/null +++ b/tests/integration/flink/test_branching.py @@ -0,0 +1,44 @@ +"""A test script for dataflows with merge operators""" + +from pyflink.datastream.data_stream import CloseableIterator +from cascade.dataflow.dataflow import DataflowRef +from cascade.dataflow.optimization.parallelization import parallelize + +import tests.integration.flink.utils as utils +from tests.integration.flink.utils import wait_for_event_id +import pytest + +import cascade +import logging + +@pytest.mark.integration +def test_branching_pyflink(): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + runtime, client = utils.init_flink_runtime("tests.integration.branching") + collector = runtime.run(run_async=True, output="collect") + assert isinstance(collector, CloseableIterator) + + try: + _test_branching(client, collector) + finally: + collector.close() + client.close() + + +def _test_branching(client, collector): + branch = 
cascade.core.dataflows[DataflowRef("Brancher", "branch")] + print(branch.to_dot()) + + event = branch.generate_event({"cond_0": True}) + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == 33 + + event = branch.generate_event({"cond_0": False}) + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == 42 \ No newline at end of file diff --git a/tests/integration/pyruntime/test_programs.py b/tests/integration/pyruntime/test_programs.py index 49a5145..c11fa43 100644 --- a/tests/integration/pyruntime/test_programs.py +++ b/tests/integration/pyruntime/test_programs.py @@ -9,7 +9,7 @@ def test_checkout_item(): - file_name = "checkout_item.py" + file_name = "tests.integration.pyruntime.checkout_item" runtime, client = init_python_runtime(file_name) item_op = cascade.core.operators["Item"] @@ -44,7 +44,7 @@ def test_checkout_item(): assert not result def test_operator_chaining(): - file_name = "operator_chaining.py" + file_name = "tests.integration.pyruntime.operator_chaining" runtime, client = init_python_runtime(file_name) a_op = cascade.core.operators["A"] @@ -86,7 +86,7 @@ def test_operator_chaining(): def test_branching_integration(): - file_name = "branching.py" + file_name = "tests.integration.branching" runtime, client = init_python_runtime(file_name) branch = cascade.core.dataflows[DataflowRef("Brancher", "branch")] diff --git a/tests/integration/pyruntime/utils.py b/tests/integration/pyruntime/utils.py index 686a5f9..a4261f8 100644 --- a/tests/integration/pyruntime/utils.py +++ b/tests/integration/pyruntime/utils.py @@ -7,9 +7,8 @@ from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime -def init_python_runtime(file_name: str) -> tuple[PythonRuntime, PythonClientSync]: +def init_python_runtime(import_module_name: str) -> tuple[PythonRuntime, PythonClientSync]: cascade.core.clear() - import_module_name: str = f'tests.integration.pyruntime.{file_name.strip(".py")}' exec(f'import {import_module_name}') cascade.core.init() From 9fa0d2154fba587e31a5f6c6f90d8ea083732129 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 8 Apr 2025 17:52:17 +0200 Subject: [PATCH 26/37] Run experiments in new IR --- deathstar_movie_review/demo.py | 70 ++---------- deathstar_movie_review/start_benchmark.py | 78 +++++++------ .../test_movie_review_demo.py | 27 +++-- run_experiments_gil_workaround.py | 5 +- src/cascade/core.py | 4 +- src/cascade/dataflow/dataflow.py | 49 ++++---- src/cascade/dataflow/operator.py | 12 +- .../dataflow/optimization/parallelization.py | 6 +- .../frontend/generator/dataflow_builder.py | 2 +- src/cascade/frontend/generator/local_block.py | 4 +- src/cascade/runtime/flink_runtime.py | 108 ++++++++++++------ src/cascade/runtime/python_runtime.py | 10 +- .../dataflow_analysis/test_entities.py | 10 +- .../dataflow_analysis/test_split_functions.py | 6 +- tests/integration/branching.py | 8 ++ tests/integration/flink/test_branching.py | 4 +- .../flink/test_collect_operator.py | 23 ++-- tests/integration/flink/test_operators.py | 7 +- tests/integration/flink/utils.py | 19 ++- tests/integration/pyruntime/test_programs.py | 11 ++ 20 files changed, 254 insertions(+), 209 deletions(-) diff --git a/deathstar_movie_review/demo.py b/deathstar_movie_review/demo.py index 02574f0..0cfbf28 100644 --- a/deathstar_movie_review/demo.py +++ b/deathstar_movie_review/demo.py @@ 
-1,14 +1,12 @@ from typing import Literal +import cascade +from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination +from cascade.dataflow.optimization.parallelization import parallelize from cascade.runtime.flink_runtime import FlinkRuntime - -from .entities.user import user_op -from .entities.compose_review import compose_review_op -from .entities.frontend import frontend_df_parallel, frontend_df_serial, frontend_op, text_op, unique_id_op -from .entities.movie import movie_id_op, movie_info_op, plot_op +from tests.integration.flink.utils import create_topics, init_flink_runtime import os -from confluent_kafka.admin import AdminClient, NewTopic KAFKA_BROKER = "localhost:9092" KAFKA_FLINK_BROKER = "kafka:9093" # If running a flink cluster and kafka inside docker, the broker url might be different @@ -19,66 +17,22 @@ EXPERIMENT: Literal["baseline", "pipelined", "parallel"] = os.getenv("EXPERIMENT", "baseline") -def create_topics(*required_topics): - conf = { - "bootstrap.servers": KAFKA_BROKER - } - - admin_client = AdminClient(conf) - - # Fetch existing topics - existing_topics = admin_client.list_topics(timeout=5).topics.keys() - - # Find missing topics - missing_topics = [topic for topic in required_topics if topic not in existing_topics] - - if missing_topics: - print(f"Creating missing topics: {missing_topics}") - - # Define new topics (default: 1 partition, replication factor 1) - new_topics = [NewTopic(topic, num_partitions=32, replication_factor=1) for topic in missing_topics] - - # Create topics - futures = admin_client.create_topics(new_topics) - - # Wait for topic creation to complete - for topic, future in futures.items(): - try: - future.result() # Block until the operation is complete - print(f"Topic '{topic}' created successfully") - except Exception as e: - print(f"Failed to create topic '{topic}': {e}") - else: - print("All required topics exist.") - def main(): create_topics(IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) - runtime = FlinkRuntime(IN_TOPIC, OUT_TOPIC, internal_topic=INTERNAL_TOPIC) - runtime.init(kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=True) + runtime = init_flink_runtime("deathstar_movie_review.entities.entities", IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC, kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=False) print(f"Creating dataflow [{EXPERIMENT}]") - if EXPERIMENT == "baseline": - frontend_op.dataflow = frontend_df_serial() - elif EXPERIMENT == "pipelined": - frontend_op.dataflow = frontend_df_serial() - dead_node_elimination([], [frontend_op]) - elif EXPERIMENT == "parallel": - frontend_op.dataflow = frontend_df_parallel() - else: - raise RuntimeError(f"EXPERIMENT is not set correctly: {EXPERIMENT}") - - runtime.add_operator(compose_review_op) - runtime.add_operator(user_op) - runtime.add_operator(movie_info_op) - runtime.add_operator(movie_id_op) - runtime.add_operator(plot_op) - runtime.add_stateless_operator(frontend_op) - runtime.add_stateless_operator(unique_id_op) - runtime.add_stateless_operator(text_op) + df_baseline = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + df_parallel = parallelize(df_baseline) + df_parallel.name = "compose_parallel" + cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel + runtime.add_dataflow(df_parallel) + print(cascade.core.dataflows.keys()) + runtime.run() if __name__ == "__main__": diff --git a/deathstar_movie_review/start_benchmark.py 
b/deathstar_movie_review/start_benchmark.py index cdff9b8..21028b4 100644 --- a/deathstar_movie_review/start_benchmark.py +++ b/deathstar_movie_review/start_benchmark.py @@ -6,6 +6,7 @@ import pandas as pd import random + from .movie_data import movie_data from .workload_data import movie_titles, charset import sys @@ -16,13 +17,11 @@ # import cascade sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) -from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination -from cascade.dataflow.dataflow import Event, EventResult, InitClass, OpNode +from tests.integration.flink.utils import init_cascade_from_module, init_flink_runtime +import cascade +from cascade.dataflow.optimization.parallelization import parallelize +from cascade.dataflow.dataflow import DataflowRef,EventResult from cascade.runtime.flink_runtime import FlinkClientSync - -from .entities.user import User -from .entities.frontend import frontend_df_parallel, frontend_df_serial, frontend_op -from .entities.movie import MovieInfo, Plot, MovieId IN_TOPIC = "ds-movie-in" OUT_TOPIC = "ds-movie-out" @@ -34,7 +33,7 @@ # bursts = 100 def populate_user(client: FlinkClientSync): - init_user = OpNode(User, InitClass(), read_key_from="username") + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] for i in range(1000): user_id = f'user{i}' username = f'username_{i}' @@ -54,32 +53,32 @@ def populate_user(client: FlinkClientSync): "Password": password_hash, "Salt": salt } - event = Event(init_user, {"username": username, "user_data": user_data}, None) + event = user_init.generate_event({"username": username, "user_data": user_data}, key=username) client.send(event) def populate_movie(client: FlinkClientSync): - init_movie_info = OpNode(MovieInfo, InitClass(), read_key_from="movie_id") - init_plot = OpNode(Plot, InitClass(), read_key_from="movie_id") - init_movie_id = OpNode(MovieId, InitClass(), read_key_from="title") - + movieinfo_init = cascade.core.dataflows[DataflowRef("MovieInfo", "__init__")] + plot_init = cascade.core.dataflows[DataflowRef("Plot", "__init__")] + movieid_init = cascade.core.dataflows[DataflowRef("MovieId", "__init__")] + for movie in movie_data: movie_id = movie["MovieId"] # movie info -> write `movie` - event = Event(init_movie_info, {"movie_id": movie_id, "info": movie}, None) + event = movieinfo_init.generate_event({"movie_id": movie_id, "info": movie}, key=movie_id) client.send(event) # plot -> write "plot" - event = Event(init_plot, {"movie_id": movie_id, "plot": "plot"}, None) + event = plot_init.generate_event({"movie_id": movie_id, "plot": "plot"}, key=movie_id) client.send(event) # movie_id_op -> register movie id - event = Event(init_movie_id, {"title": movie["Title"], "movie_id": movie_id}, None) + event = movieid_init.generate_event({"title": movie["Title"], "movie_id": movie_id}, key=movie["Title"]) client.send(event) -def compose_review(req_id, op): +def compose_review(req_id, parallel=False): user_index = random.randint(0, 999) username = f"username_{user_index}" password = f"password_{user_index}" @@ -87,26 +86,32 @@ def compose_review(req_id, op): rating = random.randint(0, 10) text = ''.join(random.choice(charset) for _ in range(256)) - return op.dataflow.generate_event({ - "review": req_id, - "user": username, - "title": title, - "rating": rating, - "text": text + if parallel: + compose = cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] + else: + compose = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + + 
return compose.generate_event({ + "req_id": req_id, # hacky way to create the compose review object when it doesn't exist + "review_0": req_id, + "user_0": username, + "title_0": title, + "rating_0": rating, + "text_0": text }) -def deathstar_workload_generator(op): +def deathstar_workload_generator(parallel=False): c = 1 while True: - yield compose_review(c, op) + yield compose_review(c, parallel) c += 1 def benchmark_runner(args) -> dict[int, dict]: - proc_num, op, requests_per_second, sleep_time, bursts = args + proc_num, requests_per_second, sleep_time, bursts, parallel = args print(f'Generator: {proc_num} starting') client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) - deathstar_generator = deathstar_workload_generator(op) + deathstar_generator = deathstar_workload_generator(parallel) start = timer() for b in range(bursts): @@ -209,19 +214,15 @@ def main(): print(f"Starting with args:\n{args}") print(f"Actual requests per second is {int(rps_per_thread * args.threads)} (due to rounding)") - - if EXPERIMENT == "baseline": - frontend_op.dataflow = frontend_df_serial() - elif EXPERIMENT == "pipelined": - frontend_op.dataflow = frontend_df_serial() - dead_node_elimination([], [frontend_op]) - elif EXPERIMENT == "parallel": - frontend_op.dataflow = frontend_df_parallel() - else: - raise RuntimeError(f"EXPERIMENT is not set correctly: {EXPERIMENT}") - + init_cascade_from_module("deathstar_movie_review.entities.entities") init_client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) + + df_baseline = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + df_parallel = parallelize(df_baseline) + df_parallel.name = "compose_parallel" + cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel + print(cascade.core.dataflows.keys()) if not args.no_init: print("Populating...") @@ -233,8 +234,9 @@ def main(): time.sleep(1) print("Starting benchmark") + parallel = args.experiment == "parallel" - func_args = [(t, frontend_op, rps_per_thread, sleep_time, args.seconds) for t in range(args.threads)] + func_args = [(t, rps_per_thread, sleep_time, args.seconds, parallel) for t in range(args.threads)] with Pool(args.threads) as p: results = p.map(benchmark_runner, func_args) diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index b38ab6f..81be421 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -3,8 +3,10 @@ import os + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) +from cascade.runtime.flink_runtime import FlinkClientSync from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.parallelization import parallelize from cascade.dataflow.operator import StatefulOperator, StatelessOperator @@ -43,7 +45,17 @@ def test_deathstar_movie_demo_flink(): utils.create_topics() - runtime, client = utils.init_flink_runtime("deathstar_movie_review.entities.entities") + runtime = utils.init_flink_runtime("deathstar_movie_review.entities.entities") + compose_df = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + df_parallel = parallelize(compose_df) + df_parallel.name = "compose_parallel" + cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel + runtime.add_dataflow(df_parallel) + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 4 + + + client = FlinkClientSync() runtime.run(run_async=True) try: @@ -52,11 +64,6 @@ def test_deathstar_movie_demo_flink(): 
client.close() def deathstar_movie_demo(client): - user_op = cascade.core.operators["User"] - compose_op = cascade.core.operators["ComposeReview"] - movie_op = cascade.core.operators["MovieId"] - frontend_op = cascade.core.operators["Frontend"] - compose_df = cascade.core.dataflows[DataflowRef("Frontend", "compose")] for df in cascade.core.dataflows.values(): @@ -127,12 +134,8 @@ def deathstar_movie_demo(client): - ## NOW DO IT PARALLEL! - df_parallel = parallelize(compose_df) - df_parallel.name = "compose_parallel" - cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel - print(df_parallel.to_dot()) - assert len(df_parallel.entry) == 4 + ### PARALLEL ### + df_parallel = cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] # make the review diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py index 5db7811..d503a53 100755 --- a/run_experiments_gil_workaround.py +++ b/run_experiments_gil_workaround.py @@ -33,7 +33,7 @@ def mps(num, producer_threads=1): # {"parallelism": 24, "benchmark_args": {**mps(200, producer_threads=10)}}, # {"parallelism": 24, "benchmark_args": {**mps(400, producer_threads=10)}}, # {"parallelism": 24, "benchmark_args": {**mps(600, producer_threads=20)}}, - {"parallelism": 24, "benchmark_args": {**mps(1000, producer_threads=20)}}, + {"parallelism": 1, "benchmark_args": {**mps(10, producer_threads=1)}}, # {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=40)}}, # {"parallelism": 24, "benchmark_args": {**mps(1000, threads=20)}}, ] @@ -44,8 +44,7 @@ def mps(num, producer_threads=1): print("Tearing down docker containers") subprocess.run(["docker", "compose", "down"], check=False) -for e in ["pipelined", "parallel", "baseline"]: -# for e in ["parallel"]: +for e in ["parallel"]: for exp in experiments: print(f"Starting experiment {exp}") diff --git a/src/cascade/core.py b/src/cascade/core.py index 4e09ad3..95bd84f 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -91,9 +91,9 @@ def init(): df = DataflowBuilder(method.method_node).build(dataflows, op_name) dataflows[df.ref()] = df - # op.dataflows[df.name] = df + op.dataflows[df.ref()] = df for name, b in df.blocks.items(): - op.methods[name] = b.compile() + op.methods[name] = b def get_operator(op_name: str): diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 2c4b57c..2e38cc5 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -7,7 +7,7 @@ import cascade if TYPE_CHECKING: - from cascade.frontend.generator.local_block import LocalBlock + from cascade.frontend.generator.local_block import CompiledLocalBlock from cascade.dataflow.operator import Operator @@ -41,14 +41,14 @@ def __post_init__(self): Node._id_counter += 1 @abstractmethod - def propogate(self, event: 'Event', targets: list['Node'], result: Any, **kwargs) -> list['Event']: + def propogate(self, event: 'Event', targets: list['Node'], result: Any, df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> list['Event']: pass @dataclass class IfNode(Node): predicate_var: str - def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: + def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: if_cond = event.variable_map[self.predicate_var] print(self.predicate_var) @@ -79,8 +79,11 @@ class DataflowRef: operator_name: str dataflow_name: str - def get_dataflow(self) -> 'DataFlow': - 
return cascade.core.dataflows[self] + # def get_dataflow(self) -> 'DataFlow': + # try: + # return cascade.core.dataflows[self] + # except KeyError as e: + # raise KeyError(f"DataflowRef {self} not found in cascade.core.dataflows") def __repr__(self) -> str: return f"{self.operator_name}.{self.dataflow_name}" @@ -92,7 +95,7 @@ def __hash__(self) -> int: @dataclass class CallEntity(Node): """A node in a `DataFlow` corresponding to the call of another dataflow""" - dataflow: DataflowRef + dataflow: 'DataflowRef' """The dataflow to call.""" variable_rename: dict[str, str] @@ -104,14 +107,14 @@ class CallEntity(Node): keyby: Optional[str] = None """The key, for calls to Stateful Entities""" - def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['Event']: + def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: dict['DataflowRef', 'DataFlow']) -> List['Event']: # remap the variable map of event into the new event new_var_map = {key: event.variable_map[value] for key, value in self.variable_rename.items()} if self.keyby: new_key = event.variable_map[self.keyby] else: new_key = None - df = cascade.core.get_dataflow(self.dataflow) + df = df_map[self.dataflow] new_targets = df.entry if not isinstance(new_targets, list): new_targets = [new_targets] @@ -143,7 +146,7 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['E class CallLocal(Node): method: Union[InvokeMethod, InitClass] - def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: + def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: # For simple calls, we only need to change the target. # Multiple targets results in multiple events events = [] @@ -166,8 +169,9 @@ class CollectNode(Node): It will aggregate incoming edges and output them as a list to the outgoing edge. 
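The common thread in these `propogate` signature changes: the dataflow registry is now an explicit `df_map` argument instead of a lookup on the global `cascade.core`. Caller-side that looks like the sketch below (the Python runtime passes `cascade.core.dataflows`, the Flink runtime passes its own `self.dataflows`); `event` and `result` stand in for a just-executed step and its return value.

    import cascade
    from cascade.dataflow.dataflow import EventResult

    df_map = cascade.core.dataflows              # any dict[DataflowRef, DataFlow] works
    for out in event.propogate(result, df_map):  # generator of follow-ups
        if isinstance(out, EventResult):
            print("dataflow finished:", out)     # final value, handed back to the client
        else:
            pass                                 # follow-up Event, routed to its operator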
Their actual implementation is runtime-dependent.""" + num_events: int - def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: + def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: return [Event( target, event.variable_map, @@ -217,15 +221,15 @@ def __init__(self, name: str, op_name: str, args: Optional[list[str]]=None): self.adjacency_list: dict[int, list[int]] = {} self.nodes: dict[int, Node] = {} self.entry: List[Node] = [] - self.op_name = op_name + self.operator_name = op_name if args: self.args: list[str] = args else: self.args = [] - self.blocks: dict[str, 'LocalBlock'] = {} + self.blocks: dict[str, 'CompiledLocalBlock'] = {} def ref(self) -> DataflowRef: - return DataflowRef(self.op_name, self.name) + return DataflowRef(self.operator_name, self.name) # def get_operator(self) -> Operator: # return cascade.core.operators[self.op_name] @@ -235,7 +239,7 @@ def add_node(self, node: Node): self.adjacency_list[node.id] = [] self.nodes[node.id] = node - def add_block(self, block: 'LocalBlock'): + def add_block(self, block: 'CompiledLocalBlock'): self.blocks[block.get_method_name()] = block def add_edge(self, edge: Edge): @@ -314,7 +318,7 @@ def get_predecessors(self, node: Node) -> List[Node]: def to_dot(self) -> str: """Output the DataFlow graph in DOT (Graphviz) format.""" - lines = [f"digraph {self.op_name}.{self.name} {{"] + lines = [f"digraph {self.operator_name}.{self.name} {{"] # Add nodes for node in self.nodes.values(): @@ -343,8 +347,8 @@ def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None # TODO: propogate at "compile time" instead of doing this every time local_events = [] for ev in events: - if isinstance(ev.target, CallEntity): - local_events.extend(ev.propogate(None)) + if isinstance(ev.target, CallEntity) or isinstance(ev.target, IfNode): + local_events.extend(ev.propogate(None, cascade.core.dataflows)) else: local_events.append(ev) @@ -352,7 +356,7 @@ def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None def __str__(self) -> str: - return f"{self.op_name}.{self.name}" + return f"{self.operator_name}.{self.name}" def metadata_dict() -> dict: return { @@ -404,9 +408,10 @@ def __post_init__(self): # Assign a unique ID self._id = uuid.uuid4().int - def propogate(self, result: Any) -> Iterable[Union['EventResult', 'Event']]: + def propogate(self, result: Any, df_map: dict['DataflowRef','DataFlow']) -> Iterable[Union['EventResult', 'Event']]: """Propogate this event through the Dataflow.""" - targets = self.dataflow.get_dataflow().get_neighbors(self.target) + targets = df_map[self.dataflow].get_neighbors(self.target) + events = [] @@ -439,12 +444,12 @@ def propogate(self, result: Any) -> Iterable[Union['EventResult', 'Event']]: return else: current_node = self.target - events = current_node.propogate(self, targets, result) + events = current_node.propogate(self, targets, result, df_map) for event in events: if isinstance(event.target, CallEntity) or isinstance(event.target, IfNode): # recursively propogate CallEntity events - yield from event.propogate(None) + yield from event.propogate(None, df_map) else: yield event @dataclass diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py index f6c8580..03b0576 100644 --- a/src/cascade/dataflow/operator.py +++ b/src/cascade/dataflow/operator.py @@ -1,14 +1,15 @@ from abc import ABC, abstractmethod from typing import Any, 
Generic, Mapping, Protocol, Type, TypeVar, TYPE_CHECKING + if TYPE_CHECKING: from cascade.frontend.generator.local_block import CompiledLocalBlock - from cascade.dataflow.dataflow import DataFlow, InvokeMethod + from cascade.dataflow.dataflow import DataFlow, InvokeMethod, DataflowRef T = TypeVar('T') class Operator(ABC): - dataflows: dict[str, 'DataFlow'] + dataflows: dict['DataflowRef', 'DataFlow'] methods: Mapping[str, 'CompiledLocalBlock'] @abstractmethod @@ -57,7 +58,7 @@ class StatefulOperator(Generic[T], Operator): methods, instead reading and modifying the underlying class `T` through a state variable, see `handle_invoke_method`. """ - def __init__(self, entity: Type[T], methods: dict[str, 'CompiledLocalBlock'], dataflows: dict[str, 'DataFlow']): + def __init__(self, entity: Type[T], methods: dict[str, 'CompiledLocalBlock'], dataflows: dict['DataflowRef', 'DataFlow']): """Create the StatefulOperator from a class and its compiled methods. Typically, a class could be comprised of split and non-split methods. Take the following example: @@ -106,7 +107,6 @@ def user_buy_item_1(variable_map: dict[str, Any], state: User): ``` """ - # methods maps function name to a function. Ideally this is done once in the object self.methods = methods self.entity = entity self.dataflows = dataflows @@ -137,11 +137,11 @@ def name(self): class StatelessOperator(Operator): """A StatelessOperator refers to a stateless function and therefore only has one dataflow.""" - def __init__(self, entity: Type, methods: dict[str, 'CompiledLocalBlock'], dataflows: dict[str, 'DataFlow']): + def __init__(self, entity: Type, methods: dict[str, 'CompiledLocalBlock'], dataflows: dict['DataflowRef', 'DataFlow']): self.entity = entity # TODO: extract this from dataflows.blocks self.methods = methods - # self.dataflows = dataflows + self.dataflows = dataflows pass def handle_invoke_method(self, method: 'InvokeMethod', variable_map: dict[str, Any]): diff --git a/src/cascade/dataflow/optimization/parallelization.py b/src/cascade/dataflow/optimization/parallelization.py index 444fc63..574810e 100644 --- a/src/cascade/dataflow/optimization/parallelization.py +++ b/src/cascade/dataflow/optimization/parallelization.py @@ -204,7 +204,7 @@ def parallelize(df: DataFlow): reads = set(node.variable_rename.values()) writes = {result} if (result := node.assign_result_to) else set() elif isinstance(node, CallLocal): - operator = cascade.core.operators[df.op_name] + operator = cascade.core.operators[df.operator_name] method = df.blocks[node.method.method_name] reads = method.reads writes = method.writes @@ -230,7 +230,7 @@ def parallelize(df: DataFlow): except KeyError: pass - updated = DataFlow(df.name, df.op_name) + updated = DataFlow(df.name, df.operator_name) updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0] prev_node = None @@ -253,7 +253,7 @@ def parallelize(df: DataFlow): # TODO: maybe collect node should just infer from it's predecessors? 
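Context for the `CollectNode` constructor change in the parallelization hunk just below: the number of branches to wait for is now stored on the node itself, so the collect operators no longer fetch the dataflow and count predecessors at runtime. A sketch of both sides, using the classes from this patch:

    from cascade.dataflow.dataflow import CollectNode

    fan_in = CollectNode(3)                    # built once, when the parallel dataflow is constructed
    # ... later, inside a collect operator, with `event.target` being this node:
    # total_events = event.target.num_events   # read directly, no dataflow lookup needed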
# like it can only have DataFlowNode predecessors # TODO: rename DataflowNode to EntityCall - collect_node = CollectNode() + collect_node = CollectNode(len(nodes_with_indegree_0)) for node_id in nodes_with_indegree_0: if prev_node: updated.add_edge(Edge(prev_node, n_map[node_id])) diff --git a/src/cascade/frontend/generator/dataflow_builder.py b/src/cascade/frontend/generator/dataflow_builder.py index a2cc510..a6ef184 100644 --- a/src/cascade/frontend/generator/dataflow_builder.py +++ b/src/cascade/frontend/generator/dataflow_builder.py @@ -220,7 +220,7 @@ def build_df(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> Data block = LocalBlock(list(statement_block), self.name, block_num, op_name) block_num += 1 node = block.to_node() - df.add_block(block) + df.add_block(block.compile()) node_id_map[statement_block] = node.id df.add_node(node) diff --git a/src/cascade/frontend/generator/local_block.py b/src/cascade/frontend/generator/local_block.py index d8617b1..e16e973 100644 --- a/src/cascade/frontend/generator/local_block.py +++ b/src/cascade/frontend/generator/local_block.py @@ -149,8 +149,8 @@ def call_block(self, *args, **kwargs) -> Any: # def to_node(self) -> CallLocal: # return CallLocal(InvokeMethod(self.get_method_name())) - # def get_method_name(self): - # return f"{self.method_base_name}_{self.block_num}" + def get_method_name(self): + return f"{self.method_base_name}_{self.block_num}" # def get_method_signature(self) -> str: # return f'variable_map, state' diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index e49093e..d80b44f 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -13,13 +13,13 @@ from pyflink.datastream import ProcessFunction, StreamExecutionEnvironment from pyflink.datastream.output_tag import OutputTag import pickle -from cascade.dataflow.dataflow import CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod, Node +from cascade.dataflow.dataflow import CallLocal, CollectNode, DataFlow, DataflowRef, Event, EventResult, InitClass, InvokeMethod, Node from cascade.dataflow.operator import StatefulOperator, StatelessOperator from confluent_kafka import Producer, Consumer import logging logger = logging.getLogger("cascade") -logger.setLevel("INFO") +logger.setLevel("DEBUG") console_handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') console_handler.setFormatter(formatter) @@ -52,28 +52,51 @@ def propogate(self, event: Event, targets: list[Node], result: Any, **kwargs) -> class FanOutOperator(ProcessFunction): """""" - def __init__(self, stateful_ops: dict[str, OutputTag], stateless_ops: dict[str, OutputTag]) -> None: + def __init__(self, stateful_ops: dict[str, OutputTag], stateless_ops: dict[str, OutputTag], collect_tag: OutputTag) -> None: self.stateful_ops = stateful_ops self.stateless_ops = stateless_ops + self.collect_tag = collect_tag - def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): + def process_element(self, event: Event, ctx: ProcessFunction.Context): event = profile_event(event, "FanOut") logger.debug(f"FanOut Event entered: {event._id}") if isinstance(event.target, CallLocal): - logger.debug(event) if event.dataflow.operator_name in self.stateful_ops: tag = self.stateful_ops[event.dataflow.operator_name] else: tag = self.stateless_ops[event.dataflow.operator_name] + elif isinstance(event.target, CollectNode): + tag = self.collect_tag + else: 
logger.error(f"FanOut: Wrong target: {event}") return logger.debug(f"Fanout Event routed to: {tag.tag_id}") yield tag, event + +class RouterOperator(ProcessFunction): + """""" + def __init__(self, dataflows: dict['DataflowRef', 'DataFlow']) -> None: + self.dataflows = dataflows + + def process_element(self, event_result: tuple[Event, Any], ctx: ProcessFunction.Context): + event, result = event_result + event = profile_event(event, "FanOut") + + logger.debug(f"FanOut Event entered: {event._id}") + + new_events = list(event.propogate(result, self.dataflows)) + + if len(new_events) == 1 and isinstance(new_events[0], EventResult): + logger.debug(f"RouterOperator: Returned {new_events[0]}") + else: + logger.debug(f"RouterOperator: Propogated {len(new_events)} new Events") + + yield from new_events class FlinkOperator(KeyedProcessFunction): @@ -119,9 +142,11 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): state = self.state.value() if state is None: logger.error(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: State does not exist for key {ctx.get_current_key()}") - raise KeyError - - state = pickle.loads(state) + # raise KeyError(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: State does not exist for key {ctx.get_current_key()}") + # try to create it anyway + state = self.operator.handle_init_class(*event.variable_map).__dict__ + else: + state = pickle.loads(state) result = self.operator.handle_invoke_method(event.target.method, variable_map=event.variable_map, state=state) @@ -139,14 +164,15 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): # if event.target.assign_result_to is not None: # event.variable_map[event.target.assign_result_to] = result - new_events = list(event.propogate(result)) + # new_events = list(event.propogate(result, self.operator.dataflows)) - if len(new_events) == 1 and isinstance(new_events[0], EventResult): - logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Returned {new_events[0]}") - else: - logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Propogated {len(new_events)} new Events") + # if len(new_events) == 1 and isinstance(new_events[0], EventResult): + # logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Returned {new_events[0]}") + # else: + # logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Propogated {len(new_events)} new Events") - yield from new_events + # yield from new_events + yield (event, result) class FlinkStatelessOperator(ProcessFunction): """Wraps an `cascade.dataflow.datflow.StatefulOperator` in a KeyedProcessFunction so that it can run in Flink. 
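The contract spread across these hunks, condensed: the per-operator process functions stop propagating events themselves and instead emit `(event, result)` pairs, which a downstream `RouterOperator` expands using the runtime's dataflow map. Roughly:

    from pyflink.datastream import ProcessFunction

    class RouterSketch(ProcessFunction):
        """Condensed restatement of RouterOperator, for reading the topology changes below."""
        def __init__(self, dataflows):
            self.dataflows = dataflows                 # dict[DataflowRef, DataFlow]

        def process_element(self, event_result, ctx):
            event, result = event_result               # emitted by FlinkOperator / FlinkStatelessOperator
            yield from event.propogate(result, self.dataflows)   # follow-up Events and/or a final EventResult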
@@ -168,14 +194,15 @@ def process_element(self, event: Event, ctx: ProcessFunction.Context): raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method}") - new_events = list(event.propogate(result)) + # new_events = list(event.propogate(result, self.operator.dataflows)) - if len(new_events) == 1 and isinstance(new_events[0], EventResult): - logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Returned {new_events[0]}") - else: - logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Propogated {len(new_events)} new Events") + # if len(new_events) == 1 and isinstance(new_events[0], EventResult): + # logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Returned {new_events[0]}") + # else: + # logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Propogated {len(new_events)} new Events") - yield from new_events + # yield from new_events + yield (event, result) class FlinkSelectAllOperator(KeyedProcessFunction): @@ -215,12 +242,6 @@ def process_element(self, event: Event, ctx: 'ProcessFunction.Context'): else: raise Exception(f"Unexpected target for SelectAllOperator: {event.target}") -class Result(ABC): - """A `Result` can be either `Arrived` or `NotArrived`. It is used in the - FlinkCollectOperator to determine whether all the events have completed - their computation.""" - pass - class FlinkCollectOperator(KeyedProcessFunction): """Flink implementation of a merge operator.""" def __init__(self): @@ -235,8 +256,10 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): var_map_num_items = self.var_map.value() logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Processing: {event}") + + assert isinstance(event.target, CollectNode) - total_events = len(event.dataflow.get_dataflow().get_predecessors(event.target)) + total_events = event.target.num_events # Add to the map if var_map_num_items == None: @@ -254,7 +277,8 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): if num_items == total_events: logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Yielding collection") event.variable_map = combined_var_map - yield from event.propogate(None) + # yield from event.propogate(None) + yield (event, None) self.var_map.clear() else: self.var_map.update((combined_var_map, num_items)) @@ -371,6 +395,8 @@ def __init__(self, input_topic="input-topic", output_topic="output-topic", ui_po self.stateful_operators: list[FlinkOperator] = [] """List of stateful operator streams, which gets appended at `add_operator`.""" + self.dataflows: dict['DataflowRef', 'DataFlow'] = {} + def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, parallelism=None, thread_mode=False): """Initialise & configure the Flink runtime. 
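The runtime now carries its own dataflow registry, filled from the three places shown in the following hunks (`add_operator`, `add_stateless_operator`, `add_dataflow`). Assembled in order, a job setup reads roughly like this sketch; the operator variables are placeholders for whatever `cascade.core.operators` provides.

    from cascade.runtime.flink_runtime import FlinkRuntime

    runtime = FlinkRuntime("ds-movie-in", "ds-movie-out", internal_topic="ds-movie-internal")
    runtime.add_operator(user_op)                 # merges user_op.dataflows into runtime.dataflows
    runtime.add_stateless_operator(frontend_op)   # same for stateless operators
    runtime.add_dataflow(df_parallel)             # hand-registered optimized variants, e.g. compose_parallel
    runtime.init(parallelism=4)
    runtime.run(run_async=True)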
@@ -429,10 +455,11 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para config.set_string("pipeline.jars",f"file://{flink_jar};file://{kafka_jar};file://{serializer_jar}") self.env = StreamExecutionEnvironment.get_execution_environment(config) - if parallelism: - self.env.set_parallelism(parallelism) - parallelism = self.env.get_parallelism() - logger.debug(f"FlinkRuntime: parellelism {parallelism}") + if not parallelism: + parallelism = min(self.env.get_parallelism(), 16) + self.env.set_parallelism(parallelism) + + logger.debug(f"FlinkRuntime: parallelism {parallelism}") deserialization_schema = ByteSerializer() @@ -537,12 +564,18 @@ def add_operator(self, op: StatefulOperator): flink_op = FlinkOperator(op) self.stateful_operators.append(flink_op) + self.dataflows.update(op.dataflows) def add_stateless_operator(self, op: StatelessOperator): """Add a `FlinkStatelessOperator` to the Flink datastream.""" flink_op = FlinkStatelessOperator(op) self.stateless_operators.append(flink_op) + self.dataflows.update(op.dataflows) + + def add_dataflow(self, dataflow: DataFlow): + """When adding extra dataflows, e.g. when testing or for optimized versions""" + self.dataflows[dataflow.ref()] = dataflow def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="kafka") -> Union[CloseableIterator, None]: @@ -557,9 +590,10 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka # create the fanout operator stateful_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateful_operators} stateless_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateless_operators} + collect_tag = OutputTag("__COLLECT__") logger.debug(f"Stateful tags: {stateful_tags.items()}") logger.debug(f"Stateless tags: {stateless_tags.items()}") - fanout = self.event_stream.process(FanOutOperator(stateful_tags, stateless_tags)).name("FANOUT OPERATOR").disable_chaining() + fanout = self.event_stream.process(FanOutOperator(stateful_tags, stateless_tags, collect_tag)).name("FANOUT OPERATOR").disable_chaining() # create the streams self.stateful_op_streams = [] @@ -597,8 +631,9 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka else: raise RuntimeError("No operators found, were they added to the flink runtime with .add_*_operator()") - merge_op_stream = ( - operator_streams.filter(lambda e: isinstance(e, Event) and isinstance(e.target, CollectNode)) + collect_stream = ( + fanout + .get_side_output(collect_tag) .key_by(lambda e: e._id) # might not work in the future if we have multiple merges in one dataflow? 
.process(FlinkCollectOperator()) .name("Collect") @@ -606,8 +641,9 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka """Stream that ingests events with an `cascade.dataflow.dataflow.CollectNode` target""" # union with EventResults or Events that don't have a CollectNode target - ds = merge_op_stream.union(operator_streams.filter(lambda e: not (isinstance(e, Event) and isinstance(e.target, CollectNode)))).map(lambda e: profile_event(e, "MERGE UNION")) + ds = collect_stream.union(operator_streams) + ds = ds.process(RouterOperator(self.dataflows)).name("ROUTER") # Output the stream results = ( diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index 85126e2..a19269c 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -1,5 +1,6 @@ import threading from typing import List, Union +import cascade from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod from queue import Empty, Queue @@ -31,7 +32,7 @@ def process(self, event: Event): ) self.states[key] = state - new_events = event.propogate(result) + new_events = event.propogate(result, cascade.core.dataflows) if isinstance(new_events, EventResult): yield new_events else: @@ -54,7 +55,7 @@ def process(self, event: Event): else: raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method}") - new_events = event.propogate(result) + new_events = event.propogate(result, cascade.core.dataflows) if isinstance(new_events, EventResult): yield new_events else: @@ -71,7 +72,8 @@ def process(self, event: Event): else: self.state[key].append(event) - n = len(event.dataflow.get_dataflow().get_predecessors(event.target)) + assert isinstance(event.target, CollectNode) + n = event.target.num_events print(f"PythonCollectOperator: collected {len(self.state[key])}/{n} for event {event._id}") if len(self.state[key]) == n: @@ -80,7 +82,7 @@ def process(self, event: Event): var_map.update(event.variable_map) event.variable_map = var_map - new_events = event.propogate(None) + new_events = event.propogate(None, cascade.core.dataflows) if isinstance(new_events, EventResult): yield new_events else: diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index 1bc3df6..09940e4 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -68,7 +68,7 @@ def add(x: int, y: int): df = sf.build(dataflows, "Test") assert len(df.blocks) == 1 - assert list(df.blocks.values())[0].compile().call_block({"x_0": 3, "y_0":5 }, None) == 8 + assert list(df.blocks.values())[0].call_block({"x_0": 3, "y_0":5 }, None) == 8 def test_state(): @@ -97,8 +97,8 @@ def buy_item(self, item: 'Item') -> bool: blocks = list(df.blocks.values()) assert len(blocks) == 1 - func = blocks[0].compile().call_block - print(blocks[0].to_string()) + func = blocks[0].call_block + print(blocks[0].function_string) @dataclass class User: @@ -143,9 +143,9 @@ class ComposeReview: req_id: str review_data: dict - func = blocks[0].compile().call_block + func = blocks[0].call_block - print(blocks[0].to_string()) + print(blocks[0].function_string) compose_review = ComposeReview("req", {}) func({"review_id_0": 123}, compose_review.__dict__) diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py 
b/tests/frontend/dataflow_analysis/test_split_functions.py index 893ec81..c5a68dc 100644 --- a/tests/frontend/dataflow_analysis/test_split_functions.py +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -43,7 +43,7 @@ def get_total(item1: Stock, item2: Stock, y: int): df = sf.build_df(dataflows, "Test") print(df.to_dot()) for block in df.blocks.values(): - print(block.to_string()) + print(block.function_string) # TODO: Check # entity calls, # of local calls assert len(df.nodes) == 5 @@ -86,7 +86,7 @@ def test_branching(self) -> int: df = sf.build_df(dataflows, "Test") print(df.to_dot()) for block in df.blocks.values(): - print(block.to_string()) + print(block.function_string) assert len(df.nodes) == 5 assert len(df.blocks) == 4 @@ -144,7 +144,7 @@ def test_branching(self) -> int: df = sf.build_df(dataflows, "Test") print(df.to_dot()) for block in df.blocks.values(): - print(block.to_string()) + print(block.function_string) assert len(df.nodes) == 7 assert len(df.blocks) == 5 diff --git a/tests/integration/branching.py b/tests/integration/branching.py index 642c755..e4fa373 100644 --- a/tests/integration/branching.py +++ b/tests/integration/branching.py @@ -11,6 +11,14 @@ def branch(cond: bool) -> int: else: return 42 + @staticmethod + def branch_insta(cond: bool) -> int: + if cond: + r = Remote.get() + return r + else: + return 42 + @cascade.cascade class Remote: diff --git a/tests/integration/flink/test_branching.py b/tests/integration/flink/test_branching.py index ccffb16..4d8760d 100644 --- a/tests/integration/flink/test_branching.py +++ b/tests/integration/flink/test_branching.py @@ -4,6 +4,7 @@ from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.parallelization import parallelize +from cascade.runtime.flink_runtime import FlinkClientSync import tests.integration.flink.utils as utils from tests.integration.flink.utils import wait_for_event_id import pytest @@ -18,7 +19,8 @@ def test_branching_pyflink(): utils.create_topics() - runtime, client = utils.init_flink_runtime("tests.integration.branching") + runtime = utils.init_flink_runtime("tests.integration.branching") + client = FlinkClientSync() collector = runtime.run(run_async=True, output="collect") assert isinstance(collector, CloseableIterator) diff --git a/tests/integration/flink/test_collect_operator.py b/tests/integration/flink/test_collect_operator.py index 113ab66..ba80e1e 100644 --- a/tests/integration/flink/test_collect_operator.py +++ b/tests/integration/flink/test_collect_operator.py @@ -4,6 +4,7 @@ from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.parallelization import parallelize +from cascade.runtime.flink_runtime import FlinkClientSync import tests.integration.flink.utils as utils from tests.integration.flink.utils import wait_for_event_id import pytest @@ -18,8 +19,21 @@ def test_collect_operator(): utils.create_topics() - runtime, client = utils.init_flink_runtime("tests.integration.common") + runtime = utils.init_flink_runtime("tests.integration.common") + + client = FlinkClientSync() + + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + + df_parallel = parallelize(user_buy_2) + df_parallel.name = "buy_2_parallel" + cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] = df_parallel + print(df_parallel.to_dot()) + runtime.add_dataflow(df_parallel) + assert len(df_parallel.entry) == 2 + collector = runtime.run(run_async=True, output="collect") + assert isinstance(collector, CloseableIterator) try: @@ 
-37,12 +51,7 @@ def _test_collect_operator(client, collector): item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] user_get_balance = cascade.core.dataflows[DataflowRef("User", "get_balance")] - - df_parallel = parallelize(user_buy_2) - df_parallel.name = "buy_2_parallel" - cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] = df_parallel - print(df_parallel.to_dot()) - assert len(df_parallel.entry) == 2 + df_parallel = cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] event = user_init.generate_event({"key": "foo", "balance": 100}, key="foo") diff --git a/tests/integration/flink/test_operators.py b/tests/integration/flink/test_operators.py index 8c92616..4886cd0 100644 --- a/tests/integration/flink/test_operators.py +++ b/tests/integration/flink/test_operators.py @@ -2,6 +2,7 @@ from pyflink.datastream.data_stream import CloseableIterator from cascade.dataflow.dataflow import DataflowRef, Event +from cascade.runtime.flink_runtime import FlinkClientSync import tests.integration.flink.utils as utils from tests.integration.flink.utils import wait_for_event_id @@ -17,7 +18,8 @@ def test_stateful_operator(): utils.create_topics() - runtime, client = utils.init_flink_runtime("tests.integration.common") + runtime = utils.init_flink_runtime("tests.integration.common") + client = FlinkClientSync() collector = runtime.run(run_async=True, output="collect") assert isinstance(collector, CloseableIterator) @@ -77,7 +79,8 @@ def test_stateless_operator(): utils.create_topics() - runtime, client = utils.init_flink_runtime("tests.integration.stateless") + runtime = utils.init_flink_runtime("tests.integration.stateless") + client = FlinkClientSync() collector = runtime.run(run_async=True, output="collect") assert isinstance(collector, CloseableIterator) diff --git a/tests/integration/flink/utils.py b/tests/integration/flink/utils.py index b0e805b..e0406d0 100644 --- a/tests/integration/flink/utils.py +++ b/tests/integration/flink/utils.py @@ -19,11 +19,22 @@ def wait_for_event_id(id: int, collector: CloseableIterator) -> EventResult: return record -def init_flink_runtime(import_path: str) -> tuple[FlinkRuntime, FlinkClientSync]: +def init_cascade_from_module(import_path: str): cascade.core.clear() exec(f'import {import_path}') cascade.core.init() - runtime = FlinkRuntime(IN_TOPIC, OUT_TOPIC, internal_topic=INTERNAL_TOPIC) + +def init_flink_runtime(import_path: str, in_topic=None, out_topic=None, internal_topic=None, parallelism=1, **init_args) -> FlinkRuntime: + init_cascade_from_module(import_path) + + if in_topic is None: + in_topic = IN_TOPIC + if out_topic is None: + out_topic = OUT_TOPIC + if internal_topic is None: + internal_topic = INTERNAL_TOPIC + + runtime = FlinkRuntime(in_topic, out_topic, internal_topic=internal_topic) for op in cascade.core.operators.values(): if isinstance(op, StatefulOperator): @@ -31,8 +42,8 @@ def init_flink_runtime(import_path: str) -> tuple[FlinkRuntime, FlinkClientSync] elif isinstance(op, StatelessOperator): runtime.add_stateless_operator(op) - runtime.init(parallelism=4) - return runtime, FlinkClientSync() + runtime.init(parallelism=parallelism, **init_args) + return runtime def create_topics(*required_topics): if len(required_topics) == 0: diff --git a/tests/integration/pyruntime/test_programs.py b/tests/integration/pyruntime/test_programs.py index c11fa43..5d5691b 100644 --- a/tests/integration/pyruntime/test_programs.py +++ 
b/tests/integration/pyruntime/test_programs.py @@ -96,6 +96,17 @@ def test_branching_integration(): result = client.send(event) assert result == 33 + event = branch.generate_event({"cond_0": False}) + result = client.send(event) + assert result == 42 + + branch = cascade.core.dataflows[DataflowRef("Brancher", "branch_insta")] + print(branch.to_dot()) + + event = branch.generate_event({"cond_0": True}) + result = client.send(event) + assert result == 33 + event = branch.generate_event({"cond_0": False}) result = client.send(event) assert result == 42 \ No newline at end of file From fae5b28f6efe7bdd2b04f673bf43c6d662fc7ef7 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 8 Apr 2025 20:01:52 +0200 Subject: [PATCH 27/37] Experimentally try thread mode --- deathstar_movie_review/demo.py | 2 +- run_experiments_gil_workaround.py | 4 ++-- src/cascade/dataflow/dataflow.py | 2 -- src/cascade/runtime/flink_runtime.py | 17 ++++++++++------- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/deathstar_movie_review/demo.py b/deathstar_movie_review/demo.py index 0cfbf28..076bb27 100644 --- a/deathstar_movie_review/demo.py +++ b/deathstar_movie_review/demo.py @@ -21,7 +21,7 @@ def main(): create_topics(IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) - runtime = init_flink_runtime("deathstar_movie_review.entities.entities", IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC, kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=False) + runtime = init_flink_runtime("deathstar_movie_review.entities.entities", IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC, kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=True) print(f"Creating dataflow [{EXPERIMENT}]") diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py index d503a53..7c4ae88 100755 --- a/run_experiments_gil_workaround.py +++ b/run_experiments_gil_workaround.py @@ -33,7 +33,7 @@ def mps(num, producer_threads=1): # {"parallelism": 24, "benchmark_args": {**mps(200, producer_threads=10)}}, # {"parallelism": 24, "benchmark_args": {**mps(400, producer_threads=10)}}, # {"parallelism": 24, "benchmark_args": {**mps(600, producer_threads=20)}}, - {"parallelism": 1, "benchmark_args": {**mps(10, producer_threads=1)}}, + {"parallelism": 24, "benchmark_args": {**mps(200, producer_threads=10)}}, # {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=40)}}, # {"parallelism": 24, "benchmark_args": {**mps(1000, threads=20)}}, ] @@ -44,7 +44,7 @@ def mps(num, producer_threads=1): print("Tearing down docker containers") subprocess.run(["docker", "compose", "down"], check=False) -for e in ["parallel"]: +for e in ["baseline", "parallel"]: for exp in experiments: print(f"Starting experiment {exp}") diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 2e38cc5..df4c7ec 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -116,8 +116,6 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: di new_key = None df = df_map[self.dataflow] new_targets = df.entry - if not isinstance(new_targets, list): - new_targets = [new_targets] # Tail call elimination: # "targets" corresponds to where to go after this CallEntity finishes diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index d80b44f..f4e47c9 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -19,7 +19,7 @@ import logging logger = 
logging.getLogger("cascade") -logger.setLevel("DEBUG") +logger.setLevel("INFO") console_handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') console_handler.setFormatter(formatter) @@ -85,9 +85,9 @@ def __init__(self, dataflows: dict['DataflowRef', 'DataFlow']) -> None: def process_element(self, event_result: tuple[Event, Any], ctx: ProcessFunction.Context): event, result = event_result - event = profile_event(event, "FanOut") + event = profile_event(event, "Router") - logger.debug(f"FanOut Event entered: {event._id}") + logger.debug(f"RouterOperator Event entered: {event._id}") new_events = list(event.propogate(result, self.dataflows)) @@ -123,7 +123,7 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): assert(key is not None) if isinstance(event.target.method, InitClass): - result = self.operator.handle_init_class(**event.variable_map) + result = self.operator.handle_init_class(**event.variable_map).__dict__ # Register the created key in FlinkSelectAllOperator if SELECT_ALL_ENABLED: @@ -136,7 +136,7 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Registering key: {register_key_event}") yield register_key_event - self.state.update(pickle.dumps(result.__dict__)) + self.state.update(pickle.dumps(result)) elif isinstance(event.target.method, InvokeMethod): state = self.state.value() @@ -593,7 +593,7 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka collect_tag = OutputTag("__COLLECT__") logger.debug(f"Stateful tags: {stateful_tags.items()}") logger.debug(f"Stateless tags: {stateless_tags.items()}") - fanout = self.event_stream.process(FanOutOperator(stateful_tags, stateless_tags, collect_tag)).name("FANOUT OPERATOR").disable_chaining() + fanout = self.event_stream.process(FanOutOperator(stateful_tags, stateless_tags, collect_tag)).name("FANOUT OPERATOR")#.disable_chaining() # create the streams self.stateful_op_streams = [] @@ -605,6 +605,7 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka .key_by(lambda e: e.key) .process(flink_op) .name("STATEFUL OP: " + flink_op.operator.name()) + .process(RouterOperator(self.dataflows)).name("ROUTER") ) self.stateful_op_streams.append(op_stream) @@ -616,6 +617,7 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka .get_side_output(tag) .process(flink_op) .name("STATELESS OP: " + flink_op.operator.name()) + .process(RouterOperator(self.dataflows)).name("ROUTER") ) self.stateless_op_streams.append(op_stream) @@ -637,13 +639,14 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka .key_by(lambda e: e._id) # might not work in the future if we have multiple merges in one dataflow? 
.process(FlinkCollectOperator()) .name("Collect") + .process(RouterOperator(self.dataflows)).name("ROUTER") ) """Stream that ingests events with an `cascade.dataflow.dataflow.CollectNode` target""" # union with EventResults or Events that don't have a CollectNode target ds = collect_stream.union(operator_streams) - ds = ds.process(RouterOperator(self.dataflows)).name("ROUTER") + # ds = ds.process(RouterOperator(self.dataflows)).name("ROUTER") # Output the stream results = ( From 7d7dc12458cfbd7d4a4277fb128bdd01bf215b32 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Wed, 9 Apr 2025 12:18:54 +0200 Subject: [PATCH 28/37] Fix parallelism issues on experiments --- run_experiments_gil_workaround.py | 4 ++-- src/cascade/runtime/flink_runtime.py | 36 +++++++++++++++++++++------- tests/integration/flink/utils.py | 2 +- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py index 7c4ae88..f6460dd 100755 --- a/run_experiments_gil_workaround.py +++ b/run_experiments_gil_workaround.py @@ -14,7 +14,7 @@ def mps(num, producer_threads=1): return { "threads": producer_threads, "requests_per_second": num, - "seconds": 100, + "seconds": 50, } @@ -33,7 +33,7 @@ def mps(num, producer_threads=1): # {"parallelism": 24, "benchmark_args": {**mps(200, producer_threads=10)}}, # {"parallelism": 24, "benchmark_args": {**mps(400, producer_threads=10)}}, # {"parallelism": 24, "benchmark_args": {**mps(600, producer_threads=20)}}, - {"parallelism": 24, "benchmark_args": {**mps(200, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=30)}}, # {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=40)}}, # {"parallelism": 24, "benchmark_args": {**mps(1000, threads=20)}}, ] diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index f4e47c9..e3e2e4b 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -97,6 +97,21 @@ def process_element(self, event_result: tuple[Event, Any], ctx: ProcessFunction. 
logger.debug(f"RouterOperator: Propogated {len(new_events)} new Events") yield from new_events + +def router_flat_map(event_result: tuple[Event, Any], dataflows: dict['DataflowRef', 'DataFlow']): + event, result = event_result + event = profile_event(event, "Router") + + # logger.debug(f"RouterOperator Event entered: {event._id}") + + new_events = list(event.propogate(result, dataflows)) + + # if len(new_events) == 1 and isinstance(new_events[0], EventResult): + # logger.debug(f"RouterOperator: Returned {new_events[0]}") + # else: + # logger.debug(f"RouterOperator: Propogated {len(new_events)} new Events") + + return new_events class FlinkOperator(KeyedProcessFunction): @@ -455,11 +470,11 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para config.set_string("pipeline.jars",f"file://{flink_jar};file://{kafka_jar};file://{serializer_jar}") self.env = StreamExecutionEnvironment.get_execution_environment(config) - if not parallelism: - parallelism = min(self.env.get_parallelism(), 16) - self.env.set_parallelism(parallelism) + if parallelism: + self.env.set_parallelism(parallelism) + parallelism = self.env.get_parallelism() - logger.debug(f"FlinkRuntime: parallelism {parallelism}") + logger.info(f"FlinkRuntime: parallelism {parallelism}") deserialization_schema = ByteSerializer() @@ -605,7 +620,7 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka .key_by(lambda e: e.key) .process(flink_op) .name("STATEFUL OP: " + flink_op.operator.name()) - .process(RouterOperator(self.dataflows)).name("ROUTER") + # .process(RouterOperator(self.dataflows)).name("ROUTER") ) self.stateful_op_streams.append(op_stream) @@ -617,7 +632,7 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka .get_side_output(tag) .process(flink_op) .name("STATELESS OP: " + flink_op.operator.name()) - .process(RouterOperator(self.dataflows)).name("ROUTER") + # .process(RouterOperator(self.dataflows)).name("ROUTER") ) self.stateless_op_streams.append(op_stream) @@ -639,14 +654,17 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka .key_by(lambda e: e._id) # might not work in the future if we have multiple merges in one dataflow? 
.process(FlinkCollectOperator()) .name("Collect") - .process(RouterOperator(self.dataflows)).name("ROUTER") + # .process(RouterOperator(self.dataflows)).name("ROUTER") ) """Stream that ingests events with an `cascade.dataflow.dataflow.CollectNode` target""" + # descriptor = ValueStateDescriptor("dataflows", Types.PICKLED_BYTE_ARRAY()) + + # broadcast_dataflows = self.env.broadcast_variable("dataflows", list(self.dataflows.items())) # union with EventResults or Events that don't have a CollectNode target - ds = collect_stream.union(operator_streams) + ds = collect_stream.union(operator_streams)#.flat_map(lambda x: router_flat_map(x, {u: v for u, v in self.dataflows.items()})) - # ds = ds.process(RouterOperator(self.dataflows)).name("ROUTER") + ds = ds.process(RouterOperator(self.dataflows)).name("ROUTER") # Output the stream results = ( diff --git a/tests/integration/flink/utils.py b/tests/integration/flink/utils.py index e0406d0..447212d 100644 --- a/tests/integration/flink/utils.py +++ b/tests/integration/flink/utils.py @@ -24,7 +24,7 @@ def init_cascade_from_module(import_path: str): exec(f'import {import_path}') cascade.core.init() -def init_flink_runtime(import_path: str, in_topic=None, out_topic=None, internal_topic=None, parallelism=1, **init_args) -> FlinkRuntime: +def init_flink_runtime(import_path: str, in_topic=None, out_topic=None, internal_topic=None, parallelism=None, **init_args) -> FlinkRuntime: init_cascade_from_module(import_path) if in_topic is None: From 9c6b6f7fa432da2fecb8039e04bcb95cc3a85125 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Wed, 9 Apr 2025 14:59:25 +0200 Subject: [PATCH 29/37] Add some support for external libraries --- deathstar_movie_review/demo.py | 2 +- deathstar_movie_review/entities/entities.py | 27 +++---- .../test_movie_review_demo.py | 9 ++- experiments/dynamic_prefetching/entities.py | 30 ++++++++ .../dynamic_prefetching/run_prefetcher.py | 71 +++++++++++++++++++ run_experiments_gil_workaround.py | 2 +- src/cascade/core.py | 51 +++++++------ src/cascade/descriptors/class_descriptor.py | 7 +- src/cascade/frontend/cfg/cfg_builder.py | 11 +-- .../frontend/generator/dataflow_builder.py | 9 ++- src/cascade/frontend/generator/local_block.py | 7 +- src/cascade/frontend/generator/unparser.py | 18 ++++- .../test_dataflow_graph_builder.py | 6 +- .../dataflow_analysis/test_entities.py | 33 +++++++++ tests/integration/flink/test_operators.py | 4 +- tests/integration/flink/utils.py | 2 +- 16 files changed, 230 insertions(+), 59 deletions(-) create mode 100644 experiments/dynamic_prefetching/entities.py create mode 100644 experiments/dynamic_prefetching/run_prefetcher.py diff --git a/deathstar_movie_review/demo.py b/deathstar_movie_review/demo.py index 076bb27..093308e 100644 --- a/deathstar_movie_review/demo.py +++ b/deathstar_movie_review/demo.py @@ -21,7 +21,7 @@ def main(): create_topics(IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) - runtime = init_flink_runtime("deathstar_movie_review.entities.entities", IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC, kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=True) + runtime = init_flink_runtime("deathstar_movie_review.entities.entities", IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC, kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=True, parallelism=None) print(f"Creating dataflow [{EXPERIMENT}]") diff --git a/deathstar_movie_review/entities/entities.py b/deathstar_movie_review/entities/entities.py index a09c8d7..3079dc6 100644 --- 
a/deathstar_movie_review/entities/entities.py +++ b/deathstar_movie_review/entities/entities.py @@ -45,12 +45,12 @@ def __init__(self, title: str, movie_id: str): self.movie_id = movie_id def upload_movie(self, review: ComposeReview, rating: int): - # if self.movie_id is not None: - # review.upload_movie_id(self.movie_id) - # else: - # review.upload_rating(rating) - movie_id = self.movie_id - review.upload_movie_id(movie_id) + cond = self.movie_id is not None + if cond: + movie_id = self.movie_id + review.upload_movie_id(movie_id) + else: + review.upload_rating(rating) @cascade class Frontend(): @@ -66,19 +66,20 @@ def compose(review: ComposeReview, user: User, title: MovieId, rating: int, text # uuid = UniqueId.generate() # review.upload_unique_id(uuid) -@cascade + +class Uuid: + @staticmethod + def gen_uuid(): + return uuid.uuid1().int >> 64 + +@cascade(globals={'Uuid': Uuid}) class UniqueId(): @staticmethod def upload_unique_id_2(review: ComposeReview): # TODO: support external libraries - # review_id = uuid.uuid1().int >> 64 - review_id = 424242 + review_id = Uuid.gen_uuid() review.upload_unique_id(review_id) - @staticmethod - def generate() -> int: - return 424242 - @cascade class Text(): @staticmethod diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index 81be421..1fceeb3 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -34,6 +34,13 @@ def test_deathstar_movie_demo_python(): exec(f'import deathstar_movie_review.entities.entities') cascade.core.init() + compose_df = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + df_parallel = parallelize(compose_df) + df_parallel.name = "compose_parallel" + cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 4 + runtime, client = init_python_runtime() deathstar_movie_demo(client) @@ -84,7 +91,7 @@ def deathstar_movie_demo(client): event = cascade.core.dataflows[DataflowRef("User", "__init__")].generate_event({"username": username, "user_data": user_data}, username) result = client.send(event, block=True) print(result) - assert result.username == username + assert result['username'] == username print("testing compose review") req_id = "4242" diff --git a/experiments/dynamic_prefetching/entities.py b/experiments/dynamic_prefetching/entities.py new file mode 100644 index 0000000..77b6919 --- /dev/null +++ b/experiments/dynamic_prefetching/entities.py @@ -0,0 +1,30 @@ +from cascade import cascade +import random + +@cascade +class Oracle(): + @staticmethod + def get() -> int: + return 42 + +@cascade(globals={'random': random}) +class Prefetcher: + @staticmethod + def prefetch(branch_chance: float): + prefetched_value = Oracle.get() + rand = random.random() + cond = rand < branch_chance + if cond: + return prefetched_value + else: + return -42 + + @staticmethod + def baseline(branch_chance: float): + cond = random.random() < branch_chance + if cond: + value = Oracle.get() + return value + else: + return -42 + diff --git a/experiments/dynamic_prefetching/run_prefetcher.py b/experiments/dynamic_prefetching/run_prefetcher.py new file mode 100644 index 0000000..30fd9e6 --- /dev/null +++ b/experiments/dynamic_prefetching/run_prefetcher.py @@ -0,0 +1,71 @@ +import logging +import sys +import os + + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../src"))) 
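The decorator form these new experiment entities rely on is the headline feature of this commit: `@cascade(globals={...})` makes selected external names visible to the generated blocks, which are exec'd with that globals dict per the `local_block.py` hunk further down. A hypothetical extra entity in the same style as `Prefetcher` (sketch, not part of the patch):

    import random
    from cascade import cascade

    @cascade(globals={'random': random})
    class Sampler:
        @staticmethod
        def draw(threshold: float) -> int:
            r = random.random()      # allowed because `random` is passed in via globals
            cond = r < threshold
            if cond:
                return 1
            else:
                return 0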
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + +import cascade +from cascade.runtime.flink_runtime import FlinkClientSync +from cascade.dataflow.dataflow import DataflowRef +from tests.integration.flink.utils import create_topics, init_flink_runtime, wait_for_event_id +from pyflink.datastream.data_stream import CloseableIterator + + +KAFKA_BROKER = "localhost:9092" +KAFKA_FLINK_BROKER = "kafka:9093" # If running a flink cluster and kafka inside docker, the broker url might be different + +IN_TOPIC = "ds-movie-in" +OUT_TOPIC = "ds-movie-out" +INTERNAL_TOPIC = "ds-movie-internal" + + + +def main(): + create_topics() + + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + runtime = init_flink_runtime("experiments.dynamic_prefetching.entities", parallelism=4) + + print(cascade.core.dataflows.keys()) + client = FlinkClientSync() + + runtime.run(run_async=True) + # assert isinstance(collector, CloseableIterator) + + + try: + run_test(client) + finally: + client.close() + +import time +def run_test(client): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline")] + prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch")] + + for block in baseline.blocks.values(): + print(block.function_string) + + for block in prefetch.blocks.values(): + print(block.function_string) + + event = baseline.generate_event({"branch_chance_0": 0.0}) + print(event) + result = client.send(event, block=True) + print(result) + + # for _ in range(10): + # event = baseline.generate_event({"branch_chance_0": 0.5}) + # client.send(event) + # result = wait_for_event_id(event[0]._id, collector) + # print(result.result) + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py index f6460dd..d3060ee 100755 --- a/run_experiments_gil_workaround.py +++ b/run_experiments_gil_workaround.py @@ -33,7 +33,7 @@ def mps(num, producer_threads=1): # {"parallelism": 24, "benchmark_args": {**mps(200, producer_threads=10)}}, # {"parallelism": 24, "benchmark_args": {**mps(400, producer_threads=10)}}, # {"parallelism": 24, "benchmark_args": {**mps(600, producer_threads=20)}}, - {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=30)}}, + {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=20)}}, # {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=40)}}, # {"parallelism": 24, "benchmark_args": {**mps(1000, threads=20)}}, ] diff --git a/src/cascade/core.py b/src/cascade/core.py index 95bd84f..8578ac1 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -26,29 +26,36 @@ def setup_cfg(code: str) -> Cfg: operators: dict[str, Operator] = {} dataflows: dict[DataflowRef, DataFlow] = {} -def cascade(cls, parse_file=True): - if not isclass(cls): - raise AttributeError(f"Expected a class but got an {cls}.") - - # Parse source. - if parse_file: - class_file_name = getfile(cls) - if class_file_name not in parse_cache: - with open(class_file_name, "r") as file: - to_parse_file = file.read() - # parsed_cls = AstBuilder().string_build(to_parse_file) - parsed_cls, tree = setup_cfg(to_parse_file) - parse_cache[class_file_name] = (parsed_cls, tree) +def cascade(cls=None, *, parse_file=True, globals=None): + + def decorator(cls): + if not isclass(cls): + raise AttributeError(f"Expected a class but got an {cls}.") + + # Parse source. 
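Both decorator spellings have to keep working after this rework, which is what the `cls is None` branch at the end of the function handles: the bare form passes the class straight in, while the parameterized form first returns the inner `decorator`. The two call sites look like the sketch below; the class bodies are placeholders.

    import random
    from cascade import cascade

    @cascade                                  # bare form: cascade(PlainEntity) runs immediately
    class PlainEntity:
        pass

    @cascade(globals={'random': random})      # parameterized form: cascade(...) returns `decorator`,
    class RandomEntity:                       # which Python then applies to RandomEntity
        pass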
+ if parse_file: + class_file_name = getfile(cls) + if class_file_name not in parse_cache: + with open(class_file_name, "r") as file: + to_parse_file = file.read() + # parsed_cls = AstBuilder().string_build(to_parse_file) + parsed_cls, tree = setup_cfg(to_parse_file) + parse_cache[class_file_name] = (parsed_cls, tree) + else: + parsed_cls, tree = parse_cache[class_file_name] else: - parsed_cls, tree = parse_cache[class_file_name] - else: - class_source = getsource(cls) - parsed_cls, tree = setup_cfg(class_source) + class_source = getsource(cls) + parsed_cls, tree = setup_cfg(class_source) + + # Create class descripter for class + class_desc: ClassDescriptor = ClassDescriptor.from_module(cls.__name__, tree, globals) + class_wrapper: ClassWrapper = ClassWrapper(cls, class_desc) + registered_classes.append(class_wrapper) - # Create class descripter for class - class_desc: ClassDescriptor = ClassDescriptor.from_module(cls.__name__, tree) - class_wrapper: ClassWrapper = ClassWrapper(cls, class_desc) - registered_classes.append(class_wrapper) + # Support both @cascade and @cascade(globals={...}) + if cls is None: + return decorator + return decorator(cls) def init(): @@ -88,7 +95,7 @@ def init(): df.entry = [n0] blocks = [] else: - df = DataflowBuilder(method.method_node).build(dataflows, op_name) + df = DataflowBuilder(method.method_node, cls.class_desc.globals).build(dataflows, op_name) dataflows[df.ref()] = df op.dataflows[df.ref()] = df diff --git a/src/cascade/descriptors/class_descriptor.py b/src/cascade/descriptors/class_descriptor.py index 4310f15..7924b02 100644 --- a/src/cascade/descriptors/class_descriptor.py +++ b/src/cascade/descriptors/class_descriptor.py @@ -1,4 +1,5 @@ +from typing import Any, Optional from klara.core import nodes from cascade.frontend.ast_visitors import ExtractClassDefNode, ExtractMethodVisitor @@ -13,11 +14,13 @@ def __init__( module_node: nodes.Module, class_node: nodes.ClassDef, methods_dec: list[MethodDescriptor], + globals: Optional[dict[str, Any]] ): self.class_name: str = class_name self.module_node: nodes.Module = module_node self.class_node: nodes.ClassDef = class_node self.methods_dec: list[MethodDescriptor] = methods_dec + self.globals = globals self.is_stateless = True for method in methods_dec: @@ -29,8 +32,8 @@ def get_method_by_name(self, name: str): return next(m for m in self.methods_dec if m.method_name == name) @classmethod - def from_module(cls, class_name: str, module_node: nodes.Module): + def from_module(cls, class_name: str, module_node: nodes.Module, globals): class_node: nodes.ClassDef = ExtractClassDefNode.extract(module_node, class_name) method_dec: list[MethodDescriptor] = ExtractMethodVisitor.extract(class_node) - c = cls(class_name, module_node, class_node, method_dec) + c = cls(class_name, module_node, class_node, method_dec, globals) return c diff --git a/src/cascade/frontend/cfg/cfg_builder.py b/src/cascade/frontend/cfg/cfg_builder.py index 28c3c24..6e35128 100644 --- a/src/cascade/frontend/cfg/cfg_builder.py +++ b/src/cascade/frontend/cfg/cfg_builder.py @@ -7,8 +7,9 @@ class ControlFlowGraphBuilder: - def __init__(self, block_list: list): + def __init__(self, block_list: list, globals: list[str]): self.block_list: list = block_list + self.globals = globals def make_cfg(self, blocks: list, i = 0) -> tuple[ControlFlowGraph, int]: graph = ControlFlowGraph() @@ -51,7 +52,9 @@ def make_cfg(self, blocks: list, i = 0) -> tuple[ControlFlowGraph, int]: statement.values = [v.__repr__() for v in values] contains_attribute, attribute = 
ContainsAttributeVisitor.check_return_attribute(b) if contains_attribute: - if attribute.value.id != 'self': + if attribute.value.id in self.globals: + statement.values.remove(attribute.value.id) + elif attribute.value.id != 'self': statement.set_remote() statement.set_attribute(attribute) @@ -63,6 +66,6 @@ def construct_dataflow_graph(self) -> ControlFlowGraph: return graph @classmethod - def build(cls, block_list: list) -> ControlFlowGraph: - dataflow_graph_builder = cls(block_list) + def build(cls, block_list: list, globals: list[str]) -> ControlFlowGraph: + dataflow_graph_builder = cls(block_list, globals) return dataflow_graph_builder.construct_dataflow_graph() diff --git a/src/cascade/frontend/generator/dataflow_builder.py b/src/cascade/frontend/generator/dataflow_builder.py index a6ef184..36296ee 100644 --- a/src/cascade/frontend/generator/dataflow_builder.py +++ b/src/cascade/frontend/generator/dataflow_builder.py @@ -1,3 +1,4 @@ +from typing import Any, Optional import networkx as nx from cascade.dataflow.dataflow import DataFlow, DataflowRef, IfNode @@ -184,13 +185,15 @@ def blocked_cfg(statement_graph: nx.DiGraph, entry: Statement) -> nx.DiGraph: class DataflowBuilder: - def __init__(self, function_def: nodes.FunctionDef): + def __init__(self, function_def: nodes.FunctionDef, globals: Optional[dict[str, Any]] = None): self.function_def = function_def self.name = self.function_def.name + self.globals = globals def build_cfg(self): - cfg: ControlFlowGraph = ControlFlowGraphBuilder.build([self.function_def] + self.function_def.body) + global_names = list(self.globals.keys()) if self.globals else [] + cfg: ControlFlowGraph = ControlFlowGraphBuilder.build([self.function_def] + self.function_def.body, global_names) self.type_map = ExtractTypeVisitor.extract(self.function_def) cfg.name = self.function_def.name @@ -217,7 +220,7 @@ def build_df(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> Data assert isinstance(rawblock, nodes.Bool), type(rawblock) node = IfNode(repr(rawblock.value)) else: - block = LocalBlock(list(statement_block), self.name, block_num, op_name) + block = LocalBlock(list(statement_block), self.name, block_num, op_name, self.globals) block_num += 1 node = block.to_node() df.add_block(block.compile()) diff --git a/src/cascade/frontend/generator/local_block.py b/src/cascade/frontend/generator/local_block.py index e16e973..2dd8575 100644 --- a/src/cascade/frontend/generator/local_block.py +++ b/src/cascade/frontend/generator/local_block.py @@ -1,5 +1,5 @@ from textwrap import indent -from typing import Any, Callable, Union, TYPE_CHECKING +from typing import Any, Callable, Optional, Union, TYPE_CHECKING from cascade.frontend.cfg import Statement @@ -45,7 +45,7 @@ def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: di class LocalBlock: - def __init__(self, statements: list[Statement], method_base_name: str, block_num: int, class_name: str): + def __init__(self, statements: list[Statement], method_base_name: str, block_num: int, class_name: str, globals: Optional[dict[str, Any]]=None): assert len(statements) > 0 # A block of statements should have no remote calls assert all([not s.is_remote() for s in statements]) @@ -77,13 +77,14 @@ def __init__(self, statements: list[Statement], method_base_name: str, block_num self.reads: set[str] = reads self.writes: set[str] = writes + self.globals = globals def compile(self) -> 'CompiledLocalBlock': return CompiledLocalBlock(self) def compile_function(self) -> Callable: local_scope = {} - 
exec(self.to_string(), {}, local_scope) + exec(self.to_string(), self.globals, local_scope) method_name = self.get_method_name() return local_scope[method_name] diff --git a/src/cascade/frontend/generator/unparser.py b/src/cascade/frontend/generator/unparser.py index 06bafc9..9093a1e 100644 --- a/src/cascade/frontend/generator/unparser.py +++ b/src/cascade/frontend/generator/unparser.py @@ -20,9 +20,11 @@ def unparse(block: RawBasicBlock): return f'{repr(block.target)} {block.op}= {unparse(block.value)}' case nodes.Assign: target, *rest = block.targets - return f'{repr(target)} = {unparse(block.value)}' + return f'{unparse(target)} = {unparse(block.value)}' case nodes.Attribute: return f'{block.value}.{block.attr}' + case nodes.AssignName: + return repr(block) case nodes.Name: return repr(block) case nodes.BinOp: @@ -31,8 +33,14 @@ def unparse(block: RawBasicBlock): return str(block) case nodes.Const: return str(block) - case nodes.Compare: + case nodes.NameConstant: return str(block) + case nodes.Compare: + res = unparse(block.left) + for op, operand in zip(block.ops, block.comparators): + res += " {} {}".format(op, unparse(operand)) + return res + case nodes.Bool: return repr(block) case nodes.If: @@ -40,5 +48,9 @@ def unparse(block: RawBasicBlock): raise NotImplementedError(type(block), "Should have been removed in previous CFG pass") case nodes.FunctionDef: return str(block).replace('"', "'") + case nodes.Call: + return "{}{}".format(str(block.func), tuple(block.args)) + case nodes.UnaryOp: + return "{}{}".format(str(block.op), unparse(block.operand)) case _: - return str(block) + raise NotImplementedError(type(block)) diff --git a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py index 2c49883..350a1c5 100644 --- a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py +++ b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py @@ -26,7 +26,7 @@ def get_total(item1: Stock, item2: Stock): # TODO: check that the produced ssa code made variables for # - item1.get_quantity() # - item2.get_quantity() - df: ControlFlowGraph = ControlFlowGraphBuilder.build([get_total] + get_total.body) + df: ControlFlowGraph = ControlFlowGraphBuilder.build([get_total] + get_total.body, globals=[]) for n in df.graph.nodes: print(n) for u, v in df.graph.edges: @@ -49,7 +49,7 @@ def get_total(item1: Stock, item2: Stock): # TODO: check that the produced ssa code made variables for # - item1.get_quantity() # - item2.get_quantity() - df: ControlFlowGraph = ControlFlowGraphBuilder.build([get_total] + get_total.body) + df: ControlFlowGraph = ControlFlowGraphBuilder.build([get_total] + get_total.body, globals=[]) print(df.graph.nodes) print(df.graph.edges) @@ -76,6 +76,6 @@ def test_branches(item1: Stock, item2: Stock): # TODO: check that the produced ssa code made variables for # - item1.get_quantity() # - item2.get_quantity() - df: ControlFlowGraph = ControlFlowGraphBuilder.build([test] + test.body) + df: ControlFlowGraph = ControlFlowGraphBuilder.build([test] + test.body, globals=[]) # print(df.graph.nodes) # print(df.graph.edges) \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index 09940e4..167df81 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -150,3 +150,36 @@ class ComposeReview: compose_review = ComposeReview("req", {}) func({"review_id_0": 
123}, compose_review.__dict__) assert compose_review.review_data["review_id"] == 123 + + +def test_import(): + program = dedent(""" +class Randomer: + @staticmethod + def rand(): + r = random.random() + return r +""") + + cfg: Cfg = setup_cfg(program) + blocks = cfg.block_list + user_class = blocks[2] + upload_unique: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] + + import random + sf = DataflowBuilder(upload_unique, {'random': random}) + sf.build_cfg() + for node in sf.cfg.get_nodes(): + print(node) + + dataflows = { + DataflowRef("Randomer", "rand"): DataFlow("rand", "Randomer", []), + } + + df = sf.build(dataflows, "Randomer") + + for block in df.blocks.values(): + print(block.function_string) + + rands = {df.blocks['rand_0'].call_block(variable_map={}, state=None) for x in range(10)} + assert len(rands) == 10 \ No newline at end of file diff --git a/tests/integration/flink/test_operators.py b/tests/integration/flink/test_operators.py index 4886cd0..d88a73b 100644 --- a/tests/integration/flink/test_operators.py +++ b/tests/integration/flink/test_operators.py @@ -44,7 +44,7 @@ def _test_stateful_operator(client, collector): client.send(event) result = wait_for_event_id(event[0]._id, collector) - print(result.result.__dict__) + print(result.result) event = item_init.generate_event({"key": "fork", "price": 5}, key="fork") client.send(event) @@ -53,7 +53,7 @@ def _test_stateful_operator(client, collector): client.send(event) result = wait_for_event_id(event[0]._id, collector) - print(result.result.__dict__) + print(result.result) print(user_buy_2.to_dot()) diff --git a/tests/integration/flink/utils.py b/tests/integration/flink/utils.py index 447212d..5f0f5c4 100644 --- a/tests/integration/flink/utils.py +++ b/tests/integration/flink/utils.py @@ -24,7 +24,7 @@ def init_cascade_from_module(import_path: str): exec(f'import {import_path}') cascade.core.init() -def init_flink_runtime(import_path: str, in_topic=None, out_topic=None, internal_topic=None, parallelism=None, **init_args) -> FlinkRuntime: +def init_flink_runtime(import_path: str, in_topic=None, out_topic=None, internal_topic=None, parallelism=4, **init_args) -> FlinkRuntime: init_cascade_from_module(import_path) if in_topic is None: From 1c98c4cbbd6fc58e079d9902060626e5d4f3b179 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 10 Apr 2025 16:33:16 +0200 Subject: [PATCH 30/37] Add dynamic prefetch experiment --- deathstar_movie_review/start_benchmark.py | 4 +- .../test_movie_review_demo.py | 6 +- .../dynamic_prefetching/run_experiments.py | 54 +++++ .../dynamic_prefetching/run_prefetcher.py | 188 ++++++++++++++---- experiments/dynamic_prefetching/submit_job.py | 49 +++++ src/cascade/dataflow/dataflow.py | 30 ++- .../dataflow/optimization/parallelization.py | 24 ++- src/cascade/runtime/python_runtime.py | 4 +- tests/integration/flink/test_branching.py | 2 +- .../flink/test_collect_operator.py | 2 +- tests/optimizations/test_parallelize.py | 7 +- 11 files changed, 305 insertions(+), 65 deletions(-) create mode 100644 experiments/dynamic_prefetching/run_experiments.py create mode 100644 experiments/dynamic_prefetching/submit_job.py diff --git a/deathstar_movie_review/start_benchmark.py b/deathstar_movie_review/start_benchmark.py index 21028b4..dbbf73f 100644 --- a/deathstar_movie_review/start_benchmark.py +++ b/deathstar_movie_review/start_benchmark.py @@ -19,7 +19,7 @@ from tests.integration.flink.utils import init_cascade_from_module, init_flink_runtime import 
cascade -from cascade.dataflow.optimization.parallelization import parallelize +from cascade.dataflow.optimization.parallelization import parallelize_until_if from cascade.dataflow.dataflow import DataflowRef,EventResult from cascade.runtime.flink_runtime import FlinkClientSync @@ -219,7 +219,7 @@ def main(): init_client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) df_baseline = cascade.core.dataflows[DataflowRef("Frontend", "compose")] - df_parallel = parallelize(df_baseline) + df_parallel, _ = parallelize_until_if(df_baseline) df_parallel.name = "compose_parallel" cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel print(cascade.core.dataflows.keys()) diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index 1fceeb3..11e93d1 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -8,7 +8,7 @@ from cascade.runtime.flink_runtime import FlinkClientSync from cascade.dataflow.dataflow import DataflowRef -from cascade.dataflow.optimization.parallelization import parallelize +from cascade.dataflow.optimization.parallelization import parallelize_until_if from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime @@ -35,7 +35,7 @@ def test_deathstar_movie_demo_python(): cascade.core.init() compose_df = cascade.core.dataflows[DataflowRef("Frontend", "compose")] - df_parallel = parallelize(compose_df) + df_parallel, _ = parallelize_until_if(compose_df) df_parallel.name = "compose_parallel" cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel print(df_parallel.to_dot()) @@ -54,7 +54,7 @@ def test_deathstar_movie_demo_flink(): runtime = utils.init_flink_runtime("deathstar_movie_review.entities.entities") compose_df = cascade.core.dataflows[DataflowRef("Frontend", "compose")] - df_parallel = parallelize(compose_df) + df_parallel, _ = parallelize_until_if(compose_df) df_parallel.name = "compose_parallel" cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel runtime.add_dataflow(df_parallel) diff --git a/experiments/dynamic_prefetching/run_experiments.py b/experiments/dynamic_prefetching/run_experiments.py new file mode 100644 index 0000000..ef35675 --- /dev/null +++ b/experiments/dynamic_prefetching/run_experiments.py @@ -0,0 +1,54 @@ +import subprocess +import time + + +# Define experiment parameters as a list of dictionaries +experiments = [ + {"parallelism": 4, "benchmark_args": {"requests_per_second": 10000, "seconds": 30, "threads": 20, "experiment": "baseline", "chance": 0.99}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 10000, "seconds": 30, "threads": 20, "experiment": "prefetch", "chance": 0.99}}, +] + + + + +print("Tearing down docker containers") +subprocess.run(["docker", "compose", "down"], check=False) + +for exp in experiments: + print(f"Starting experiment {exp}") + + # Start docker compose + subprocess.run(["docker", "compose", "up", "-d", "--scale", f"taskmanager={exp['parallelism']}", "--force-recreate"], check=True, env={ + "TASK_SLOTS": "1" + }) + + time.sleep(10) + + # Run Flink job + + flink_cmd = [ + "flink", "run", "--pyFiles", "/home/lvanmol/cascade/src,/home/lvanmol/cascade", + "--pyModule", "experiments.dynamic_prefetching.submit_job", "-d", "-p", str(exp['parallelism']) + ] + subprocess.run(flink_cmd, check=True) + + # Start benchmark + # filename = 
f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['requests_per_second']}.pkl" + benchmark_cmd = [ + "python", "-u", "-m", "experiments.dynamic_prefetching.run_prefetcher", + ] + + for arg, val in exp['benchmark_args'].items(): + benchmark_cmd.append(f"--{arg}") + benchmark_cmd.append(str(val)) + subprocess.run(benchmark_cmd, check=True) + + # Sleep for experiment duration + # print(f"Sleeping for {exp['sleep']} seconds...") + # time.sleep(exp['sleep']) + + # Stop docker compose + subprocess.run(["docker", "compose", "down"], check=False) + + print(f"Experiment completed.") + diff --git a/experiments/dynamic_prefetching/run_prefetcher.py b/experiments/dynamic_prefetching/run_prefetcher.py index 30fd9e6..9d83350 100644 --- a/experiments/dynamic_prefetching/run_prefetcher.py +++ b/experiments/dynamic_prefetching/run_prefetcher.py @@ -1,71 +1,185 @@ +import argparse import logging +from multiprocessing import Pool import sys import os - +from typing import Counter, Literal +import pandas as pd sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../src"))) sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) import cascade +from cascade.dataflow.optimization.parallelization import parallelize_until_if from cascade.runtime.flink_runtime import FlinkClientSync -from cascade.dataflow.dataflow import DataflowRef -from tests.integration.flink.utils import create_topics, init_flink_runtime, wait_for_event_id -from pyflink.datastream.data_stream import CloseableIterator +from cascade.dataflow.dataflow import DataFlow, DataflowRef, EventResult +from tests.integration.flink.utils import create_topics, init_cascade_from_module, init_flink_runtime, wait_for_event_id +from timeit import default_timer as timer KAFKA_BROKER = "localhost:9092" KAFKA_FLINK_BROKER = "kafka:9093" # If running a flink cluster and kafka inside docker, the broker url might be different -IN_TOPIC = "ds-movie-in" -OUT_TOPIC = "ds-movie-out" -INTERNAL_TOPIC = "ds-movie-internal" +IN_TOPIC = "prefetcher-in" +OUT_TOPIC = "prefetcher-out" +INTERNAL_TOPIC = "prefetcher-internal" def main(): - create_topics() - - logger = logging.getLogger("cascade") - logger.setLevel("DEBUG") - runtime = init_flink_runtime("experiments.dynamic_prefetching.entities", parallelism=4) + init_cascade_from_module("experiments.dynamic_prefetching.entities") - print(cascade.core.dataflows.keys()) - client = FlinkClientSync() - runtime.run(run_async=True) - # assert isinstance(collector, CloseableIterator) + # logger = logging.getLogger("cascade") + # logger.setLevel("DEBUG") + # runtime = init_flink_runtime("experiments.dynamic_prefetching.entities", parallelism=4) - try: - run_test(client) - finally: - client.close() + print(cascade.core.dataflows.keys()) -import time -def run_test(client): - logger = logging.getLogger("cascade") - logger.setLevel("DEBUG") baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline")] prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch")] - for block in baseline.blocks.values(): - print(block.function_string) + print(baseline.to_dot()) + + par, rest = parallelize_until_if(prefetch) - for block in prefetch.blocks.values(): - print(block.function_string) + # join the two dataflows + par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] + for edge in rest.edges: + par.add_edge(edge) + assert len(rest.entry) == 1 + assert len(par_exit) == 1 + par.add_edge_refs(par_exit[0], rest.entry[0].id, None) - event = 
baseline.generate_event({"branch_chance_0": 0.0}) - print(event) - result = client.send(event, block=True) - print(result) - # for _ in range(10): - # event = baseline.generate_event({"branch_chance_0": 0.5}) - # client.send(event) - # result = wait_for_event_id(event[0]._id, collector) - # print(result.result) + print(par.to_dot()) + par.name = "prefetch_parallel" + cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] = par + run_test() +import time +def wait_for_futures(client: FlinkClientSync): + print("waiting") + done = False + while not done: + done = True + for event_id, fut in client._futures.items(): + result = fut["ret"] + if result is None: + done = False + time.sleep(0.5) + break + futures = client._futures + return futures + +def generate_event(exp: Literal["baseline", "prefetch"], chance: float): + baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline")] + prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] + df = prefetch if exp == "prefetch" else baseline + + return df.generate_event({"branch_chance_0": chance}) + +def runner(args): + chance, bursts, requests_per_second, exp = args + client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) + sleep_time = 0.95 / requests_per_second + + start = timer() + for b in range(bursts): + sec_start = timer() + + # send burst of messages + for i in range(requests_per_second): + + # sleep sometimes between messages + # if i % (messages_per_burst // sleeps_per_burst) == 0: + time.sleep(sleep_time) + event = generate_event(exp, chance) + client.send(event) + + client.flush() + sec_end = timer() + + # wait out the second + lps = sec_end - sec_start + if lps < 1: + time.sleep(1 - lps) + + end = timer() + avg_send_latency = (end - start) / bursts + print(f'Average send latency per burst for generator was: {avg_send_latency}') + if avg_send_latency > 1.1: + print(f'This is higher than expected (1). 
Maybe increase the number of threads?') + futures = wait_for_futures(client) + client.close() + return futures + +def run_test(): + logger = logging.getLogger("cascade") + logger.setLevel("INFO") + + + + parser = argparse.ArgumentParser(description="Run the benchmark and save results.") + parser.add_argument("--requests_per_second", type=int, default=10, help="Number of messages per burst") + parser.add_argument("--seconds", type=int, default=100, help="Number of seconds to benchmark for") + parser.add_argument("--threads", type=int, default=1, help="Number of concurrent threads") + parser.add_argument("--chance", type=float, default=0.5, help="Chance") + parser.add_argument("--experiment", type=str, default="baseline", help="Experiment type") + args = parser.parse_args() + + assert args.experiment in ["baseline", "prefetch"] + rps_per_thread = int(args.requests_per_second / args.threads) + print(f"{args.chance} - {args.experiment}: {args.requests_per_second} rps for {args.seconds}s") + print(f"Actual requests per second is {int(rps_per_thread * args.threads)} (due to rounding)") + + + func_args = [(args.chance, args.seconds,rps_per_thread,args.experiment)] + with Pool(args.threads) as p: + results = p.map(runner, func_args) + + results = {k: v for d in results for k, v in d.items()} + + count = Counter([r["ret"].result for r in results.values()]) + print(count) + to_pandas(results) + + + +def to_pandas(futures_dict): + # Prepare the data for the DataFrame + data = [] + for event_id, event_data in futures_dict.items(): + ret: EventResult = event_data.get("ret") + row = { + "event_id": event_id, + "sent": str(event_data.get("sent")), + "sent_t": event_data.get("sent_t"), + "ret": str(event_data.get("ret")), + "ret_t": event_data.get("ret_t"), + "roundtrip": ret.metadata["roundtrip"] if ret else None, + "flink_time": ret.metadata["flink_time"] if ret else None, + "deser_times": ret.metadata["deser_times"] if ret else None, + "loops": ret.metadata["loops"] if ret else None, + "latency": event_data["ret_t"][1] - event_data["sent_t"][1] if ret else None + } + data.append(row) + + # Create a DataFrame and save it as a pickle file + df = pd.DataFrame(data) + + # Multiply flink_time by 1000 to convert to milliseconds + df['flink_time'] = df['flink_time'] * 1000 + flink_time = df['flink_time'].median() + latency = df['latency'].median() + flink_prct = float(flink_time) * 100 / latency + print(f"Median latency : {latency:.2f} ms") + print(f"Median Flink time : {flink_time:.2f} ms ({flink_prct:.2f}%)") + + return df + if __name__ == "__main__": main() \ No newline at end of file diff --git a/experiments/dynamic_prefetching/submit_job.py b/experiments/dynamic_prefetching/submit_job.py new file mode 100644 index 0000000..a2c993e --- /dev/null +++ b/experiments/dynamic_prefetching/submit_job.py @@ -0,0 +1,49 @@ +import cascade +from cascade.dataflow.dataflow import DataflowRef +from cascade.dataflow.optimization.parallelization import parallelize_until_if +from tests.integration.flink.utils import create_topics, init_flink_runtime + + +KAFKA_BROKER = "localhost:9092" +KAFKA_FLINK_BROKER = "kafka:9093" # If running a flink cluster and kafka inside docker, the broker url might be different + +IN_TOPIC = "prefetcher-in" +OUT_TOPIC = "prefetcher-out" +INTERNAL_TOPIC = "prefetcher-internal" + + + +def main(): + create_topics(IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) + + runtime = init_flink_runtime("experiments.dynamic_prefetching.entities", IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC, 
kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=True, parallelism=None) + + + print(cascade.core.dataflows.keys()) + + baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline")] + prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch")] + + print(baseline.to_dot()) + + par, rest = parallelize_until_if(prefetch) + + # join the two dataflows + par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] + for edge in rest.edges: + par.add_edge(edge) + assert len(rest.entry) == 1 + assert len(par_exit) == 1 + par.add_edge_refs(par_exit[0], rest.entry[0].id, None) + + + print(par.to_dot()) + par.name = "prefetch_parallel" + cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] = par + + runtime.add_dataflow(par) + + runtime.run() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index df4c7ec..c7685fa 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -51,8 +51,6 @@ class IfNode(Node): def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: if_cond = event.variable_map[self.predicate_var] - print(self.predicate_var) - print(if_cond) targets = [] for edge in event.target.outgoing_edges: assert edge.if_conditional is not None @@ -218,6 +216,7 @@ def __init__(self, name: str, op_name: str, args: Optional[list[str]]=None): self.name: str = name self.adjacency_list: dict[int, list[int]] = {} self.nodes: dict[int, Node] = {} + self.edges: list[Edge] = [] self.entry: List[Node] = [] self.operator_name = op_name if args: @@ -228,18 +227,26 @@ def __init__(self, name: str, op_name: str, args: Optional[list[str]]=None): def ref(self) -> DataflowRef: return DataflowRef(self.operator_name, self.name) - # def get_operator(self) -> Operator: - # return cascade.core.operators[self.op_name] def add_node(self, node: Node): """Add a node to the Dataflow graph if it doesn't already exist.""" if node.id not in self.adjacency_list: + node.outgoing_edges = [] self.adjacency_list[node.id] = [] self.nodes[node.id] = node def add_block(self, block: 'CompiledLocalBlock'): self.blocks[block.get_method_name()] = block + def copy(self) -> 'DataFlow': + copy = DataFlow(self.name, self.operator_name, self.args) + for edge in self.edges: + copy.add_edge(edge) + copy.entry = self.entry + return copy + + + def add_edge(self, edge: Edge): """Add an edge to the Dataflow graph. Nodes that don't exist will be added to the graph automatically.""" self.add_node(edge.from_node) @@ -247,6 +254,7 @@ def add_edge(self, edge: Edge): if edge.to_node.id not in self.adjacency_list[edge.from_node.id]: self.adjacency_list[edge.from_node.id].append(edge.to_node.id) edge.from_node.outgoing_edges.append(edge) + self.edges.append(edge) def add_edge_refs(self, u: int, v: int, if_conditional=None): """Add an edge using node IDs""" @@ -264,6 +272,14 @@ def remove_edge(self, from_node: Node, to_node: Node): edge for edge in from_node.outgoing_edges if edge.to_node.id != to_node.id ] + # TODO: replace self.edges with a better algorithm for removal. 
+ # probably by adding edge information (like edge.if_conditional, or future things) + # to self.adjacencylist + for i, edge in enumerate(self.edges): + if edge.from_node == from_node and edge.to_node == to_node: + break + self.edges.pop(i) + def remove_node(self, node: Node): """Remove a node from the DataFlow graph and reconnect its parents to its children.""" if node.id not in self.nodes: @@ -279,8 +295,8 @@ def remove_node(self, node: Node): # Set df entry if len(self.entry) == 1 and self.entry[0] == node: print(children) - assert len(children) == 1, "cannot remove entry node if it doesn't exactly one child" - self.entry = [self.nodes[children[0]]] + assert len(children) <= 1, "cannot remove entry node if it has more than two children" + self.entry = [self.nodes[id] for id in children] # Connect each parent to each child for parent_id in parents: @@ -316,7 +332,7 @@ def get_predecessors(self, node: Node) -> List[Node]: def to_dot(self) -> str: """Output the DataFlow graph in DOT (Graphviz) format.""" - lines = [f"digraph {self.operator_name}.{self.name} {{"] + lines = [f"digraph {self.operator_name}_{self.name} {{"] # Add nodes for node in self.nodes.values(): diff --git a/src/cascade/dataflow/optimization/parallelization.py b/src/cascade/dataflow/optimization/parallelization.py index 574810e..1ce9fa8 100644 --- a/src/cascade/dataflow/optimization/parallelization.py +++ b/src/cascade/dataflow/optimization/parallelization.py @@ -181,8 +181,8 @@ """ from dataclasses import dataclass -from typing import Any -from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, DataFlow, Edge, Node +from typing import Any, Tuple +from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, DataFlow, Edge, IfNode, Node import cascade @dataclass @@ -193,7 +193,9 @@ class AnnotatedNode: import networkx as nx -def parallelize(df: DataFlow): +def parallelize_until_if(df: DataFlow) -> Tuple[DataFlow, DataFlow]: + """Parallelize df, stopping at the first if node. + The first dataflow is the parallelized dataflow up until the first if node. The second dataflow is the rest of the dataflow""" # create the dependency graph ans = [] # since we use SSA, every variable has exactly one node that writes it @@ -208,6 +210,8 @@ def parallelize(df: DataFlow): method = df.blocks[node.method.method_name] reads = method.reads writes = method.writes + elif isinstance(node, IfNode): + break else: raise ValueError(f"unsupported node type: {type(node)}") @@ -217,7 +221,7 @@ def parallelize(df: DataFlow): graph.add_node(node.id) nodes_with_indegree_0 = set(graph.nodes) - n_map = df.nodes + n_map = df.nodes.copy() for node in ans: for read in node.reads: if read in write_nodes: @@ -234,12 +238,15 @@ def parallelize(df: DataFlow): updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0] prev_node = None + rest = df.copy() + while len(nodes_with_indegree_0) > 0: # remove nodes from graph - children = [] + children = set() for node_id in nodes_with_indegree_0: - children.extend(graph.successors(node_id)) + children.update(graph.successors(node_id)) graph.remove_node(node_id) + rest.remove_node(n_map[node_id]) updated.add_node(n_map[node_id]) @@ -250,9 +257,6 @@ def parallelize(df: DataFlow): next_nodes.add(child) if len(nodes_with_indegree_0) > 1: - # TODO: maybe collect node should just infer from it's predecessors? 
- # like it can only have DataFlowNode predecessors - # TODO: rename DataflowNode to EntityCall collect_node = CollectNode(len(nodes_with_indegree_0)) for node_id in nodes_with_indegree_0: if prev_node: @@ -268,4 +272,4 @@ def parallelize(df: DataFlow): nodes_with_indegree_0 = next_nodes - return updated + return updated, rest diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index a19269c..9947023 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -20,8 +20,8 @@ def process(self, event: Event): print(f"PythonStatefulOperator[{self.operator.entity.__name__}[{key}]]: {event}") if isinstance(event.target.method, InitClass): - result = self.operator.handle_init_class(*event.variable_map.values()) - self.states[key] = result.__dict__ + result = self.operator.handle_init_class(*event.variable_map.values()).__dict__ + self.states[key] = result elif isinstance(event.target.method, InvokeMethod): state = self.states[key] diff --git a/tests/integration/flink/test_branching.py b/tests/integration/flink/test_branching.py index 4d8760d..62f81f4 100644 --- a/tests/integration/flink/test_branching.py +++ b/tests/integration/flink/test_branching.py @@ -2,7 +2,7 @@ from pyflink.datastream.data_stream import CloseableIterator from cascade.dataflow.dataflow import DataflowRef -from cascade.dataflow.optimization.parallelization import parallelize +from cascade.dataflow.optimization.parallelization import parallelize_until_if from cascade.runtime.flink_runtime import FlinkClientSync import tests.integration.flink.utils as utils diff --git a/tests/integration/flink/test_collect_operator.py b/tests/integration/flink/test_collect_operator.py index ba80e1e..54a29d3 100644 --- a/tests/integration/flink/test_collect_operator.py +++ b/tests/integration/flink/test_collect_operator.py @@ -2,7 +2,7 @@ from pyflink.datastream.data_stream import CloseableIterator from cascade.dataflow.dataflow import DataflowRef -from cascade.dataflow.optimization.parallelization import parallelize +from cascade.dataflow.optimization.parallelization import parallelize_until_if from cascade.runtime.flink_runtime import FlinkClientSync import tests.integration.flink.utils as utils diff --git a/tests/optimizations/test_parallelize.py b/tests/optimizations/test_parallelize.py index 7ac51bf..287c256 100644 --- a/tests/optimizations/test_parallelize.py +++ b/tests/optimizations/test_parallelize.py @@ -7,7 +7,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../src"))) from cascade.dataflow.dataflow import DataflowRef -from cascade.dataflow.optimization.parallelization import parallelize +from cascade.dataflow.optimization.parallelization import parallelize_until_if from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime import cascade @@ -30,10 +30,13 @@ def test_parallelize(): print(df) print(df.nodes) - df_parallel = parallelize(df) + print(df.to_dot()) + df_parallel, _ = parallelize_until_if(df) df_parallel.name = "get_total_parallel" cascade.core.dataflows[DataflowRef("Test", "get_total_parallel")] = df_parallel + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 2 assert len(df.entry) == 1 From 1c8d3fd4237091af3119256e752631faaf58773b Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 10 Apr 2025 19:15:12 +0200 Subject: [PATCH 31/37] Tune prefetcher experiment --- .gitignore | 3 +- docker-compose.yml | 10 +++- 
experiments/dynamic_prefetching/entities.py | 6 ++- .../dynamic_prefetching/run_experiments.py | 8 ++- .../dynamic_prefetching/run_prefetcher.py | 49 +++++++++++-------- experiments/dynamic_prefetching/submit_job.py | 24 +++------ src/cascade/dataflow/dataflow.py | 1 - src/cascade/frontend/generator/unparser.py | 4 +- 8 files changed, 60 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index 9a91afa..6bc2d46 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ build # Experiment artifacts *.png -*.pkl \ No newline at end of file +*.pkl +*.csv \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index e77df7c..bac450e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -73,4 +73,12 @@ services: - | FLINK_PROPERTIES= jobmanager.rpc.address: jobmanager - taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} \ No newline at end of file + taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} + + deploy: + resources: + limits: + cpus: "4" + memory: "8G" + mem_limit: 8G + cpus: "4" \ No newline at end of file diff --git a/experiments/dynamic_prefetching/entities.py b/experiments/dynamic_prefetching/entities.py index 77b6919..2bd4e73 100644 --- a/experiments/dynamic_prefetching/entities.py +++ b/experiments/dynamic_prefetching/entities.py @@ -1,10 +1,12 @@ from cascade import cascade import random +import time -@cascade +@cascade(globals={'time': time}) class Oracle(): @staticmethod def get() -> int: + time.sleep(0.01) return 42 @cascade(globals={'random': random}) @@ -12,6 +14,7 @@ class Prefetcher: @staticmethod def prefetch(branch_chance: float): prefetched_value = Oracle.get() + and_also = Oracle.get() rand = random.random() cond = rand < branch_chance if cond: @@ -21,6 +24,7 @@ def prefetch(branch_chance: float): @staticmethod def baseline(branch_chance: float): + and_also = Oracle.get() cond = random.random() < branch_chance if cond: value = Oracle.get() diff --git a/experiments/dynamic_prefetching/run_experiments.py b/experiments/dynamic_prefetching/run_experiments.py index ef35675..3e64229 100644 --- a/experiments/dynamic_prefetching/run_experiments.py +++ b/experiments/dynamic_prefetching/run_experiments.py @@ -4,8 +4,12 @@ # Define experiment parameters as a list of dictionaries experiments = [ - {"parallelism": 4, "benchmark_args": {"requests_per_second": 10000, "seconds": 30, "threads": 20, "experiment": "baseline", "chance": 0.99}}, - {"parallelism": 4, "benchmark_args": {"requests_per_second": 10000, "seconds": 30, "threads": 20, "experiment": "prefetch", "chance": 0.99}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "baseline", "chance": 0.9}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "prefetch", "chance": 0.9}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "baseline", "chance": 0.5}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "prefetch", "chance": 0.5}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "baseline", "chance": 0.1}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "prefetch", "chance": 0.1}}, ] diff --git a/experiments/dynamic_prefetching/run_prefetcher.py b/experiments/dynamic_prefetching/run_prefetcher.py index 
9d83350..b4a8ee6 100644 --- a/experiments/dynamic_prefetching/run_prefetcher.py +++ b/experiments/dynamic_prefetching/run_prefetcher.py @@ -24,8 +24,22 @@ OUT_TOPIC = "prefetcher-out" INTERNAL_TOPIC = "prefetcher-internal" +def gen_parallel(df): + par, rest = parallelize_until_if(df) + + # join the two dataflows + par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] + for edge in rest.edges: + par.add_edge(edge) + assert len(rest.entry) == 1 + assert len(par_exit) == 1 + par.add_edge_refs(par_exit[0], rest.entry[0].id, None) + print(par.to_dot()) + par.name = df.name + "_parallel" + return par + def main(): init_cascade_from_module("experiments.dynamic_prefetching.entities") @@ -40,22 +54,15 @@ def main(): baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline")] prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch")] - print(baseline.to_dot()) - - par, rest = parallelize_until_if(prefetch) - # join the two dataflows - par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] - for edge in rest.edges: - par.add_edge(edge) - assert len(rest.entry) == 1 - assert len(par_exit) == 1 - par.add_edge_refs(par_exit[0], rest.entry[0].id, None) + pre_par = gen_parallel(prefetch) + cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] = pre_par + base_par = gen_parallel(baseline) + cascade.core.dataflows[DataflowRef("Prefetcher", "baseline_parallel")] = base_par - print(par.to_dot()) - par.name = "prefetch_parallel" - cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] = par + print(base_par.to_dot()) + print(pre_par.to_dot()) run_test() @@ -76,7 +83,7 @@ def wait_for_futures(client: FlinkClientSync): return futures def generate_event(exp: Literal["baseline", "prefetch"], chance: float): - baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline")] + baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline_parallel")] prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] df = prefetch if exp == "prefetch" else baseline @@ -145,7 +152,8 @@ def run_test(): count = Counter([r["ret"].result for r in results.values()]) print(count) - to_pandas(results) + df = to_pandas(results) + df.to_csv(f"{args.experiment}_{args.chance}_{args.requests_per_second}.csv") @@ -156,13 +164,8 @@ def to_pandas(futures_dict): ret: EventResult = event_data.get("ret") row = { "event_id": event_id, - "sent": str(event_data.get("sent")), - "sent_t": event_data.get("sent_t"), - "ret": str(event_data.get("ret")), - "ret_t": event_data.get("ret_t"), - "roundtrip": ret.metadata["roundtrip"] if ret else None, + "result": ret.result if ret else None, "flink_time": ret.metadata["flink_time"] if ret else None, - "deser_times": ret.metadata["deser_times"] if ret else None, "loops": ret.metadata["loops"] if ret else None, "latency": event_data["ret_t"][1] - event_data["sent_t"][1] if ret else None } @@ -179,6 +182,10 @@ def to_pandas(futures_dict): print(f"Median latency : {latency:.2f} ms") print(f"Median Flink time : {flink_time:.2f} ms ({flink_prct:.2f}%)") + latency = df['latency'].mean() + print(f"Mean latency : {latency:.2f} ms") + + return df if __name__ == "__main__": diff --git a/experiments/dynamic_prefetching/submit_job.py b/experiments/dynamic_prefetching/submit_job.py index a2c993e..44a5982 100644 --- a/experiments/dynamic_prefetching/submit_job.py +++ b/experiments/dynamic_prefetching/submit_job.py @@ -1,6 +1,7 @@ import cascade from 
cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.parallelization import parallelize_until_if +from experiments.dynamic_prefetching.run_prefetcher import gen_parallel from tests.integration.flink.utils import create_topics, init_flink_runtime @@ -24,25 +25,14 @@ def main(): baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline")] prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch")] - print(baseline.to_dot()) + pre_par = gen_parallel(prefetch) + cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] = pre_par + runtime.add_dataflow(pre_par) - par, rest = parallelize_until_if(prefetch) + base_par = gen_parallel(baseline) + cascade.core.dataflows[DataflowRef("Prefetcher", "baseline_parallel")] = base_par + runtime.add_dataflow(base_par) - # join the two dataflows - par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] - for edge in rest.edges: - par.add_edge(edge) - assert len(rest.entry) == 1 - assert len(par_exit) == 1 - par.add_edge_refs(par_exit[0], rest.entry[0].id, None) - - - print(par.to_dot()) - par.name = "prefetch_parallel" - cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] = par - - runtime.add_dataflow(par) - runtime.run() if __name__ == "__main__": diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index c7685fa..4940eb6 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -294,7 +294,6 @@ def remove_node(self, node: Node): # Set df entry if len(self.entry) == 1 and self.entry[0] == node: - print(children) assert len(children) <= 1, "cannot remove entry node if it has more than two children" self.entry = [self.nodes[id] for id in children] diff --git a/src/cascade/frontend/generator/unparser.py b/src/cascade/frontend/generator/unparser.py index 9093a1e..8561a55 100644 --- a/src/cascade/frontend/generator/unparser.py +++ b/src/cascade/frontend/generator/unparser.py @@ -52,5 +52,7 @@ def unparse(block: RawBasicBlock): return "{}{}".format(str(block.func), tuple(block.args)) case nodes.UnaryOp: return "{}{}".format(str(block.op), unparse(block.operand)) + case nodes.Expr: + return unparse(block.value) case _: - raise NotImplementedError(type(block)) + raise NotImplementedError(f"{type(block)}: {block}") From 6a6818336ff2a7b21a8e5be2a9a8c22d3774dae9 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 10 Apr 2025 19:45:07 +0200 Subject: [PATCH 32/37] Add code motion test --- src/cascade/frontend/generator/unparser.py | 6 +- src/cascade/runtime/python_runtime.py | 2 +- tests/optimizations/code_motion_entities.py | 36 +++++++ tests/optimizations/test_parallelize.py | 102 ++++++++++++++++++++ 4 files changed, 144 insertions(+), 2 deletions(-) create mode 100644 tests/optimizations/code_motion_entities.py diff --git a/src/cascade/frontend/generator/unparser.py b/src/cascade/frontend/generator/unparser.py index 8561a55..0fb8659 100644 --- a/src/cascade/frontend/generator/unparser.py +++ b/src/cascade/frontend/generator/unparser.py @@ -40,7 +40,6 @@ def unparse(block: RawBasicBlock): for op, operand in zip(block.ops, block.comparators): res += " {} {}".format(op, unparse(operand)) return res - case nodes.Bool: return repr(block) case nodes.If: @@ -54,5 +53,10 @@ def unparse(block: RawBasicBlock): return "{}{}".format(str(block.op), unparse(block.operand)) case nodes.Expr: return unparse(block.value) + case nodes.BoolOp: + res = 
unparse(block.values[0]) + for v in block.values[1:]: + res += " {} {}".format(block.op, unparse(v)) + return res case _: raise NotImplementedError(f"{type(block)}: {block}") diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index 9947023..f8ee8b6 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -66,7 +66,7 @@ def __init__(self): self.state = {} def process(self, event: Event): - key = event.target.id + key = event._id if key not in self.state: self.state[key] = [event] else: diff --git a/tests/optimizations/code_motion_entities.py b/tests/optimizations/code_motion_entities.py new file mode 100644 index 0000000..49ceb69 --- /dev/null +++ b/tests/optimizations/code_motion_entities.py @@ -0,0 +1,36 @@ +from cascade import cascade + +@cascade +class Item: + def __init__(self, item: str, quantity: int, price: int): + self.item = item + self.quantity = quantity + self.price = price + + def get_quantity(self): + return self.quantity + + def get_price(self): + return self.price + + + +@cascade +class User: + def __init__(self, balance: int): + self.balance = balance + + def checkout_item(self, item: Item): + stock = item.get_quantity() + in_stock = stock > 0 + price = item.get_price() + can_buy = price <= self.balance + condition = in_stock and can_buy + if condition: + self.balance = self.balance - price + return True + else: + return False + + def get_balance(self) -> int: + return self.balance \ No newline at end of file diff --git a/tests/optimizations/test_parallelize.py b/tests/optimizations/test_parallelize.py index 287c256..578f2d7 100644 --- a/tests/optimizations/test_parallelize.py +++ b/tests/optimizations/test_parallelize.py @@ -62,3 +62,105 @@ def test_parallelize(): event = df_parallel.generate_event({"item1_0": "fork", "item2_0": "spoon"}) result = client.send(event) assert result == 30 + +def gen_parallel(df): + par, rest = parallelize_until_if(df) + + # join the two dataflows + par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] + for edge in rest.edges: + par.add_edge(edge) + assert len(rest.entry) == 1 + assert len(par_exit) == 1 + par.add_edge_refs(par_exit[0], rest.entry[0].id, None) + + + print(par.to_dot()) + par.name = df.name + "_parallel" + return par + +def test_code_motion(): + cascade.core.clear() # clear cascadeds registerd classes. 
+ assert not cascade.core.registered_classes, "Registered classes should be empty before importing a Cascade \ + Module" + # import the module + import_module_name: str = 'code_motion_entities' + exec(f'import tests.optimizations.{import_module_name}') + + cascade.core.init() + + print(cascade.core.operators) + user_op = cascade.core.operators["User"] + item_op = cascade.core.operators["Item"] + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + checkout = cascade.core.dataflows[DataflowRef("User", "checkout_item")] + balance = cascade.core.dataflows[DataflowRef("User", "get_balance")] + + checkout_parallel = gen_parallel(checkout) + print(checkout.to_dot()) + cascade.core.dataflows[DataflowRef("User", "checkout_item_parallel")] = checkout_parallel + + print(checkout_parallel.to_dot()) + + assert len(checkout_parallel.entry) == 2 + assert len(checkout.entry) == 1 + + runtime = PythonRuntime() + runtime.add_operator(item_op) + runtime.add_operator(user_op) + runtime.run() + + client = PythonClientSync(runtime) + + event = item_init.generate_event({"item": "fork", "quantity": 10, "price": 10}, key="fork") + result = client.send(event) + print(result) + + event = item_init.generate_event({"item": "spoon", "quantity": 0, "price": 10}, key="spoon") + result = client.send(event) + print(result) + + event = item_init.generate_event({"item": "knife", "quantity": 10, "price": 100}, key="knife") + result = client.send(event) + print(result) + + event = user_init.generate_event({"balance": 50}, key="user") + result = client.send(event) + + + # buy spoon fails + event = checkout.generate_event({"item_0": "spoon"}, key="user") + result = client.send(event) + assert not result + + event = checkout_parallel.generate_event({"item_0": "spoon"}, key="user") + result = client.send(event) + assert not result + + + # buy knife fails + event = checkout.generate_event({"item_0": "knife"}, key="user") + result = client.send(event) + assert not result + + event = checkout_parallel.generate_event({"item_0": "knife"}, key="user") + result = client.send(event) + assert not result + + + # buy fork works! 
+ event = checkout.generate_event({"item_0": "fork"}, key="user") + result = client.send(event) + assert result + + event = checkout_parallel.generate_event({"item_0": "fork"}, key="user") + result = client.send(event) + assert result + + event = balance.generate_event({}, key="user") + result = client.send(event) + assert result == 30 + + + From 1db6856a14b3176f89948b2b4104fc7bb0836ff1 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 15 Apr 2025 12:41:12 +0200 Subject: [PATCH 33/37] Fix parallelize for deathstar bench --- .gitignore | 6 +- .vscode/launch.json | 17 -- .vscode/settings.json | 8 - deathstar_movie_review/demo.py | 4 +- deathstar_movie_review/start_benchmark.py | 54 ++++- run_experiments_gil_workaround.py | 36 +--- src/cascade/dataflow/dataflow.py | 56 +++-- .../dataflow/optimization/parallelization.py | 193 +----------------- src/cascade/frontend/generator/local_block.py | 6 +- src/cascade/runtime/python_runtime.py | 2 +- .../dataflow_analysis/test_entities.py | 6 +- 11 files changed, 105 insertions(+), 283 deletions(-) delete mode 100644 .vscode/launch.json delete mode 100644 .vscode/settings.json diff --git a/.gitignore b/.gitignore index 6bc2d46..842e1e7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,11 @@ __pycache__ *.egg-info build +.vscode/ + # Experiment artifacts *.png *.pkl -*.csv \ No newline at end of file +*.csv +nohup.out +*.zip \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index bdfc6f1..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Python Debugger: Current File", - "type": "debugpy", - "request": "launch", - "program": "${file}", - "console": "integratedTerminal", - "justMyCode": false, - - } - ] -} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 544eb77..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "python.testing.pytestArgs": [ - "tests", - "-s" - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true -} \ No newline at end of file diff --git a/deathstar_movie_review/demo.py b/deathstar_movie_review/demo.py index 093308e..b6ef32d 100644 --- a/deathstar_movie_review/demo.py +++ b/deathstar_movie_review/demo.py @@ -2,7 +2,7 @@ import cascade from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination -from cascade.dataflow.optimization.parallelization import parallelize +from cascade.dataflow.optimization.parallelization import parallelize_until_if from cascade.runtime.flink_runtime import FlinkRuntime from tests.integration.flink.utils import create_topics, init_flink_runtime @@ -26,7 +26,7 @@ def main(): print(f"Creating dataflow [{EXPERIMENT}]") df_baseline = cascade.core.dataflows[DataflowRef("Frontend", "compose")] - df_parallel = parallelize(df_baseline) + df_parallel, _ = parallelize_until_if(df_baseline) df_parallel.name = "compose_parallel" cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel runtime.add_dataflow(df_parallel) diff --git a/deathstar_movie_review/start_benchmark.py b/deathstar_movie_review/start_benchmark.py index 
dbbf73f..0b577b2 100644 --- a/deathstar_movie_review/start_benchmark.py +++ b/deathstar_movie_review/start_benchmark.py @@ -192,7 +192,6 @@ def write_dict_to_pkl(futures_dict, filename): # Multiply flink_time by 1000 to convert to milliseconds df['flink_time'] = df['flink_time'] * 1000 - df.to_pickle(filename) return df def main(): @@ -219,11 +218,17 @@ def main(): init_client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) df_baseline = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + print(df_baseline.to_dot()) df_parallel, _ = parallelize_until_if(df_baseline) df_parallel.name = "compose_parallel" cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel print(cascade.core.dataflows.keys()) - + + for df in cascade.core.dataflows.values(): + print(df.to_dot()) + for block in df.blocks.values(): + print(block.function_string) + if not args.no_init: print("Populating...") populate_user(init_client) @@ -262,5 +267,50 @@ def main(): print(f"Median Flink time : {flink_time:.2f} ms ({flink_prct:.2f}%)") init_client.close() + df = preprocess(args.output, df) + df.to_pickle(args.output) + + +import re + +def preprocess(name, df, warmup_time_s=3) -> pd.DataFrame: + # Extract parallelism and mps from the name using regex + match = re.search(r'(.+)_p-(\d+)_rps-(\d+)', name) + if match: + experiment = match.group(1) + parallelism = int(match.group(2)) + mps = int(match.group(3)) + else: + raise Exception() + + # Ignore the first warmup_time seconds of events + warmup_events = int(warmup_time_s * mps) + df = df.iloc[warmup_events:] + + # Calculate the additional Kafka overhead + # df['kafka_overhead'] = df['latency'] - df['flink_time'] + + # Extract median values from df + flink_time_median = df['flink_time'].median() + latency_median = df['latency'].median() + flink_time_99_percentile = df['flink_time'].quantile(0.99) + latency_99_percentile = df['latency'].quantile(0.99) + flink_time_95_percentile = df['flink_time'].quantile(0.95) + latency_95_percentile = df['latency'].quantile(0.95) + + data = { + 'experiment': experiment, + 'parallelism': parallelism, + 'mps': mps, + 'flink_time_median': flink_time_median, + 'latency_median': latency_median, + 'latency_99_percentile': latency_99_percentile, + 'latency_95_percentile': latency_95_percentile, + 'flink_time_99_percentile': flink_time_99_percentile, + 'flink_time_95_percentile': flink_time_95_percentile + } + data = {k:[v] for k,v in data.items()} + return pd.DataFrame(data) + if __name__ == "__main__": main() \ No newline at end of file diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py index d3060ee..9898d90 100755 --- a/run_experiments_gil_workaround.py +++ b/run_experiments_gil_workaround.py @@ -2,40 +2,22 @@ import subprocess import time -args = { - "messages_per_burst": 10, - "sleeps_per_burst": 10, - "sleep_time": 0.09, - "seconds_per_burst": 1, - "seconds": 100 -} - -def mps(num, producer_threads=1): +def rps(num, producer_threads=1): return { "threads": producer_threads, "requests_per_second": num, - "seconds": 50, + "seconds": 100, } # Define experiment parameters as a list of dictionaries experiments = [ - # {"parallelism": 4, "benchmark_args": {**mps(20)}}, - # {"parallelism": 4, "benchmark_args": {**mps(40)}}, - # {"parallelism": 4, "benchmark_args": {**mps(60)}}, - # {"parallelism": 4, "benchmark_args": {**mps(80)}}, - # {"parallelism": 4, "benchmark_args": {**mps(100)}}, - - # {"parallelism": 24, "benchmark_args": {**mps(200)}}, - # {"parallelism": 24, "benchmark_args": 
{**mps(400)}}, - # {"parallelism": 24, "benchmark_args": {**mps(600)}}, - # {"parallelism": 24, "benchmark_args": {**mps(800)}}, - # {"parallelism": 24, "benchmark_args": {**mps(200, producer_threads=10)}}, - # {"parallelism": 24, "benchmark_args": {**mps(400, producer_threads=10)}}, - # {"parallelism": 24, "benchmark_args": {**mps(600, producer_threads=20)}}, - {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=20)}}, - # {"parallelism": 24, "benchmark_args": {**mps(2000, producer_threads=40)}}, - # {"parallelism": 24, "benchmark_args": {**mps(1000, threads=20)}}, + {"parallelism": 24, "benchmark_args": {**rps(500, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(1000, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(1500, producer_threads=20)}}, + {"parallelism": 24, "benchmark_args": {**rps(2000, producer_threads=20)}}, + {"parallelism": 24, "benchmark_args": {**rps(2500, producer_threads=20)}}, + {"parallelism": 24, "benchmark_args": {**rps(3000, producer_threads=20)}}, ] @@ -66,7 +48,7 @@ def mps(num, producer_threads=1): subprocess.run(flink_cmd, check=True, env=env) # Start benchmark - filename = f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['requests_per_second']}.pkl" + filename = f"{e}_p-{exp['parallelism']}_rps-{exp['benchmark_args']['requests_per_second']}.pkl" benchmark_cmd = [ "python", "-u", "-m", "deathstar_movie_review.start_benchmark", "--output", filename, "--experiment", e ] diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 4940eb6..4489629 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -91,7 +91,7 @@ def __hash__(self) -> int: @dataclass -class CallEntity(Node): +class CallRemote(Node): """A node in a `DataFlow` corresponding to the call of another dataflow""" dataflow: 'DataflowRef' """The dataflow to call.""" @@ -116,10 +116,10 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: di new_targets = df.entry # Tail call elimination: - # "targets" corresponds to where to go after this CallEntity finishes + # "targets" corresponds to where to go after this CallRemote finishes # the call to self.dataflow # - # If this CallEntity is a terminal node in event.dataflow, then we don't + # If this CallRemote is a terminal node in event.dataflow, then we don't # need to go back to event.dataflow, so we don't add it to the call stack. # This node is terminal in event.dataflow iff len(targets) == 0 if len(targets) > 0: @@ -185,7 +185,6 @@ class Edge(): """An Edge in the Dataflow graph.""" from_node: Node to_node: Node - variable_map: dict[str, Any] = field(default_factory=dict) if_conditional: Optional[bool] = None class DataFlow: @@ -238,14 +237,6 @@ def add_node(self, node: Node): def add_block(self, block: 'CompiledLocalBlock'): self.blocks[block.get_method_name()] = block - def copy(self) -> 'DataFlow': - copy = DataFlow(self.name, self.operator_name, self.args) - for edge in self.edges: - copy.add_edge(edge) - copy.entry = self.entry - return copy - - def add_edge(self, edge: Edge): """Add an edge to the Dataflow graph. 
Nodes that don't exist will be added to the graph automatically.""" @@ -281,19 +272,22 @@ def remove_edge(self, from_node: Node, to_node: Node): self.edges.pop(i) def remove_node(self, node: Node): + return self.remove_node_by_id(node.id) + + def remove_node_by_id(self, node_id: int): """Remove a node from the DataFlow graph and reconnect its parents to its children.""" - if node.id not in self.nodes: + if node_id not in self.nodes: return # Node doesn't exist in the graph # Find parents (nodes that have edges pointing to this node) - parents = [parent_id for parent_id, children in self.adjacency_list.items() if node.id in children] + parents = [parent_id for parent_id, children in self.adjacency_list.items() if node_id in children] # Find children (nodes that this node points to) - children = self.adjacency_list[node.id] + children = self.adjacency_list[node_id] # Set df entry - if len(self.entry) == 1 and self.entry[0] == node: + if len(self.entry) == 1 and self.entry[0].id == node_id: assert len(children) <= 1, "cannot remove entry node if it has more than two children" self.entry = [self.nodes[id] for id in children] @@ -308,16 +302,16 @@ def remove_node(self, node: Node): # Remove edges from parents to the node for parent_id in parents: parent_node = self.nodes[parent_id] - self.remove_edge(parent_node, node) + self.remove_edge(parent_node, self.nodes[node_id]) # Remove outgoing edges from the node for child_id in children: child_node = self.nodes[child_id] - self.remove_edge(node, child_node) + self.remove_edge(self.nodes[node_id], child_node) # Remove the node from the adjacency list and nodes dictionary - del self.adjacency_list[node.id] - del self.nodes[node.id] + del self.adjacency_list[node_id] + del self.nodes[node_id] def get_neighbors(self, node: Node) -> List[Node]: @@ -338,14 +332,12 @@ def to_dot(self) -> str: lines.append(f' {node.id} [label="{node}"];') # Add edges - for node in self.nodes.values(): - for edge in node.outgoing_edges: - - line = f" {edge.from_node.id} -> {edge.to_node.id}" - if edge.if_conditional is not None: - line += f' [label="{edge.if_conditional}"]' - line += ";" - lines.append(line) + for edge in self.edges: + line = f" {edge.from_node.id} -> {edge.to_node.id}" + if edge.if_conditional is not None: + line += f' [label="{edge.if_conditional}"]' + line += ";" + lines.append(line) lines.append("}") return "\n".join(lines) @@ -360,7 +352,7 @@ def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None # TODO: propogate at "compile time" instead of doing this every time local_events = [] for ev in events: - if isinstance(ev.target, CallEntity) or isinstance(ev.target, IfNode): + if isinstance(ev.target, CallRemote) or isinstance(ev.target, IfNode): local_events.extend(ev.propogate(None, cascade.core.dataflows)) else: local_events.append(ev) @@ -428,7 +420,7 @@ def propogate(self, result: Any, df_map: dict['DataflowRef','DataFlow']) -> Iter events = [] - if len(targets) == 0 and not isinstance(self.target, CallEntity): + if len(targets) == 0 and not isinstance(self.target, CallRemote): if len(self.call_stack) > 0: caller = self.call_stack.pop() @@ -460,8 +452,8 @@ def propogate(self, result: Any, df_map: dict['DataflowRef','DataFlow']) -> Iter events = current_node.propogate(self, targets, result, df_map) for event in events: - if isinstance(event.target, CallEntity) or isinstance(event.target, IfNode): - # recursively propogate CallEntity events + if isinstance(event.target, CallRemote) or isinstance(event.target, IfNode): + # 
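The `remove_node_by_id` change above splices a node out of the graph by reconnecting its parents to its children. The same idea on a bare adjacency dict, as a stripped-down sketch that ignores the real `Edge` objects and the dataflow entry bookkeeping:

```
def remove_and_reconnect(adjacency, node_id):
    """Remove node_id and connect each of its parents to each of its children."""
    children = adjacency.pop(node_id, [])
    for parent, kids in adjacency.items():
        if node_id in kids:
            kids.remove(node_id)
            # Reconnect the parent to the removed node's children.
            kids.extend(c for c in children if c not in kids)
    return adjacency

graph = {1: [2], 2: [3], 3: []}
print(remove_and_reconnect(graph, 2))   # {1: [3], 3: []}
```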
recursively propogate CallRemote events yield from event.propogate(None, df_map) else: yield event diff --git a/src/cascade/dataflow/optimization/parallelization.py b/src/cascade/dataflow/optimization/parallelization.py index 1ce9fa8..4a675bb 100644 --- a/src/cascade/dataflow/optimization/parallelization.py +++ b/src/cascade/dataflow/optimization/parallelization.py @@ -1,188 +1,7 @@ -""" -When is it safe to parallize nodes? - --> When they don't affect each other --> The simpelest way of doing it could be to run individual dataflows in parallel -(e.g. item.get_price() can run in parallel) --> must convey that we assume no side-affects, so the actual order of execution -does not matter. could go deeper and give a spec. --> some instructions from the same dataflow could also be completed in parallel? -maybe? like ILP. but might need to think of more contrived examples/do more -advanced program analyis. - -From Control Flow to Dataflow -3. Parallelizing Memory Operations -- operations on different memory locatiosn need not be sequentialized -- circulate a set of access tokens for each variable (=split function?) - - assume that every variable denotes a unique memory location (no aliasing) - -We have to be careful about certain types of parallelization. Consider the example: - -``` -# Calculate the average item price in basket: List[Item] -n = 0 -p = 0 -for item in basket: - n += 1 - p += item.price() -return p / n -``` - -In this example we would want to parallelize the calls to item.price(). -But we have to make sure the calls to `n += 1` remains bounded to the number of -items, even though there is no explicit data dependency. - - ----- - - -There is another type of optimization we could look at. -Suppose the following: - -``` -n = self.basket_size - -prices = [item.price() for item in self.basket] -total_price = sum(prices) - -return total_price / n -``` - -In this case, the variable n is not needed in the list comprehension - unoptimized -versions would generate an extra function instead of having the line be re-ordered -into the bottom function. Instead, analyis of the variables each function needs -access to would be a way to optimize these parts! - ---> Ask Soham about this! - -from "From control flow to dataflow" - -Consider the portion of control-flow graph between a node N and its *immediate -postdominator* P. Every control-flow path starting at N ultimately ends up at P. -Suppose that there is no reference to a variable x in any node on any path between -N and P. It is clear that an access token for x that enters N may bypass this -region of the graph altogether and go directly to P. - - ----- - -"Dataflow-Based Parallelization of Control-Flow Algorithms" - -loop invariant hoisting - -``` -i = 0 -while i < n: - x = y + z - a[i] = 6 * i + x * x - i += 1 -``` - -can be transformed in - -``` -i = 0 -if i < n: - x = y + z # loop invariant 1 - t1 = x * x # loop invariant 2 - do { # do while loop needed in case the conditional has side effects - a[i] = 6 * i + t1 - i += 1 - } while i < n -``` - -this is achieved using reaching definitions analysis. In the paper: -"It is a common optimization to pull those parts of a loop body -that depend on only static datasets outside of the loop, and thus -execute these parts only once [7 , 13 , 15 , 32 ]. However, launching -new dataflow jobs for every iteration step prevents this optimiza- -tion in the case of such binary operators where only one input is -static. 
For example, if a static dataset is used as the build-side of -a hash join, then the system should not rebuild the hash table at -every iteration step. Labyrinth operators can keep such a hash -table in their internal states between iteration steps. This is made -possible by implementing iterations as a single cyclic dataflow -job, where the lifetimes of operators span all the steps." -Is there a similair example we could leverage for cascade? one with a "static dataset" as loop invariant? -in spark, it's up to the programmer to .cache it - - -In this paper, they also use an intermediate representation of one "basic block" per node. -A "basic block" is a sequence of instructions that always execute one after the other, -in other words contains no control flow. Control flow is defined by the edges in the -dataflow graph that connect the nodes. - -There's also a slightly different focus of this paper. The focus is not on stateful -dataflows, and obviously the application is still focused on bigdata-like applications, -not ones were latency is key issue. - - -Basic Blocks - Aho, A. V., Sethi, R., and Ullman, J. D. Compilers: principles, techniques, and -tools, vol. 2. Addison-wesley Reading, 2007. -SSA - Rastello, F. SSA-based Compiler Design. Springer Publishing Company, -Incorporated, 2016. - - ----- - -ideas from "optimization of dataflows with UDFs:" - -we are basically making a DSL (integrated with python) which would allow for optimization -of UDFs!! this optimization is inside the intermediate representation, and not directly in -the target machine (similair to Emma, which uses a functional style *but* is a DSL (does it -allow for arbitrary scala code?)) - ---- - -our program is essentially a compiler. this allows to take inspiration from existing -works on compilation (which has existed for much longer than work on dataflows (?) - -actually, dataflows were more popular initially when people didn't settle on the von Neumann architecture yet, -see e.g. Monsoon (1990s) or the original control flow to dataflow paper. the popularisation and efficiency of tools -such as drayadlinq, apache spark, apache flink has reinvigorated the attention towards dataflows). -BUT compilers are often have hardware specific optimizations, based on the hardware instruction sets, or hardware-specifics -such as optimization of register allocation, cache line considerations etc etc. -The compiler in Cascade/other cf to df systems do not necessarily have the same considerations. This is because the backend -is software rather than hardware (e.g. we use flink + kafka). Since software is generally a lot more flexible than hardware, -we can instead impose certain considerations on the execution engine (which is now software, instead of a chip) rather than -the other way around (e.g. SIMD introduced --> compiler optimizations introduced). (to be fair, compiler design has had major influences [citation needed] on CPU design, but the point is that hardware iteration -is generally slower and more expensive than software iteration). - - ---- - -for certain optimizations, cascade assumes order of any side effects (such as file IO) does not matter. -otherwise a lot of parallelization operations would become much more costly due to the necessary synchronization issues. - ---- - -other optimization: code duplication - -this would remove nodes (assumption that less nodes = faster) at the cost of more computation per node. 
-a common example is something like this: - -``` -cost = item.price() -if cost > 30: - shipping_discount = discount_service.get_shipping_discount() - price = cost * shipping_discount -else: - price = cost - -return price -``` - -in this case the "return price" could be duplicated accross the two branches, -such that they don't need to return back to the function body. - ---- - -other ideas: - https://en.wikipedia.org/wiki/Optimizing_compiler#Specific_techniques -""" - +import copy from dataclasses import dataclass from typing import Any, Tuple -from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, DataFlow, Edge, IfNode, Node +from cascade.dataflow.dataflow import CallRemote, CallLocal, CollectNode, DataFlow, Edge, IfNode, Node import cascade @dataclass @@ -195,14 +14,14 @@ class AnnotatedNode: import networkx as nx def parallelize_until_if(df: DataFlow) -> Tuple[DataFlow, DataFlow]: """Parallelize df, stopping at the first if node. - The first dataflow is the parallelized dataflow up until the first if node. The second dataflow is the rest of the dataflow""" + The first dataflow returned is the parallelized dataflow up until the first if node. The second dataflow is the rest of the dataflow""" # create the dependency graph ans = [] # since we use SSA, every variable has exactly one node that writes it write_nodes = {} graph = nx.DiGraph() for node in df.nodes.values(): - if isinstance(node, CallEntity): + if isinstance(node, CallRemote): reads = set(node.variable_rename.values()) writes = {result} if (result := node.assign_result_to) else set() elif isinstance(node, CallLocal): @@ -221,7 +40,7 @@ def parallelize_until_if(df: DataFlow) -> Tuple[DataFlow, DataFlow]: graph.add_node(node.id) nodes_with_indegree_0 = set(graph.nodes) - n_map = df.nodes.copy() + n_map = copy.deepcopy(df.nodes) for node in ans: for read in node.reads: if read in write_nodes: @@ -238,7 +57,7 @@ def parallelize_until_if(df: DataFlow) -> Tuple[DataFlow, DataFlow]: updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0] prev_node = None - rest = df.copy() + rest = copy.deepcopy(df) while len(nodes_with_indegree_0) > 0: # remove nodes from graph diff --git a/src/cascade/frontend/generator/local_block.py b/src/cascade/frontend/generator/local_block.py index 2dd8575..69aea72 100644 --- a/src/cascade/frontend/generator/local_block.py +++ b/src/cascade/frontend/generator/local_block.py @@ -5,7 +5,7 @@ from cascade.frontend.cfg import Statement from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState from cascade.frontend.generator.unparser import unparse -from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef, InvokeMethod +from cascade.dataflow.dataflow import CallRemote, CallLocal, DataFlow, DataflowRef, InvokeMethod from klara.core.cfg import RawBasicBlock from klara.core import nodes @@ -14,7 +14,7 @@ from cascade.dataflow.operator import MethodCall, StatelessMethodCall -def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: dict[DataflowRef, DataFlow]) -> CallEntity: +def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: dict[DataflowRef, DataFlow]) -> CallRemote: """Transform a remote statement to an entity call.""" writes = statement.targets assert statement.is_remote() @@ -41,7 +41,7 @@ def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: di args.remove(operator_var) df_args = dataflows[dataflow].args - return CallEntity(dataflow, {a: b for a, b in zip(df_args, 
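The reworked `parallelize_until_if` earlier in this hunk derives its parallel entry set from a read/write dependency graph: since the frontend uses SSA, every variable has exactly one writing node, so each node depends only on the writers of the variables it reads, and the indegree-0 nodes can start in parallel. A small sketch of that analysis with invented node and variable names (networkx is already imported by this module):

```
import networkx as nx

# Per-node (reads, writes) sets; the names here are illustrative only.
reads_writes = {
    "upload_unique": (set(),                             {"review_id"}),
    "upload_user":   (set(),                             {"user_id"}),
    "upload_movie":  ({"rating_0"},                      {"cond"}),
    "collect":       ({"review_id", "user_id", "cond"},  set()),
}

# SSA: one writer per variable.
writer = {var: node for node, (_, writes) in reads_writes.items() for var in writes}

graph = nx.DiGraph()
graph.add_nodes_from(reads_writes)
for node, (reads, _) in reads_writes.items():
    for var in reads:
        if var in writer and writer[var] != node:
            graph.add_edge(writer[var], node)   # writer must run before reader

entry = [n for n, degree in graph.in_degree() if degree == 0]
print(entry)   # ['upload_unique', 'upload_user', 'upload_movie'] can run in parallel
```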
args, strict=True)}, assign_result_to=assign,keyby=key) + return CallRemote(dataflow, {a: b for a, b in zip(df_args, args, strict=True)}, assign_result_to=assign,keyby=key) class LocalBlock: diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index f8ee8b6..0f14bc0 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -2,7 +2,7 @@ from typing import List, Union import cascade from cascade.dataflow.operator import StatefulOperator, StatelessOperator -from cascade.dataflow.dataflow import CallEntity, CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod +from cascade.dataflow.dataflow import CallRemote, CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod from queue import Empty, Queue import time diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index 167df81..99cd82d 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -5,7 +5,7 @@ from klara.core.cfg import Cfg from klara.core import nodes -from cascade.dataflow.dataflow import CallEntity, CallLocal, DataFlow, DataflowRef +from cascade.dataflow.dataflow import CallRemote, CallLocal, DataFlow, DataflowRef from cascade.frontend.generator.dataflow_builder import DataflowBuilder from cascade.frontend.util import setup_cfg @@ -37,11 +37,11 @@ def get_total(item1: Stock, item2: Stock): assert len(df.nodes) == 3 assert len(df.entry) == 1 entry = df.entry[0] - assert isinstance(entry, CallEntity) + assert isinstance(entry, CallRemote) next = df.get_neighbors(entry) assert len(next) == 1 next = next[0] - assert isinstance(next, CallEntity) + assert isinstance(next, CallRemote) next = df.get_neighbors(next) assert len(next) == 1 next = next[0] From 96b2534898699b8f344bdb09af942a1e19828bdb Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Tue, 15 Apr 2025 12:46:03 +0200 Subject: [PATCH 34/37] Fixed outdated tests --- tests/integration/pyruntime/test_programs.py | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/integration/pyruntime/test_programs.py b/tests/integration/pyruntime/test_programs.py index 5d5691b..faa1ce1 100644 --- a/tests/integration/pyruntime/test_programs.py +++ b/tests/integration/pyruntime/test_programs.py @@ -20,25 +20,25 @@ def test_checkout_item(): event = item_init.generate_event({"item_name": "fork", "price": 10}, key="fork") result = client.send(event) - assert result.price == 10 - assert result.item_name == "fork" + assert result["price"] == 10 + assert result["item_name"] == "fork" event = item_init.generate_event({"item_name": "spoon", "price": 20}, key="spoon") result = client.send(event) - assert result.price == 20 - assert result.__key__() == "spoon" + assert result["price"] == 20 + assert result["item_name"] == "spoon" event = user_init.generate_event({"username": "test", "balance": 15}, key="test") user = client.send(event) - assert user.balance == 15 - assert user.__key__() == "test" + assert user["balance"] == 15 + assert user["username"] == "test" - event = user_buy_item.generate_event({"item_0": "fork"}, key=user.__key__()) + event = user_buy_item.generate_event({"item_0": "fork"}, key=user["username"] ) result = client.send(event) assert runtime.statefuloperators["User"].states["test"]["balance"] == 5 assert result - event = user_buy_item.generate_event({"item_0": "spoon"}, 
key=user.__key__()) + event = user_buy_item.generate_event({"item_0": "spoon"}, key=user["username"] ) result = client.send(event) assert runtime.statefuloperators["User"].states["test"]["balance"] == -15 assert not result @@ -59,15 +59,15 @@ def test_operator_chaining(): event = a_init.generate_event({"key": "aaa"}, key="aaa") result = client.send(event) - assert result.key == "aaa" + assert result["key"] == "aaa" event = b_init.generate_event({"key": "bbb"}, key="bbb") result = client.send(event) - assert result.key == "bbb" + assert result["key"] == "bbb" event = c_init.generate_event({"key": "ccc"}, key="ccc") result = client.send(event) - assert result.key == "ccc" + assert result["key"] == "ccc" event = c_get.generate_event({"y_0": 0}, key="ccc") result = client.send(event) From 7ac5e40512db0c20a96fcf089a89e8ec7e9cfc91 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Wed, 16 Apr 2025 14:04:55 +0200 Subject: [PATCH 35/37] Move collect operator to chained --- run_experiments_gil_workaround.py | 6 +- .../dataflow/optimization/parallelization.py | 2 + src/cascade/runtime/flink_runtime.py | 66 +++++++++---------- .../flink/test_collect_operator.py | 2 +- 4 files changed, 39 insertions(+), 37 deletions(-) diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround.py index 9898d90..4ab412f 100755 --- a/run_experiments_gil_workaround.py +++ b/run_experiments_gil_workaround.py @@ -17,7 +17,11 @@ def rps(num, producer_threads=1): {"parallelism": 24, "benchmark_args": {**rps(1500, producer_threads=20)}}, {"parallelism": 24, "benchmark_args": {**rps(2000, producer_threads=20)}}, {"parallelism": 24, "benchmark_args": {**rps(2500, producer_threads=20)}}, - {"parallelism": 24, "benchmark_args": {**rps(3000, producer_threads=20)}}, + {"parallelism": 24, "benchmark_args": {**rps(250, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(750, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(1250, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(1750, producer_threads=25)}}, + {"parallelism": 24, "benchmark_args": {**rps(2250, producer_threads=25)}}, ] diff --git a/src/cascade/dataflow/optimization/parallelization.py b/src/cascade/dataflow/optimization/parallelization.py index 4a675bb..ca126e4 100644 --- a/src/cascade/dataflow/optimization/parallelization.py +++ b/src/cascade/dataflow/optimization/parallelization.py @@ -39,6 +39,8 @@ def parallelize_until_if(df: DataFlow) -> Tuple[DataFlow, DataFlow]: ans.append(AnnotatedNode(node, reads, writes)) graph.add_node(node.id) + # Add the edges in the dependency graph + # & generate the set of indegree 0 nodes nodes_with_indegree_0 = set(graph.nodes) n_map = copy.deepcopy(df.nodes) for node in ans: diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index e3e2e4b..86da1a1 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -52,10 +52,9 @@ def propogate(self, event: Event, targets: list[Node], result: Any, **kwargs) -> class FanOutOperator(ProcessFunction): """""" - def __init__(self, stateful_ops: dict[str, OutputTag], stateless_ops: dict[str, OutputTag], collect_tag: OutputTag) -> None: + def __init__(self, stateful_ops: dict[str, OutputTag], stateless_ops: dict[str, OutputTag]) -> None: self.stateful_ops = stateful_ops self.stateless_ops = stateless_ops - self.collect_tag = collect_tag def process_element(self, event: Event, ctx: 
ProcessFunction.Context): event = profile_event(event, "FanOut") @@ -68,9 +67,6 @@ def process_element(self, event: Event, ctx: ProcessFunction.Context): else: tag = self.stateless_ops[event.dataflow.operator_name] - elif isinstance(event.target, CollectNode): - tag = self.collect_tag - else: logger.error(f"FanOut: Wrong target: {event}") return @@ -79,9 +75,17 @@ def process_element(self, event: Event, ctx: ProcessFunction.Context): yield tag, event class RouterOperator(ProcessFunction): - """""" - def __init__(self, dataflows: dict['DataflowRef', 'DataFlow']) -> None: + """Takes in an Event and Result as tuple. Calls Event.propogate on the event. + + The main output contains Events to be reingested into the system. + There are two side outputs: + - one for Events with a CollectNode target + - one for EventResults + """ + def __init__(self, dataflows: dict['DataflowRef', 'DataFlow'], collect_tag: OutputTag, out_tag: OutputTag) -> None: self.dataflows = dataflows + self.collect_tag = collect_tag + self.out_tag = out_tag def process_element(self, event_result: tuple[Event, Any], ctx: ProcessFunction.Context): event, result = event_result @@ -96,23 +100,15 @@ def process_element(self, event_result: tuple[Event, Any], ctx: ProcessFunction. else: logger.debug(f"RouterOperator: Propogated {len(new_events)} new Events") - yield from new_events - -def router_flat_map(event_result: tuple[Event, Any], dataflows: dict['DataflowRef', 'DataFlow']): - event, result = event_result - event = profile_event(event, "Router") - - # logger.debug(f"RouterOperator Event entered: {event._id}") - - new_events = list(event.propogate(result, dataflows)) - - # if len(new_events) == 1 and isinstance(new_events[0], EventResult): - # logger.debug(f"RouterOperator: Returned {new_events[0]}") - # else: - # logger.debug(f"RouterOperator: Propogated {len(new_events)} new Events") - - return new_events - + for event in new_events: + if isinstance(event, Event): + if isinstance(event.target, CollectNode): + yield self.collect_tag, event + else: + yield event + else: + assert isinstance(event, EventResult) + yield self.out_tag, event class FlinkOperator(KeyedProcessFunction): """Wraps an `cascade.dataflow.datflow.StatefulOperator` in a KeyedProcessFunction so that it can run in Flink. 
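Taken together, the `FanOutOperator` and `RouterOperator` changes above settle on one routing pattern: a `ProcessFunction` yields `(OutputTag, value)` pairs for side outputs and bare values for the main output. A condensed sketch of the router's decision, using stand-in classes rather than the real `Event`/`EventResult`/`CollectNode` types:

```
from pyflink.datastream import OutputTag
from pyflink.datastream.functions import ProcessFunction

class StubCollectNode: ...
class StubEvent:
    def __init__(self, target): self.target = target
class StubEventResult: ...

class RoutingSketch(ProcessFunction):
    """Main output: events to reingest; side outputs: collect events and results."""
    def __init__(self):
        self.collect_tag = OutputTag("__COLLECT__")
        self.result_tag = OutputTag("__EVENT_RESULT__")

    def process_element(self, item, ctx):
        if isinstance(item, StubEventResult):
            yield self.result_tag, item      # finished dataflow, leaves the loop
        elif isinstance(item, StubEvent) and isinstance(item.target, StubCollectNode):
            yield self.collect_tag, item     # needs the collect operator first
        else:
            yield item                       # back into the operator fan-out
```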
@@ -606,9 +602,10 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka stateful_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateful_operators} stateless_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateless_operators} collect_tag = OutputTag("__COLLECT__") + result_tag = OutputTag("__EVENT_RESULT__") logger.debug(f"Stateful tags: {stateful_tags.items()}") logger.debug(f"Stateless tags: {stateless_tags.items()}") - fanout = self.event_stream.process(FanOutOperator(stateful_tags, stateless_tags, collect_tag)).name("FANOUT OPERATOR")#.disable_chaining() + fanout = self.event_stream.process(FanOutOperator(stateful_tags, stateless_tags)).name("FANOUT OPERATOR")#.disable_chaining() # create the streams self.stateful_op_streams = [] @@ -620,7 +617,6 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka .key_by(lambda e: e.key) .process(flink_op) .name("STATEFUL OP: " + flink_op.operator.name()) - # .process(RouterOperator(self.dataflows)).name("ROUTER") ) self.stateful_op_streams.append(op_stream) @@ -632,7 +628,6 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka .get_side_output(tag) .process(flink_op) .name("STATELESS OP: " + flink_op.operator.name()) - # .process(RouterOperator(self.dataflows)).name("ROUTER") ) self.stateless_op_streams.append(op_stream) @@ -648,13 +643,16 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka else: raise RuntimeError("No operators found, were they added to the flink runtime with .add_*_operator()") + + op_routed = operator_streams.process(RouterOperator(self.dataflows, collect_tag, result_tag)).name("ROUTER (OP)") + collect_stream = ( - fanout + op_routed .get_side_output(collect_tag) .key_by(lambda e: e._id) # might not work in the future if we have multiple merges in one dataflow? 
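The worry raised in the comment above (two merges inside one dataflow colliding on the same event id) is what a later keying change in this series addresses: the collect stream is keyed by event id plus the collect node's id, so independent merges get independent state buckets. A small sketch of the composite key with stand-in objects:

```
# Stand-ins for the runtime's Event and CollectNode types.
class Node:
    def __init__(self, id): self.id = id

class Event:
    def __init__(self, _id, target): self._id, self.target = _id, target

def collect_key(e: Event) -> str:
    # One state bucket per (dataflow invocation, collect node) pair.
    return f"{e._id}_{e.target.id}"

first_merge = Event(_id=42, target=Node(id=7))
second_merge = Event(_id=42, target=Node(id=9))
print(collect_key(first_merge), collect_key(second_merge))   # 42_7 42_9
```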
.process(FlinkCollectOperator()) .name("Collect") - # .process(RouterOperator(self.dataflows)).name("ROUTER") + .process(RouterOperator(self.dataflows, collect_tag, result_tag)) ) """Stream that ingests events with an `cascade.dataflow.dataflow.CollectNode` target""" @@ -662,14 +660,12 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka # broadcast_dataflows = self.env.broadcast_variable("dataflows", list(self.dataflows.items())) # union with EventResults or Events that don't have a CollectNode target - ds = collect_stream.union(operator_streams)#.flat_map(lambda x: router_flat_map(x, {u: v for u, v in self.dataflows.items()})) - ds = ds.process(RouterOperator(self.dataflows)).name("ROUTER") # Output the stream results = ( - ds - .filter(lambda e: isinstance(e, EventResult)) + op_routed.get_side_output(result_tag).union(collect_stream.get_side_output(result_tag)) + # .filter(lambda e: isinstance(e, EventResult)) # .map(lambda e: profile_event(e, "EXTERNAL SINK")) .map(lambda e: timestamp_result(e)) ) @@ -683,8 +679,8 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka raise ValueError(f"Invalid output: {output}") ds_internal = ( - ds - .filter(lambda e: isinstance(e, Event)) + op_routed.union(collect_stream) + # .filter(lambda e: isinstance(e, Event)) # .map(lambda e: profile_event(e, "INTERNAL SINK")) .map(lambda e: timestamp_event(e)) .sink_to(self.kafka_internal_sink) diff --git a/tests/integration/flink/test_collect_operator.py b/tests/integration/flink/test_collect_operator.py index 54a29d3..b29d426 100644 --- a/tests/integration/flink/test_collect_operator.py +++ b/tests/integration/flink/test_collect_operator.py @@ -25,7 +25,7 @@ def test_collect_operator(): user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] - df_parallel = parallelize(user_buy_2) + df_parallel, _ = parallelize_until_if(user_buy_2) df_parallel.name = "buy_2_parallel" cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] = df_parallel print(df_parallel.to_dot()) From 27d8b15f4c5971c378b47b8db76b75aeea15c774 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Fri, 18 Apr 2025 16:30:46 +0200 Subject: [PATCH 36/37] Add prefetch experiment --- deathstar_movie_review/demo.py | 10 +- .../entities/compose_review.py | 59 ---- deathstar_movie_review/entities/entities.py | 24 +- deathstar_movie_review/entities/frontend.py | 139 -------- deathstar_movie_review/entities/movie.py | 72 ----- deathstar_movie_review/entities/text.py | 30 -- deathstar_movie_review/entities/unique_id.py | 32 -- deathstar_movie_review/entities/user.py | 36 --- deathstar_movie_review/start_benchmark.py | 3 +- .../start_prefetch_experiment.py | 300 ++++++++++++++++++ .../test_movie_review_demo.py | 58 +++- ... 
=> run_experiments_gil_workaround copy.py | 0 run_prefetch_exp.py | 68 ++++ src/cascade/dataflow/dataflow.py | 20 +- src/cascade/runtime/flink_runtime.py | 8 +- 15 files changed, 478 insertions(+), 381 deletions(-) delete mode 100644 deathstar_movie_review/entities/compose_review.py delete mode 100644 deathstar_movie_review/entities/frontend.py delete mode 100644 deathstar_movie_review/entities/movie.py delete mode 100644 deathstar_movie_review/entities/text.py delete mode 100644 deathstar_movie_review/entities/unique_id.py delete mode 100644 deathstar_movie_review/entities/user.py create mode 100644 deathstar_movie_review/start_prefetch_experiment.py rename run_experiments_gil_workaround.py => run_experiments_gil_workaround copy.py (100%) create mode 100755 run_prefetch_exp.py diff --git a/deathstar_movie_review/demo.py b/deathstar_movie_review/demo.py index b6ef32d..6cadb8e 100644 --- a/deathstar_movie_review/demo.py +++ b/deathstar_movie_review/demo.py @@ -15,7 +15,7 @@ OUT_TOPIC = "ds-movie-out" INTERNAL_TOPIC = "ds-movie-internal" -EXPERIMENT: Literal["baseline", "pipelined", "parallel"] = os.getenv("EXPERIMENT", "baseline") +EXPERIMENT: Literal["baseline", "parallel"] = os.getenv("EXPERIMENT", "baseline") def main(): @@ -25,12 +25,20 @@ def main(): print(f"Creating dataflow [{EXPERIMENT}]") + # for parallel experiment df_baseline = cascade.core.dataflows[DataflowRef("Frontend", "compose")] df_parallel, _ = parallelize_until_if(df_baseline) df_parallel.name = "compose_parallel" cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel runtime.add_dataflow(df_parallel) + # for prefetch experiment + df_baseline = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch")] + df_parallel, _ = parallelize_until_if(df_baseline) + df_parallel.name = "upload_movie_prefetch_parallel" + cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")] = df_parallel + runtime.add_dataflow(df_parallel) + print(cascade.core.dataflows.keys()) runtime.run() diff --git a/deathstar_movie_review/entities/compose_review.py b/deathstar_movie_review/entities/compose_review.py deleted file mode 100644 index 853e34b..0000000 --- a/deathstar_movie_review/entities/compose_review.py +++ /dev/null @@ -1,59 +0,0 @@ -from typing import Any - -from cascade.dataflow.operator import StatefulOperator - - -class ComposeReview: - def __init__(self, req_id: str, *args): # *args is a temporary hack to allow for creation of composereview on the fly - self.req_id = req_id - self.review_data = {} - - def upload_unique_id(self, review_id: int): - self.review_data["review_id"] = review_id - - # could use the User class instead? 
- def upload_user_id(self, user_id: str): - self.review_data["userId"] = user_id - - def upload_movie_id(self, movie_id: str): - self.review_data["movieId"] = movie_id - - def upload_rating(self, rating: int): - self.review_data["rating"] = rating - - def upload_text(self, text: str): - self.review_data["text"] = text - - def get_data(self): - return self.review_data - -def upload_unique_id_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["review_id"] = variable_map["review_id"] - -def upload_user_id_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["userId"] = variable_map["user_id"] - -def upload_movie_id_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["movieId"] = variable_map["movie_id"] - -def upload_rating_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["rating"] = variable_map["rating"] - -def upload_text_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["text"] = variable_map["text"] - -def get_data_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - return state.review_data - -compose_review_op = StatefulOperator( - ComposeReview, - { - "upload_unique_id": upload_unique_id_compiled, - "upload_user_id": upload_user_id_compiled, - "upload_movie_id": upload_movie_id_compiled, - "upload_rating": upload_rating_compiled, - "upload_text": upload_text_compiled, - "get_data": get_data_compiled, - }, - {} -) \ No newline at end of file diff --git a/deathstar_movie_review/entities/entities.py b/deathstar_movie_review/entities/entities.py index 3079dc6..7fa30ae 100644 --- a/deathstar_movie_review/entities/entities.py +++ b/deathstar_movie_review/entities/entities.py @@ -45,12 +45,32 @@ def __init__(self, title: str, movie_id: str): self.movie_id = movie_id def upload_movie(self, review: ComposeReview, rating: int): - cond = self.movie_id is not None + cond = rating is not None if cond: + review.upload_rating(rating) movie_id = self.movie_id review.upload_movie_id(movie_id) + return True else: - review.upload_rating(rating) + movie_id = self.movie_id + review.upload_movie_id(movie_id) + return False + + # if without else isn't invented yet, otherwise this would be + # cond = rating is not None + # if cond: + # review.upload_rating(rating) + # movie_id = self.movie_id + # review.upload_movie_id(movie_id) + + def upload_movie_prefetch(self, review: ComposeReview, rating: int): + cond = rating is not None + movie_id = self.movie_id + + review.upload_rating(rating) + review.upload_movie_id(movie_id) + return cond + @cascade class Frontend(): diff --git a/deathstar_movie_review/entities/frontend.py b/deathstar_movie_review/entities/frontend.py deleted file mode 100644 index 62c4e1e..0000000 --- a/deathstar_movie_review/entities/frontend.py +++ /dev/null @@ -1,139 +0,0 @@ -from typing import Any - -from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, Edge, InvokeMethod, OpNode, StatelessOpNode -from cascade.dataflow.operator import StatelessOperator -from deathstar_movie_review.entities.compose_review import ComposeReview -from deathstar_movie_review.entities.movie import MovieId -from deathstar_movie_review.entities.unique_id import UniqueId, unique_id_op -from deathstar_movie_review.entities.user import User -from deathstar_movie_review.entities.text import Text, text_op - -CHAR_LIMIT = 50 - -# frontend is made stateless -class Frontend(): - @staticmethod - 
def compose(review: ComposeReview, user: User, title: MovieId, rating: int, text: str): - UniqueId.upload_unique_id_2(review) - user.upload_user(review) - title.upload_movie(review, rating) - # text = text[:CHAR_LIMIT] # an operation like this could be reorderd for better efficiency! - Text.upload_text_2(review, text) - - - -###### COMPILED FUNCTIONS ###### - -def compose_compiled_0(variable_map: dict[str, Any]): - pass - - -frontend_op = StatelessOperator( - Frontend, - { - "empty": compose_compiled_0, - }, - {} -) - -def frontend_df_serial(): - # This dataflow calls many other dataflows. - # It could be more useful to have a "Dataflow" node - df = DataFlow("compose") - n0 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - # Upload Unique DF - n1_a = StatelessOpNode(unique_id_op, InvokeMethod("upload_unique")) - n1_b = OpNode(ComposeReview, InvokeMethod("upload_unique_id"), read_key_from="review") - - n2 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - # Upload User DF - n3_a = OpNode(User, InvokeMethod("upload_user_compiled_0"), read_key_from="user") - n3_b = OpNode(ComposeReview, InvokeMethod("upload_user_id"), read_key_from="review") - - n4 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - # Upload Movie DF - n5_a = OpNode(MovieId, InvokeMethod("upload_movie_cond"), read_key_from="title", is_conditional=True) - n5_b = OpNode(ComposeReview, InvokeMethod("upload_movie_id"), read_key_from="review") - n5_c = OpNode(ComposeReview, InvokeMethod("upload_rating"), read_key_from="review") - - n6 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - # Upload Text DF - n7a = StatelessOpNode(text_op, InvokeMethod("upload_text_2")) - n7 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review") - - n8 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - df.add_edge(Edge(n0, n1_a)) - df.add_edge(Edge(n1_a, n1_b)) - df.add_edge(Edge(n1_b, n2)) - - df.add_edge(Edge(n2, n3_a)) - df.add_edge(Edge(n3_a, n3_b)) - df.add_edge(Edge(n3_b, n4)) - - df.add_edge(Edge(n4, n5_a)) - df.add_edge(Edge(n5_a, n5_b, if_conditional=True)) - df.add_edge(Edge(n5_a, n5_c, if_conditional=False)) - df.add_edge(Edge(n5_b, n6)) - df.add_edge(Edge(n5_c, n6)) - - df.add_edge(Edge(n6, n7a)) - df.add_edge(Edge(n7a, n7)) - df.add_edge(Edge(n7, n8)) - - df.entry = [n0] - return df - -def frontend_df_parallel(): - # This dataflow calls many other dataflows. 
- # It could be more useful to have a "Dataflow" node - df = DataFlow("compose") - # n0 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - ct = CollectNode() - - # Upload Unique DF - n1_a = StatelessOpNode(unique_id_op, InvokeMethod("upload_unique")) - n1_b = OpNode(ComposeReview, InvokeMethod("upload_unique_id"), read_key_from="review", collect_target=CollectTarget(ct, 4, 0)) - - - # Upload User DF - n3_a = OpNode(User, InvokeMethod("upload_user_compiled_0"), read_key_from="user") - n3_b = OpNode(ComposeReview, InvokeMethod("upload_user_id"), read_key_from="review", collect_target=CollectTarget(ct, 4, 1)) - - - # Upload Movie DF - n5_a = OpNode(MovieId, InvokeMethod("upload_movie_cond"), read_key_from="title", is_conditional=True) - n5_b = OpNode(ComposeReview, InvokeMethod("upload_movie_id"), read_key_from="review", collect_target=CollectTarget(ct, 4, 2)) - n5_c = OpNode(ComposeReview, InvokeMethod("upload_rating"), read_key_from="review", collect_target=CollectTarget(ct, 4, 2)) - - - # Upload Text DF - n7 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review",collect_target=CollectTarget(ct, 4, 3)) - - - # df.add_edge(Edge(n0, n1_a)) - df.add_edge(Edge(n1_a, n1_b)) - df.add_edge(Edge(n1_b, ct)) - - # df.add_edge(Edge(n0, n3_a)) - df.add_edge(Edge(n3_a, n3_b)) - df.add_edge(Edge(n3_b, ct)) - - # df.add_edge(Edge(n0, n5_a)) - df.add_edge(Edge(n5_a, n5_b, if_conditional=True)) - df.add_edge(Edge(n5_a, n5_c, if_conditional=False)) - df.add_edge(Edge(n5_b, ct)) - df.add_edge(Edge(n5_c, ct)) - - # df.add_edge(Edge(n0, n7)) - df.add_edge(Edge(n7, ct)) - - df.entry = [n1_a, n3_a, n5_a, n7] - return df - -frontend_op.dataflows["compose"] = frontend_df_parallel() - diff --git a/deathstar_movie_review/entities/movie.py b/deathstar_movie_review/entities/movie.py deleted file mode 100644 index ade4d19..0000000 --- a/deathstar_movie_review/entities/movie.py +++ /dev/null @@ -1,72 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode -from cascade.dataflow.operator import StatefulOperator -from deathstar_movie_review.entities.compose_review import ComposeReview -from deathstar_movie_review.entities.user import User - - -class MovieId: - # key: 'title' - def __init__(self, title: str, movie_id: str): - self.title = title - self.movie_id = movie_id - - def upload_movie(self, review: ComposeReview, rating: int): - if self.movie_id is not None: - review.upload_movie_id(self.movie_id) - else: - review.upload_rating(rating) - - -def upload_movie_compiled_cond_0(variable_map: dict[str, Any], state: MovieId) -> Any: - variable_map["movie_id"] = state.movie_id # SSA - return variable_map["movie_id"] is not None - -movie_id_op = StatefulOperator( - MovieId, - { - "upload_movie_cond": upload_movie_compiled_cond_0 - }, - {} -) - -def upload_movie_df(): - df = DataFlow("movieId_upload_movie") - n0 = OpNode(MovieId, InvokeMethod("upload_movie_cond"), read_key_from="title", is_conditional=True) - n1 = OpNode(ComposeReview, InvokeMethod("upload_movie_id"), read_key_from="review") - n2 = OpNode(ComposeReview, InvokeMethod("upload_rating"), read_key_from="review") - - df.add_edge(Edge(n0, n1, if_conditional=True)) - df.add_edge(Edge(n0, n2, if_conditional=False)) - df.entry = n0 - return df - -movie_id_op.dataflows["upload_movie"] = upload_movie_df() - - - -### Other movie-related operators - -# key: movie_id - -class Plot: - def __init__(self, movie_id: str, plot: str): - self.movie_id = movie_id - self.plot = plot - -class MovieInfo: - 
def __init__(self, movie_id: str, info: dict): - self.movie_id = movie_id - self.info = info - -movie_info_op = StatefulOperator( - MovieInfo, - {}, - {} -) - -plot_op = StatefulOperator( - Plot, - {}, - {} -) \ No newline at end of file diff --git a/deathstar_movie_review/entities/text.py b/deathstar_movie_review/entities/text.py deleted file mode 100644 index ebccf44..0000000 --- a/deathstar_movie_review/entities/text.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode, StatelessOpNode -from cascade.dataflow.operator import StatelessOperator -from deathstar_movie_review.entities.compose_review import ComposeReview - -class Text(): - @staticmethod - def upload_text_2(review: ComposeReview, text: str): - review.upload_text(text) - - -###### COMPILED FUNCTIONS ###### - -def upload_text_2_compiled_0(variable_map: dict[str, Any]): - pass - -text_op = StatelessOperator( - Text, - { - "upload_text_2": upload_text_2_compiled_0 - }, - {} -) - -df = DataFlow("upload_text") -n0 = StatelessOpNode(text_op, InvokeMethod("upload_text_2")) -n1 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review") -df.add_edge(Edge(n0, n1)) -df.entry = [n0] -text_op.dataflows[df.name] = df \ No newline at end of file diff --git a/deathstar_movie_review/entities/unique_id.py b/deathstar_movie_review/entities/unique_id.py deleted file mode 100644 index 7972857..0000000 --- a/deathstar_movie_review/entities/unique_id.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Any -import uuid -from cascade.dataflow.dataflow import DataFlow, InvokeMethod, OpNode, StatelessOpNode -from cascade.dataflow.operator import Block, StatelessOperator -from deathstar_movie_review.entities.compose_review import ComposeReview - -class UniqueId(): - @staticmethod - def upload_unique_id_2(review: ComposeReview): - review_id = uuid.uuid1().int >> 64 - review.upload_unique_id(review_id) - - - -###### COMPILED FUNCTIONS ###### - -def upload_unique_compiled_0(variable_map: dict[str, Any]): - variable_map["review_id"] = uuid.uuid1().int >> 64 - -unique_id_op = StatelessOperator( - UniqueId, - { - "upload_unique": Block(name="upload_unique", function_call=upload_unique_compiled_0, var_map_writes=["review_id"], var_map_reads=[]), - }, - {} -) - -df = DataFlow("upload_unique_id") -n0 = StatelessOpNode(unique_id_op, InvokeMethod("upload_unique")) -n1 = OpNode(ComposeReview, InvokeMethod("upload_unique_id"), read_key_from="review") -df.entry = [n0] -unique_id_op.dataflows[df.name] = df diff --git a/deathstar_movie_review/entities/user.py b/deathstar_movie_review/entities/user.py deleted file mode 100644 index c73511c..0000000 --- a/deathstar_movie_review/entities/user.py +++ /dev/null @@ -1,36 +0,0 @@ -from typing import Any -from deathstar_movie_review.entities.compose_review import ComposeReview -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode -from cascade.dataflow.operator import Block, StatefulOperator - - -class User: - def __init__(self, username: str, user_data: dict): - self.username = username - self.user_data = user_data - - def upload_user(self, review: ComposeReview): - review.upload_user_id(self.user_data["userId"]) - - -def upload_user_compiled_0(variable_map: dict[str, Any], state: User) -> Any: - variable_map["user_id"] = state.user_data["userId"] - -user_op = StatefulOperator( - User, - { - "upload_user_compiled_0": upload_user_compiled_0, - }, - {} -) - -def upload_df(): - df = DataFlow("user_upload_user") 
- n0 = OpNode(User, InvokeMethod("upload_user_compiled_0"), read_key_from="username") - n1 = OpNode(ComposeReview, InvokeMethod("upload_user_id"), read_key_from="review") - - df.add_edge(Edge(n0, n1)) - df.entry = n0 - return df - -user_op.dataflows["upload_user"] = upload_df() \ No newline at end of file diff --git a/deathstar_movie_review/start_benchmark.py b/deathstar_movie_review/start_benchmark.py index 0b577b2..270c9e2 100644 --- a/deathstar_movie_review/start_benchmark.py +++ b/deathstar_movie_review/start_benchmark.py @@ -83,7 +83,8 @@ def compose_review(req_id, parallel=False): username = f"username_{user_index}" password = f"password_{user_index}" title = random.choice(movie_titles) - rating = random.randint(0, 10) + rating = None + # rating = random.randint(0, 10) text = ''.join(random.choice(charset) for _ in range(256)) if parallel: diff --git a/deathstar_movie_review/start_prefetch_experiment.py b/deathstar_movie_review/start_prefetch_experiment.py new file mode 100644 index 0000000..f932989 --- /dev/null +++ b/deathstar_movie_review/start_prefetch_experiment.py @@ -0,0 +1,300 @@ +from collections import Counter +import hashlib +from multiprocessing import Pool +import time +from typing import Literal, Optional +import uuid +import pandas as pd +import random + + +from .movie_data import movie_data +from .workload_data import movie_titles, charset +import sys +import os +from timeit import default_timer as timer +import argparse + +# import cascade +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) + +from tests.integration.flink.utils import init_cascade_from_module, init_flink_runtime +import cascade +from cascade.dataflow.optimization.parallelization import parallelize_until_if +from cascade.dataflow.dataflow import DataflowRef,EventResult +from cascade.runtime.flink_runtime import FlinkClientSync + +IN_TOPIC = "ds-movie-in" +OUT_TOPIC = "ds-movie-out" +# threads = 1 +# messages_per_burst = 10 +# sleeps_per_burst = 10 +# sleep_time = 0.08 +# seconds_per_burst = 1 +# bursts = 100 + +def populate_compose_review(client: FlinkClientSync): + cr_init = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")] + for i in range(1000): + event = cr_init.generate_event({"req_id": str(i)}, key=str(i)) + client.send(event) + + +def populate_movie(client: FlinkClientSync): + movieinfo_init = cascade.core.dataflows[DataflowRef("MovieInfo", "__init__")] + plot_init = cascade.core.dataflows[DataflowRef("Plot", "__init__")] + movieid_init = cascade.core.dataflows[DataflowRef("MovieId", "__init__")] + + for movie in movie_data: + movie_id = movie["MovieId"] + + # movie info -> write `movie` + event = movieinfo_init.generate_event({"movie_id": movie_id, "info": movie}, key=movie_id) + client.send(event) + + # plot -> write "plot" + event = plot_init.generate_event({"movie_id": movie_id, "plot": "plot"}, key=movie_id) + client.send(event) + + # movie_id_op -> register movie id + event = movieid_init.generate_event({"title": movie["Title"], "movie_id": movie_id}, key=movie["Title"]) + client.send(event) + + +def upload_movie(rating_chance: float, prefetch=False): + assert 0 <= rating_chance <= 1 + + if random.random() < rating_chance: + rating = random.randint(0, 10) + else: + rating = None + title = random.choice(movie_titles) + req_id = random.randint(0, 999) + + if prefetch: + movie_id = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")] + else: + movie_id = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie")] 
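The generator above toggles between the baseline `upload_movie` dataflow and the prefetch variant, and the difference between the two entity methods (see the entities.py hunk earlier in this patch) is that the prefetch version hoists the state reads and remote calls above the branch and returns only the condition. Roughly, as a plain-Python paraphrase rather than the compiled dataflow form:

```
class MovieIdSketch:
    """Illustrative paraphrase of the two upload variants from entities.py."""
    def __init__(self, movie_id): self.movie_id = movie_id

    def upload_movie(self, review, rating):
        if rating is not None:                      # branch decided first...
            review.upload_rating(rating)
            review.upload_movie_id(self.movie_id)   # ...then the remote calls
            return True
        review.upload_movie_id(self.movie_id)
        return False

    def upload_movie_prefetch(self, review, rating):
        # Prefetch: issue the remote calls unconditionally (even a None rating,
        # as in the patched entity), then report which branch was taken.
        review.upload_rating(rating)
        review.upload_movie_id(self.movie_id)
        return rating is not None
```

The `rating_chance` knob in the benchmark then controls how often each branch would have been taken, which is what the experiment varies.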
+ + return movie_id.generate_event({ + "review_0": str(req_id), + "rating_0": rating + }, key=title) + +def deathstar_workload_generator(rating_chance: float, prefetch=False): + c = 1 + while True: + yield upload_movie(rating_chance, prefetch) + c += 1 + + +def benchmark_runner(args) -> dict[int, dict]: + proc_num, requests_per_second, sleep_time, bursts, prefetch, rating_chance = args + print(f'Generator: {proc_num} starting') + client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) + deathstar_generator = deathstar_workload_generator(rating_chance, prefetch) + start = timer() + + for b in range(bursts): + sec_start = timer() + + # send burst of messages + for i in range(requests_per_second): + + # sleep sometimes between messages + # if i % (messages_per_burst // sleeps_per_burst) == 0: + time.sleep(sleep_time) + event = next(deathstar_generator) + client.send(event) + + client.flush() + sec_end = timer() + + # wait out the second + lps = sec_end - sec_start + if lps < 1: + time.sleep(1 - lps) + sec_end2 = timer() + print(f'Latency per burst: {sec_end2 - sec_start} ({b+1}/{bursts})') + + end = timer() + avg_send_latency = (end - start) / bursts + print(f'Average send latency per burst for generator {proc_num} was: {avg_send_latency}') + if avg_send_latency > 1.1: + print(f'This is higher than expected (1). Maybe increase the number of threads?') + futures = wait_for_futures(client) + client.close() + return futures + +def wait_for_futures(client: FlinkClientSync): + done = False + while not done: + done = True + for event_id, fut in client._futures.items(): + result = fut["ret"] + if result is None: + done = False + time.sleep(0.5) + break + futures = client._futures + return futures + + +def write_dict_to_pkl(futures_dict, filename): + """ + Writes a dictionary of event data to a pickle file. + + Args: + futures_dict (dict): A dictionary where each key is an event ID and the value is another dict. + filename (str): The name of the pickle file to write to. 
+ """ + + # Prepare the data for the DataFrame + data = [] + for event_id, event_data in futures_dict.items(): + ret: EventResult = event_data.get("ret") + row = { + "event_id": event_id, + "sent": str(event_data.get("sent")), + "sent_t": event_data.get("sent_t"), + "ret": str(event_data.get("ret")), + "ret_t": event_data.get("ret_t"), + "roundtrip": ret.metadata["roundtrip"] if ret else None, + "flink_time": ret.metadata["flink_time"] if ret else None, + "deser_times": ret.metadata["deser_times"] if ret else None, + "loops": ret.metadata["loops"] if ret else None, + "latency": event_data["ret_t"][1] - event_data["sent_t"][1] if ret else None + } + data.append(row) + + # Create a DataFrame and save it as a pickle file + df = pd.DataFrame(data) + + # Multiply flink_time by 1000 to convert to milliseconds + df['flink_time'] = df['flink_time'] * 1000 + + return df + +def main(): + parser = argparse.ArgumentParser(description="Run the benchmark and save results.") + parser.add_argument("-o", "--output", type=str, default="benchmark_results.pkl", help="Output file name for the results") + parser.add_argument("--requests_per_second", type=int, default=10, help="Number of messages per burst") + parser.add_argument("--seconds", type=int, default=100, help="Number of seconds to benchmark for") + parser.add_argument("--threads", type=int, default=1, help="Number of concurrent threads") + parser.add_argument("--experiment", type=str, default="baseline", help="Experiment type") + parser.add_argument("--branch_chance", type=float, default=0.5, help="Brance chance") + parser.add_argument("--no_init", action="store_true", help="Don't populate") + args = parser.parse_args() + + rps_per_thread = int(args.requests_per_second / args.threads) + sleep_time = 0.95 / rps_per_thread + + EXPERIMENT = args.experiment + + print(f"Experiment [{EXPERIMENT}]") + print(f"Starting with args:\n{args}") + print(f"Actual requests per second is {int(rps_per_thread * args.threads)} (due to rounding)") + + init_cascade_from_module("deathstar_movie_review.entities.entities") + + init_client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) + + # for prefetch experiment + df_baseline = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch")] + df_parallel, _ = parallelize_until_if(df_baseline) + df_parallel.name = "upload_movie_prefetch_parallel" + cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")] = df_parallel + + for df in cascade.core.dataflows.values(): + print(df.to_dot()) + for block in df.blocks.values(): + print(block.function_string) + + if not args.no_init: + print("Populating...") + populate_compose_review(init_client) + populate_movie(init_client) + init_client.producer.flush() + wait_for_futures(init_client) + print("Done.") + time.sleep(1) + + print("Starting benchmark") + prefetch = args.experiment == "prefetch" + + func_args = [(t, rps_per_thread, sleep_time, args.seconds, prefetch, args.branch_chance) for t in range(args.threads)] + with Pool(args.threads) as p: + results = p.map(benchmark_runner, func_args) + + results = {k: v for d in results for k, v in d.items()} + + print("last result:") + print(list(results.values())[-1]) + t = len(results) + r = 0 + for result in results.values(): + if result["ret"] is not None: + r += 1 + + print(f"{r}/{t} results recieved.") + print(f"Writing results to {args.output}") + + count = Counter([r["ret"].result for r in results.values()]) + print(count) + + df = write_dict_to_pkl(results, args.output) + + flink_time = 
df['flink_time'].median() + latency = df['latency'].median() + flink_prct = float(flink_time) * 100 / latency + print(f"Median latency : {latency:.2f} ms") + print(f"Median Flink time : {flink_time:.2f} ms ({flink_prct:.2f}%)") + init_client.close() + + df = preprocess(args.output, df) + df.to_pickle(args.output) + + +import re + +def preprocess(name, df, warmup_time_s=3) -> pd.DataFrame: + # Extract parallelism and mps from the name using regex + match = re.search(r'(.+)_p-(\d+)_rps-(\d+)', name) + if match: + experiment = match.group(1) + parallelism = int(match.group(2)) + mps = int(match.group(3)) + else: + raise Exception() + + # Ignore the first warmup_time seconds of events + warmup_events = int(warmup_time_s * mps) + df = df.iloc[warmup_events:] + + # Calculate the additional Kafka overhead + # df['kafka_overhead'] = df['latency'] - df['flink_time'] + + # Extract median values from df + flink_time_median = df['flink_time'].median() + latency_median = df['latency'].median() + flink_time_99_percentile = df['flink_time'].quantile(0.99) + latency_99_percentile = df['latency'].quantile(0.99) + flink_time_95_percentile = df['flink_time'].quantile(0.95) + latency_95_percentile = df['latency'].quantile(0.95) + + data = { + 'experiment': experiment, + 'parallelism': parallelism, + 'mps': mps, + 'flink_time_median': flink_time_median, + 'latency_median': latency_median, + 'latency_99_percentile': latency_99_percentile, + 'latency_95_percentile': latency_95_percentile, + 'flink_time_99_percentile': flink_time_99_percentile, + 'flink_time_95_percentile': flink_time_95_percentile + } + data = {k:[v] for k,v in data.items()} + return pd.DataFrame(data) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index 11e93d1..fa26440 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -113,7 +113,7 @@ def deathstar_movie_demo(client): "review": req_id, "user": username, "title": movie_title, - "rating": 5, + "rating": None, "text": "good movie!" } @@ -156,7 +156,7 @@ def deathstar_movie_demo(client): "review": req_id, "user": username, "title": movie_title, - "rating": 2, + "rating": None, "text": "bad movie!" 
} @@ -179,4 +179,56 @@ def deathstar_movie_demo(client): assert "review_id" in result del result["review_id"] # randomly generated - assert result == expected \ No newline at end of file + assert result == expected + + +@pytest.mark.integration +def test_deathstar_movie_demo_prefetch_flink(): + print("starting") + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + + + runtime = utils.init_flink_runtime("deathstar_movie_review.entities.entities") + + # for prefetch experiment + df_baseline = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch")] + df_parallel, _ = parallelize_until_if(df_baseline) + df_parallel.name = "upload_movie_prefetch_parallel" + cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")] = df_parallel + + runtime.add_dataflow(df_parallel) + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 2 + + + client = FlinkClientSync() + runtime.run(run_async=True) + + try: + deathstar_prefetch(client) + finally: + client.close() + +def deathstar_prefetch(client): + event = cascade.core.dataflows[DataflowRef("MovieId", "__init__")].generate_event({"title": "cars", "movie_id": 1}, "cars") + result = client.send(event, block=True) + print("movie made") + + + # make the review + event = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")].generate_event({"req_id": "100"}, "100") + result = client.send(event, block=True) + print("review made") + + + event = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie")].generate_event({"review_0": "100", "rating_0": 3}, "cars") + result = client.send(event, block=True) + print("movie uploaded") + + event = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")].generate_event({"review_0": "100", "rating_0": 3}, "cars") + result = client.send(event, block=True) + print("movie uploaded w/ prefetch") \ No newline at end of file diff --git a/run_experiments_gil_workaround.py b/run_experiments_gil_workaround copy.py similarity index 100% rename from run_experiments_gil_workaround.py rename to run_experiments_gil_workaround copy.py diff --git a/run_prefetch_exp.py b/run_prefetch_exp.py new file mode 100755 index 0000000..b3dc058 --- /dev/null +++ b/run_prefetch_exp.py @@ -0,0 +1,68 @@ +import os +import subprocess +import time + +def rps(num, branch_chance, producer_threads=1): + return { + "threads": producer_threads, + "requests_per_second": num, + "seconds": 2, + "branch_chance": branch_chance + } + + +# Define experiment parameters as a list of dictionaries +experiments = [ + {"parallelism": 1, "benchmark_args": {**rps(1, 0.1, producer_threads=1)}}, + {"parallelism": 1, "benchmark_args": {**rps(1, 0.5, producer_threads=1)}}, + {"parallelism": 1, "benchmark_args": {**rps(1, 0.9, producer_threads=1)}}, +] + + + + +print("Tearing down docker containers") +subprocess.run(["docker", "compose", "down"], check=False) + +for e in ["prefetch"]: + for exp in experiments: + print(f"Starting experiment {exp}") + + # Start docker compose + subprocess.run(["docker", "compose", "up", "-d", "--scale", f"taskmanager={exp['parallelism']}", "--force-recreate"], check=True, env={ + "TASK_SLOTS": "1" + }) + + time.sleep(10) + + # Run Flink job + + flink_cmd = [ + "flink", "run", "--pyFiles", "/home/lvanmol/cascade/src,/home/lvanmol/cascade", + "--pyModule", "deathstar_movie_review.demo", "-d", "-p", str(exp['parallelism']) + ] + env = os.environ + env["EXPERIMENT"] = e + subprocess.run(flink_cmd, check=True, env=env) + + # 
Start benchmark + filename = f"{e}_p-{exp['parallelism']}_rps-{exp['benchmark_args']['requests_per_second']}_chance-{exp['benchmark_args']['branch_chance']}.pkl" + benchmark_cmd = [ + "python", "-u", "-m", "deathstar_movie_review.start_prefetch_experiment", "--output", filename, "--experiment", e + ] + + for arg, val in exp['benchmark_args'].items(): + benchmark_cmd.append(f"--{arg}") + benchmark_cmd.append(str(val)) + subprocess.run(benchmark_cmd, check=True) + + # Sleep for experiment duration + # print(f"Sleeping for {exp['sleep']} seconds...") + # time.sleep(exp['sleep']) + + # Stop docker compose + subprocess.run(["docker", "compose", "down"], check=False) + + print(f"Experiment completed.") + +print("All experiments completed.") diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index 4489629..d29552a 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -71,6 +71,9 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: di events.append(ev) return events + + def __str__(self) -> str: + return f"IF {self.predicate_var}" @dataclass class DataflowRef: @@ -122,9 +125,11 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: di # If this CallRemote is a terminal node in event.dataflow, then we don't # need to go back to event.dataflow, so we don't add it to the call stack. # This node is terminal in event.dataflow iff len(targets) == 0 + new_call_stack = event.call_stack if len(targets) > 0: + new_call_stack = event.call_stack.copy() call = CallStackItem(event.dataflow, self.assign_result_to, event.variable_map, targets, key=event.key) - event.call_stack.append(call) + new_call_stack.append(call) return [Event( target, @@ -132,10 +137,13 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: di self.dataflow, _id=event._id, metadata=event.metadata, - call_stack=event.call_stack, + call_stack=new_call_stack, key=new_key) for target in new_targets] + + def __str__(self) -> str: + return f"CALL {self.dataflow}" @dataclass @@ -159,6 +167,9 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: di events.append(ev) return events + def __str__(self) -> str: + return f"LOCAL {self.method}" + @dataclass class CollectNode(Node): """A node in a `Dataflow` corresponding to a merge operator. @@ -179,6 +190,9 @@ def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: di key=event.key) for target in targets] + + def __str__(self) -> str: + return f"COLLECT {self.num_events}" @dataclass class Edge(): @@ -329,7 +343,7 @@ def to_dot(self) -> str: # Add nodes for node in self.nodes.values(): - lines.append(f' {node.id} [label="{node}"];') + lines.append(f' {node.id} [label="{str(node)}"];') # Add edges for edge in self.edges: diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index 86da1a1..463ba52 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -19,7 +19,7 @@ import logging logger = logging.getLogger("cascade") -logger.setLevel("INFO") +logger.setLevel("DEBUG") console_handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') console_handler.setFormatter(formatter) @@ -91,7 +91,7 @@ def process_element(self, event_result: tuple[Event, Any], ctx: ProcessFunction. 
event, result = event_result event = profile_event(event, "Router") - logger.debug(f"RouterOperator Event entered: {event._id}") + logger.debug(f"RouterOperator Event entered: {event}") new_events = list(event.propogate(result, self.dataflows)) @@ -99,6 +99,8 @@ def process_element(self, event_result: tuple[Event, Any], ctx: ProcessFunction. logger.debug(f"RouterOperator: Returned {new_events[0]}") else: logger.debug(f"RouterOperator: Propogated {len(new_events)} new Events") + for i, event in enumerate(new_events): + logger.debug(f"{event} {i+1}/{len(new_events)}") for event in new_events: if isinstance(event, Event): @@ -649,7 +651,7 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka collect_stream = ( op_routed .get_side_output(collect_tag) - .key_by(lambda e: e._id) # might not work in the future if we have multiple merges in one dataflow? + .key_by(lambda e: str(e._id) + "_" + str(e.target.id)) # might not work in the future if we have multiple merges in one dataflow? .process(FlinkCollectOperator()) .name("Collect") .process(RouterOperator(self.dataflows, collect_tag, result_tag)) From fc7900ad3bc76ff8a436459a9bcf400e21a675c3 Mon Sep 17 00:00:00 2001 From: Lucas Van Mol <16979353+lucasvanmol@users.noreply.github.com> Date: Thu, 24 Apr 2025 14:16:05 +0200 Subject: [PATCH 37/37] Add return node & preprocessing --- deathstar_movie_review/entities/entities.py | 3 +- .../start_prefetch_experiment.py | 14 ++- .../test_movie_review_demo.py | 3 +- .../dynamic_prefetching/run_prefetcher.py | 22 +--- run_prefetch_exp.py | 10 +- src/cascade/core.py | 9 +- src/cascade/dataflow/dataflow.py | 84 ++++++++++----- src/cascade/dataflow/operator.py | 4 +- .../dataflow/optimization/parallelization.py | 102 +++++++++++++++++- src/cascade/descriptors/class_descriptor.py | 3 +- src/cascade/frontend/ast_visitors/__init__.py | 5 - .../frontend/ast_visitors/replace_name.py | 6 +- .../frontend/ast_visitors/simplify_returns.py | 51 +++++++++ src/cascade/frontend/cfg/cfg_builder.py | 3 +- src/cascade/frontend/cfg/statement.py | 5 +- .../frontend/generator/dataflow_builder.py | 11 +- src/cascade/frontend/generator/local_block.py | 12 +-- src/cascade/frontend/generator/unparser.py | 2 +- src/cascade/frontend/util.py | 8 +- src/cascade/preprocessing.py | 15 +++ src/cascade/runtime/flink_runtime.py | 2 +- .../frontend/ast_visitors/test_self_rename.py | 59 +++++++--- .../ast_visitors/test_simplify_returns.py | 60 +++++++++++ .../ast_visitors/test_variable_getter.py | 6 +- .../dataflow_analysis/test_branches.py | 14 +-- .../test_dataflow_graph_builder.py | 8 +- .../dataflow_analysis/test_entities.py | 29 +++-- .../dataflow_analysis/test_split_functions.py | 18 ++-- .../flink/test_collect_operator.py | 4 +- tests/integration/pyruntime/test_programs.py | 80 +++++++++++--- tests/optimizations/deathstar_entities.py | 42 ++++++++ tests/optimizations/test_parallelize.py | 50 ++++++++- 32 files changed, 587 insertions(+), 157 deletions(-) create mode 100644 src/cascade/frontend/ast_visitors/simplify_returns.py create mode 100644 src/cascade/preprocessing.py create mode 100644 tests/frontend/ast_visitors/test_simplify_returns.py create mode 100644 tests/optimizations/deathstar_entities.py diff --git a/deathstar_movie_review/entities/entities.py b/deathstar_movie_review/entities/entities.py index 7fa30ae..6d05439 100644 --- a/deathstar_movie_review/entities/entities.py +++ b/deathstar_movie_review/entities/entities.py @@ -90,7 +90,8 @@ def compose(review: ComposeReview, user: User, title: 
MovieId, rating: int, text class Uuid: @staticmethod def gen_uuid(): - return uuid.uuid1().int >> 64 + x = uuid.uuid1().int >> 64 + return x @cascade(globals={'Uuid': Uuid}) class UniqueId(): diff --git a/deathstar_movie_review/start_prefetch_experiment.py b/deathstar_movie_review/start_prefetch_experiment.py index f932989..96d2b6f 100644 --- a/deathstar_movie_review/start_prefetch_experiment.py +++ b/deathstar_movie_review/start_prefetch_experiment.py @@ -129,13 +129,18 @@ def benchmark_runner(args) -> dict[int, dict]: def wait_for_futures(client: FlinkClientSync): done = False while not done: + num_done = 0 done = True for event_id, fut in client._futures.items(): result = fut["ret"] if result is None: done = False - time.sleep(0.5) - break + else: + num_done += 1 + + if not done: + print(f"{num_done}/{len(client._futures)}") + time.sleep(0.5) futures = client._futures return futures @@ -174,8 +179,11 @@ def write_dict_to_pkl(futures_dict, filename): df['flink_time'] = df['flink_time'] * 1000 return df - +import logging def main(): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + parser = argparse.ArgumentParser(description="Run the benchmark and save results.") parser.add_argument("-o", "--output", type=str, default="benchmark_results.pkl", help="Output file name for the results") parser.add_argument("--requests_per_second", type=int, default=10, help="Number of messages per burst") diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index fa26440..f11b59b 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -231,4 +231,5 @@ def deathstar_prefetch(client): event = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")].generate_event({"review_0": "100", "rating_0": 3}, "cars") result = client.send(event, block=True) - print("movie uploaded w/ prefetch") \ No newline at end of file + print("movie uploaded w/ prefetch") + print(result) \ No newline at end of file diff --git a/experiments/dynamic_prefetching/run_prefetcher.py b/experiments/dynamic_prefetching/run_prefetcher.py index b4a8ee6..91adad0 100644 --- a/experiments/dynamic_prefetching/run_prefetcher.py +++ b/experiments/dynamic_prefetching/run_prefetcher.py @@ -10,7 +10,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) import cascade -from cascade.dataflow.optimization.parallelization import parallelize_until_if +from cascade.dataflow.optimization.parallelization import parallelize from cascade.runtime.flink_runtime import FlinkClientSync from cascade.dataflow.dataflow import DataFlow, DataflowRef, EventResult from tests.integration.flink.utils import create_topics, init_cascade_from_module, init_flink_runtime, wait_for_event_id @@ -24,22 +24,6 @@ OUT_TOPIC = "prefetcher-out" INTERNAL_TOPIC = "prefetcher-internal" -def gen_parallel(df): - par, rest = parallelize_until_if(df) - - # join the two dataflows - par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] - for edge in rest.edges: - par.add_edge(edge) - assert len(rest.entry) == 1 - assert len(par_exit) == 1 - par.add_edge_refs(par_exit[0], rest.entry[0].id, None) - - - print(par.to_dot()) - par.name = df.name + "_parallel" - return par - def main(): init_cascade_from_module("experiments.dynamic_prefetching.entities") @@ -55,10 +39,10 @@ def main(): prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch")] - pre_par = 
gen_parallel(prefetch) + pre_par = parallelize(prefetch) cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] = pre_par - base_par = gen_parallel(baseline) + base_par = parallelize(baseline) cascade.core.dataflows[DataflowRef("Prefetcher", "baseline_parallel")] = base_par print(base_par.to_dot()) diff --git a/run_prefetch_exp.py b/run_prefetch_exp.py index b3dc058..b2879e8 100755 --- a/run_prefetch_exp.py +++ b/run_prefetch_exp.py @@ -6,16 +6,16 @@ def rps(num, branch_chance, producer_threads=1): return { "threads": producer_threads, "requests_per_second": num, - "seconds": 2, + "seconds": 50, "branch_chance": branch_chance } # Define experiment parameters as a list of dictionaries experiments = [ - {"parallelism": 1, "benchmark_args": {**rps(1, 0.1, producer_threads=1)}}, - {"parallelism": 1, "benchmark_args": {**rps(1, 0.5, producer_threads=1)}}, - {"parallelism": 1, "benchmark_args": {**rps(1, 0.9, producer_threads=1)}}, + {"parallelism": 4, "benchmark_args": {**rps(500, 0.1, producer_threads=10)}}, + {"parallelism": 4, "benchmark_args": {**rps(500, 0.5, producer_threads=10)}}, + {"parallelism": 4, "benchmark_args": {**rps(500, 0.9, producer_threads=10)}}, ] @@ -24,7 +24,7 @@ def rps(num, branch_chance, producer_threads=1): print("Tearing down docker containers") subprocess.run(["docker", "compose", "down"], check=False) -for e in ["prefetch"]: +for e in ["baseline", "prefetch"]: for exp in experiments: print(f"Starting experiment {exp}") diff --git a/src/cascade/core.py b/src/cascade/core.py index 8578ac1..9f20e36 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -2,22 +2,15 @@ from typing import Dict from klara.core import nodes -from klara.core.tree_rewriter import AstBuilder -from klara.core.cfg import Cfg from cascade.dataflow.operator import StatefulOperator, StatelessOperator, Operator +from cascade.preprocessing import setup_cfg from cascade.wrappers import ClassWrapper from cascade.descriptors import ClassDescriptor from cascade.frontend.generator.dataflow_builder import DataflowBuilder from cascade.dataflow.dataflow import CallLocal, DataFlow, DataflowRef, InitClass -def setup_cfg(code: str) -> Cfg: - as_tree = AstBuilder().string_build(code) - cfg = Cfg(as_tree) - cfg.convert_to_ssa() - return cfg, as_tree - parse_cache: Dict[str, nodes.Module] = {} diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index d29552a..1d49674 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field +import logging from typing import Any, Iterable, List, Mapping, Optional, Union from typing import TYPE_CHECKING import uuid -import cascade if TYPE_CHECKING: from cascade.frontend.generator.local_block import CompiledLocalBlock @@ -41,14 +41,35 @@ def __post_init__(self): Node._id_counter += 1 @abstractmethod - def propogate(self, event: 'Event', targets: list['Node'], result: Any, df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> list['Event']: + def propogate(self, event: 'Event', targets: list['Node'], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> list['Event']: pass +@dataclass +class Return(Node): + return_var: str + """The name of the local variable to return.""" + + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: + events = [] + for target in targets: + ev = Event( + target, + event.variable_map, + event.dataflow, + 
call_stack=event.call_stack, + _id=event._id, + metadata=event.metadata, + key=event.key) + + events.append(ev) + return events + @dataclass class IfNode(Node): predicate_var: str + """The name of the local (boolean) variable to use as predicate.""" - def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: if_cond = event.variable_map[self.predicate_var] targets = [] @@ -108,7 +129,7 @@ class CallRemote(Node): keyby: Optional[str] = None """The key, for calls to Stateful Entities""" - def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: dict['DataflowRef', 'DataFlow']) -> List['Event']: + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow']) -> List['Event']: # remap the variable map of event into the new event new_var_map = {key: event.variable_map[value] for key, value in self.variable_rename.items()} if self.keyby: @@ -150,7 +171,7 @@ def __str__(self) -> str: class CallLocal(Node): method: Union[InvokeMethod, InitClass] - def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: # For simple calls, we only need to change the target. # Multiple targets results in multiple events events = [] @@ -178,7 +199,7 @@ class CollectNode(Node): Their actual implementation is runtime-dependent.""" num_events: int - def propogate(self, event: 'Event', targets: List[Node], result: Any, df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: return [Event( target, event.variable_map, @@ -330,6 +351,11 @@ def remove_node_by_id(self, node_id: int): def get_neighbors(self, node: Node) -> List[Node]: """Get the outgoing neighbors of this `Node`""" + return [edge.to_node for edge in node.outgoing_edges] + # TODO: there is a bug with the underlying adjacency_list: + # it doesn't get updated properly sometimes (during parallelization), + # but seemingly only when modifiying when running flink without minicluster + # mode. 
return [self.nodes[id] for id in self.adjacency_list.get(node.id, [])] def get_predecessors(self, node: Node) -> List[Node]: @@ -357,21 +383,22 @@ def to_dot(self) -> str: return "\n".join(lines) def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None) -> list['Event']: - assert len(self.entry) != 0 - # give all the events the same id - first_event = Event(self.entry[0], variable_map, self.ref(), key=key) - id = first_event._id - events = [first_event] + [Event(entry, variable_map, self.ref(), _id=id, key=key) for entry in self.entry[1:]] - - # TODO: propogate at "compile time" instead of doing this every time - local_events = [] - for ev in events: - if isinstance(ev.target, CallRemote) or isinstance(ev.target, IfNode): - local_events.extend(ev.propogate(None, cascade.core.dataflows)) - else: - local_events.append(ev) + import cascade + assert len(self.entry) != 0 + # give all the events the same id + first_event = Event(self.entry[0], variable_map, self.ref(), key=key) + id = first_event._id + events = [first_event] + [Event(entry, variable_map, self.ref(), _id=id, key=key) for entry in self.entry[1:]] + + # TODO: propogate at "compile time" instead of doing this every time + local_events = [] + for ev in events: + if isinstance(ev.target, CallRemote) or isinstance(ev.target, IfNode): + local_events.extend(ev.propogate(None, cascade.core.dataflows)) + else: + local_events.append(ev) - return local_events + return local_events def __str__(self) -> str: @@ -444,7 +471,8 @@ def propogate(self, result: Any, df_map: dict['DataflowRef','DataFlow']) -> Iter new_targets = [new_targets] var_map = caller.var_map if (x := caller.assign_result_to): - var_map[x] = result + assert isinstance(self.target, Return), type(self.target) + var_map[x] = self.variable_map[self.target.return_var] for target in new_targets: ev = Event( @@ -456,17 +484,19 @@ def propogate(self, result: Any, df_map: dict['DataflowRef','DataFlow']) -> Iter metadata=self.metadata, key=caller.key ) - events.append(ev) - - else: - yield EventResult(self._id, result, self.metadata) + events.append(ev) + else: + if isinstance(self.target, Return): + yield EventResult(self._id, self.variable_map[self.target.return_var], self.metadata) + else: + yield EventResult(self._id, result, self.metadata) return else: current_node = self.target - events = current_node.propogate(self, targets, result, df_map) + events = current_node.propogate(self, targets, df_map) for event in events: - if isinstance(event.target, CallRemote) or isinstance(event.target, IfNode): + if isinstance(event.target, CallRemote) or isinstance(event.target, IfNode) or isinstance(event.target, Return): # recursively propogate CallRemote events yield from event.propogate(None, df_map) else: diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py index 03b0576..acca252 100644 --- a/src/cascade/dataflow/operator.py +++ b/src/cascade/dataflow/operator.py @@ -124,7 +124,7 @@ def handle_invoke_method(self, method: 'InvokeMethod', variable_map: dict[str, A The state `T` is passed along to the function, and may be modified. 
""" - return self.methods[method.method_name].call_block(variable_map=variable_map, state=state) + return self.methods[method.method_name].call_block(variable_map=variable_map, __state=state) def get_method_rw_set(self, method_name: str): return super().get_method_rw_set(method_name) @@ -152,7 +152,7 @@ def handle_invoke_method(self, method: 'InvokeMethod', variable_map: dict[str, A The state `T` is passed along to the function, and may be modified. """ - return self.methods[method.method_name].call_block(variable_map=variable_map, state=None) + return self.methods[method.method_name].call_block(variable_map=variable_map, __state=None) def get_method_rw_set(self, method_name: str): return super().get_method_rw_set(method_name) diff --git a/src/cascade/dataflow/optimization/parallelization.py b/src/cascade/dataflow/optimization/parallelization.py index ca126e4..95a7bf5 100644 --- a/src/cascade/dataflow/optimization/parallelization.py +++ b/src/cascade/dataflow/optimization/parallelization.py @@ -1,7 +1,7 @@ import copy from dataclasses import dataclass from typing import Any, Tuple -from cascade.dataflow.dataflow import CallRemote, CallLocal, CollectNode, DataFlow, Edge, IfNode, Node +from cascade.dataflow.dataflow import CallRemote, CallLocal, CollectNode, DataFlow, Edge, IfNode, Node, Return import cascade @dataclass @@ -9,10 +9,108 @@ class AnnotatedNode: node: Node reads: set[str] writes: set[str] - + +def parallelize(df): + par, rest = parallelize_until_if(df) + + # join the two dataflows + par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] + for node in rest.nodes.values(): + par.add_node(node) + for edge in rest.edges: + par.add_edge(edge) + assert len(rest.entry) == 1 + assert len(par_exit) == 1 + par.add_edge_refs(par_exit[0], rest.entry[0].id, None) + + par.name = df.name + "_parallel" + return par import networkx as nx def parallelize_until_if(df: DataFlow) -> Tuple[DataFlow, DataFlow]: + """Parallelize df, stopping at the first if node. + The first dataflow returned is the parallelized dataflow up until the first if node. 
+    The second dataflow is the rest of the dataflow"""
+    # create the dependency graph
+    ans = []
+    # since we use SSA, every variable has exactly one node that writes it
+    write_nodes = {}
+    graph = nx.DiGraph()
+    for node in df.nodes.values():
+        if isinstance(node, CallRemote):
+            reads = set(node.variable_rename.values())
+            writes = {result} if (result := node.assign_result_to) else set()
+        elif isinstance(node, CallLocal):
+            method = df.blocks[node.method.method_name]
+            reads = method.reads
+            writes = method.writes
+        elif isinstance(node, Return):
+            break
+        elif isinstance(node, IfNode):
+            break
+        else:
+            raise ValueError(f"unsupported node type: {type(node)}")
+
+        write_nodes.update({var: node.id for var in writes})
+
+        ans.append(AnnotatedNode(node, reads, writes))
+        graph.add_node(node.id)
+
+    # Add the edges in the dependency graph
+    nodes_with_indegree_0 = set(graph.nodes)
+    n_map = copy.deepcopy(df.nodes)
+    for node in ans:
+        for read in node.reads:
+            if read in write_nodes:
+                # "read" will not be in write nodes if it is part of the arguments
+                # a more thorough implementation would not need the if check,
+                # and add the arguments as writes to some function entry node
+                graph.add_edge(write_nodes[read], node.node.id)
+                try:
+                    nodes_with_indegree_0.remove(node.node.id)
+                except KeyError:
+                    pass
+
+
+
+    updated = DataFlow(df.name, df.operator_name)
+    # updated.blocks = df.blocks
+    updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0]
+
+    rest = copy.deepcopy(df)
+
+    collectors = {}
+    finishers = set()
+    for u in graph.nodes:
+        updated.add_node(n_map[u])
+        rest.remove_node_by_id(u)
+        if graph.in_degree(u) > 1:
+            c = CollectNode(0)
+            updated.add_node(c)
+            collectors[u] = c.id
+            updated.add_edge_refs(c.id, u)
+        elif graph.out_degree(u) == 0:
+            finishers.add(u)
+
+    if len(finishers) > 1:
+        c = CollectNode(0)
+        updated.add_node(c)
+        for f in finishers:
+            c.num_events += 1
+            updated.add_edge_refs(f, c.id)
+
+
+    for u, v in graph.edges:
+        if v in collectors:
+            v = collectors[v]
+            updated.nodes[v].num_events += 1
+
+        updated.add_edge_refs(u, v, None)
+
+
+    return updated, rest
+
+import networkx as nx
+def parallelize_until_if_DEPRECATED(df: DataFlow) -> Tuple[DataFlow, DataFlow]:
     """Parallelize df, stopping at the first if node.
     The first dataflow returned is the parallelized dataflow up until the first if node.
The second dataflow is the rest of the dataflow""" # create the dependency graph diff --git a/src/cascade/descriptors/class_descriptor.py b/src/cascade/descriptors/class_descriptor.py index 7924b02..efd6277 100644 --- a/src/cascade/descriptors/class_descriptor.py +++ b/src/cascade/descriptors/class_descriptor.py @@ -2,8 +2,9 @@ from typing import Any, Optional from klara.core import nodes -from cascade.frontend.ast_visitors import ExtractClassDefNode, ExtractMethodVisitor from cascade.descriptors.method_descriptor import MethodDescriptor +from cascade.frontend.ast_visitors.extract_class_def_node import ExtractClassDefNode +from cascade.frontend.ast_visitors.extract_class_methods import ExtractMethodVisitor class ClassDescriptor: """A description of a class.""" diff --git a/src/cascade/frontend/ast_visitors/__init__.py b/src/cascade/frontend/ast_visitors/__init__.py index 32feb97..e69de29 100644 --- a/src/cascade/frontend/ast_visitors/__init__.py +++ b/src/cascade/frontend/ast_visitors/__init__.py @@ -1,5 +0,0 @@ -from .extract_type_visitor import ExtractTypeVisitor -from .contains_attribute_visitor import ContainsAttributeVisitor -from .variable_getter import VariableGetter -from .extract_class_def_node import ExtractClassDefNode -from .extract_class_methods import ExtractMethodVisitor \ No newline at end of file diff --git a/src/cascade/frontend/ast_visitors/replace_name.py b/src/cascade/frontend/ast_visitors/replace_name.py index c578c8c..2b6a4a7 100644 --- a/src/cascade/frontend/ast_visitors/replace_name.py +++ b/src/cascade/frontend/ast_visitors/replace_name.py @@ -3,15 +3,15 @@ from klara.core import nodes class ReplaceSelfWithState(AstVisitor): - """Replace attributes with "self" into "state", and remove SSA versioning. + """Replace attributes with "self" into "__state", and remove SSA versioning. e.g.: - self_0.balance_0 -> state.balance + self_0.balance_0 -> __state['balance'] """ def __init__(self): self.target: str = "self" - self.new: str = "state" + self.new: str = "__state" @classmethod def replace(cls, node): diff --git a/src/cascade/frontend/ast_visitors/simplify_returns.py b/src/cascade/frontend/ast_visitors/simplify_returns.py new file mode 100644 index 0000000..443d1b9 --- /dev/null +++ b/src/cascade/frontend/ast_visitors/simplify_returns.py @@ -0,0 +1,51 @@ +from klara.core.ssa_visitors import AstVisitor +from klara.core import nodes + +def simplify_returns(node): + sr = SimplifyReturns.replace(node) + for parent, n, target in sr.inserts: + try: + i = parent.body.index(n) + parent.body.insert(i, target) + except ValueError as e: + if isinstance(parent, nodes.If): + i = parent.orelse.index(n) + parent.orelse.insert(i, target) + else: + raise e + +class SimplifyReturns(AstVisitor): + """Replace attributes with "self" into "state", and remove SSA versioning. 
+ + e.g.: + self_0.balance_0 -> state.balance + """ + + def __init__(self): + self.temps = 0 + self.inserts = [] + + @classmethod + def replace(cls, node): + c = cls() + c.visit(node) + return c + + def replace_name(self, node: nodes.Return): + new_assign = nodes.Assign(parent=node.parent, lineno=node.lineno) + target = nodes.AssignName(parent=new_assign) + target.postinit(id=f"__ret_{self.temps}") + self.temps += 1 + new_assign.postinit(targets=[target], value=node.value) + node.value = nodes.Name() + node.value.postinit(target.id) + + assert hasattr(node.parent, "body"), type(node.parent) + print(f"replacing {node} in {node.parent} with {new_assign}") + self.inserts.append((node.parent, node, new_assign)) + + def visit_return(self, node: nodes.Return): + + if not isinstance(node.value, nodes.Name): + self.replace_name(node) + diff --git a/src/cascade/frontend/cfg/cfg_builder.py b/src/cascade/frontend/cfg/cfg_builder.py index 6e35128..29aa63d 100644 --- a/src/cascade/frontend/cfg/cfg_builder.py +++ b/src/cascade/frontend/cfg/cfg_builder.py @@ -1,8 +1,9 @@ from klara.core.cfg import ModuleLabel, TempAssignBlock from klara.core import nodes +from cascade.frontend.ast_visitors.contains_attribute_visitor import ContainsAttributeVisitor +from cascade.frontend.ast_visitors.variable_getter import VariableGetter from cascade.frontend.cfg import Statement, ControlFlowGraph -from cascade.frontend.ast_visitors import ContainsAttributeVisitor, VariableGetter class ControlFlowGraphBuilder: diff --git a/src/cascade/frontend/cfg/statement.py b/src/cascade/frontend/cfg/statement.py index a8e4783..8185e61 100644 --- a/src/cascade/frontend/cfg/statement.py +++ b/src/cascade/frontend/cfg/statement.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, field from klara.core.cfg import RawBasicBlock -from klara.core.nodes import Attribute +from klara.core.nodes import Attribute, Return @dataclass class Statement: @@ -32,5 +32,8 @@ def set_attribute(self, attribute: Attribute): def is_remote(self) -> bool: return self.remote_call + def is_return(self) -> bool: + return isinstance(self.block, Return) + def __hash__(self): return hash(self.block_num) diff --git a/src/cascade/frontend/generator/dataflow_builder.py b/src/cascade/frontend/generator/dataflow_builder.py index 36296ee..437d542 100644 --- a/src/cascade/frontend/generator/dataflow_builder.py +++ b/src/cascade/frontend/generator/dataflow_builder.py @@ -1,8 +1,9 @@ from typing import Any, Optional import networkx as nx -from cascade.dataflow.dataflow import DataFlow, DataflowRef, IfNode +from cascade.dataflow.dataflow import DataFlow, DataflowRef, IfNode, Return from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor +from cascade.frontend.ast_visitors.simplify_returns import SimplifyReturns from cascade.frontend.cfg.cfg_builder import ControlFlowGraphBuilder from cascade.frontend.cfg import Statement, ControlFlowGraph from cascade.frontend.generator.local_block import LocalBlock, to_entity_call @@ -23,13 +24,13 @@ def split_statements_once(statements: list[Statement]) -> tuple[list[Statement], """ assert len(statements) > 0 - if statements[0].is_remote(): + if statements[0].is_remote() or statements[0].is_return(): return [statements[0]], statements[1:] # find the next remote call i = 0 first_half = [] - while i < len(statements) and not statements[i].is_remote(): + while i < len(statements) and not statements[i].is_remote() and not statements[i].is_return(): first_half.append(statements[i]) i += 1 @@ -219,6 +220,10 @@ def 
build_df(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> Data rawblock = statement_block[0].block assert isinstance(rawblock, nodes.Bool), type(rawblock) node = IfNode(repr(rawblock.value)) + elif len(statement_block) == 1 and statement_block[0].is_return(): + rawblock = statement_block[0].block + assert isinstance(rawblock.value, nodes.Name), f"Return values must be simple names, not {type(rawblock.value)}: {repr(rawblock.value)}" + node = Return(repr(rawblock.value)) else: block = LocalBlock(list(statement_block), self.name, block_num, op_name, self.globals) block_num += 1 diff --git a/src/cascade/frontend/generator/local_block.py b/src/cascade/frontend/generator/local_block.py index 69aea72..0400ad8 100644 --- a/src/cascade/frontend/generator/local_block.py +++ b/src/cascade/frontend/generator/local_block.py @@ -3,7 +3,7 @@ from cascade.frontend.cfg import Statement -from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState +# from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState from cascade.frontend.generator.unparser import unparse from cascade.dataflow.dataflow import CallRemote, CallLocal, DataFlow, DataflowRef, InvokeMethod @@ -102,14 +102,14 @@ def to_string(self) -> str: return compiled_method_as_string def get_method_signature(self) -> str: - return f'variable_map, state' + return f'variable_map, __state' def body_to_string(self) -> str: body = [] # Read from the variable map for v in sorted(self.reads - self.writes): - if not (v in [ 'self_0','self']): + if v != "__state": body.append(f'{v} = variable_map[\'{v}\']') # Write statements @@ -118,8 +118,8 @@ def body_to_string(self) -> str: if type(block) == nodes.FunctionDef: continue - # TODO: do this in preprocessing - ReplaceSelfWithState.replace(block) + # # TODO: do this in preprocessing + # ReplaceSelfWithState.replace(block) body.append(unparse(block)) @@ -128,7 +128,7 @@ def body_to_string(self) -> str: for v in sorted(self.writes - self.reads): if not (v in [ 'self_0','self']): body.append(f'variable_map[\'{v}\'] = {v}') - body.append('return None') + # body.append('return None') return "\n".join(body) diff --git a/src/cascade/frontend/generator/unparser.py b/src/cascade/frontend/generator/unparser.py index 0fb8659..1e677e6 100644 --- a/src/cascade/frontend/generator/unparser.py +++ b/src/cascade/frontend/generator/unparser.py @@ -57,6 +57,6 @@ def unparse(block: RawBasicBlock): res = unparse(block.values[0]) for v in block.values[1:]: res += " {} {}".format(block.op, unparse(v)) - return res + return res case _: raise NotImplementedError(f"{type(block)}: {block}") diff --git a/src/cascade/frontend/util.py b/src/cascade/frontend/util.py index 22f10e3..0f3d29d 100644 --- a/src/cascade/frontend/util.py +++ b/src/cascade/frontend/util.py @@ -5,6 +5,8 @@ from klara.core.tree_rewriter import AstBuilder from klara.core.cfg import Cfg +from cascade.frontend.ast_visitors.simplify_returns import simplify_returns + color_map_map = {0: 'b', 1:'g', 2:'r', 3:'c', 4:'m', 5:'y', 6:'k', -1:'pink'} @@ -41,12 +43,6 @@ def plot_dataflow_graph(G: nx.DiGraph, grey_background: bool = True): if grey_background: fig.set_facecolor('darkgrey') -def setup_cfg(code: str) -> Cfg: - as_tree = AstBuilder().string_build(code) - cfg = Cfg(as_tree) - cfg.convert_to_ssa() - return cfg - def to_camel_case(name): return re.sub(r'(? 
tuple[Cfg, nodes.Module]: + as_tree = AstBuilder().string_build(code) + cfg = Cfg(as_tree) + cfg.convert_to_ssa() + if preprocess: + ReplaceSelfWithState.replace(as_tree) + simplify_returns(as_tree) + # TODO: do this in preprocessing + return cfg, as_tree \ No newline at end of file diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py index 463ba52..66faba6 100644 --- a/src/cascade/runtime/flink_runtime.py +++ b/src/cascade/runtime/flink_runtime.py @@ -19,7 +19,7 @@ import logging logger = logging.getLogger("cascade") -logger.setLevel("DEBUG") +logger.setLevel("INFO") console_handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') console_handler.setFormatter(formatter) diff --git a/tests/frontend/ast_visitors/test_self_rename.py b/tests/frontend/ast_visitors/test_self_rename.py index d047d0c..0cae724 100644 --- a/tests/frontend/ast_visitors/test_self_rename.py +++ b/tests/frontend/ast_visitors/test_self_rename.py @@ -1,31 +1,60 @@ from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState -from cascade.frontend.util import setup_cfg -from cascade.frontend.ast_visitors.variable_getter import VariableGetter +from cascade.preprocessing import setup_cfg from klara.core import nodes def test_replace_self_with_state(): code = "self.balance = self.balance + 10" - cfg = setup_cfg(code) - ssa_code = cfg.block_list[1].ssa_code - node, = ssa_code.code_list - ReplaceSelfWithState.replace(node) + cfg, tree = setup_cfg(code, preprocess=False) + ReplaceSelfWithState.replace(tree) + assert isinstance(tree, nodes.Module) + node = tree.body[0] assert isinstance(node, nodes.Assign) assert isinstance(node.targets, list) assert isinstance(node.value, nodes.BinOp) - assert str(node.targets[0]) == "state['balance']" - assert str(node.value.left) == "state['balance']" + assert str(node.targets[0]) == "__state['balance']" + assert str(node.value.left) == "__state['balance']" def test_replace_self_with_state_dict(): code = "self.data['b'] = self.data['a'] + self.balance" - cfg = setup_cfg(code) - ssa_code = cfg.block_list[1].ssa_code - node, = ssa_code.code_list - ReplaceSelfWithState.replace(node) + cfg, tree = setup_cfg(code, preprocess=False) + ReplaceSelfWithState.replace(tree) + + assert isinstance(tree, nodes.Module) + node = tree.body[0] assert isinstance(node, nodes.Assign) assert isinstance(node.targets, list) assert isinstance(node.value, nodes.BinOp) - assert str(node.targets[0]) == "state['data']['b']" - assert str(node.value.left) == "state['data']['a']" - assert str(node.value.right) == "state['balance']" \ No newline at end of file + assert str(node.targets[0]) == "__state['data']['b']" + assert str(node.value.left) == "__state['data']['a']" + assert str(node.value.right) == "__state['balance']" + +def test_replace_self_assign(): + code = "__ret_2 = self.price" + cfg, tree = setup_cfg(code, preprocess=False) + ReplaceSelfWithState.replace(tree) + + + assert isinstance(tree, nodes.Module) + node = tree.body[0] + assert isinstance(node, nodes.Assign) + assert isinstance(node.targets, list) + assert isinstance(node.value, nodes.Subscript), type(node.value) + assert str(node.targets[0]) == "__ret_2" + assert str(node.value) == "__state['price']" + print(str(node)) + +def test_replace_self_assign_after_return(): + code = "__ret_2 = self.price" + cfg, tree = setup_cfg(code, preprocess=False) + ReplaceSelfWithState.replace(tree) + + assert isinstance(tree, nodes.Module) + node = tree.body[0] 
+ assert isinstance(node, nodes.Assign) + assert isinstance(node.targets, list) + assert isinstance(node.value, nodes.Subscript), type(node.value) + assert str(node.targets[0]) == "__ret_2" + assert str(node.value) == "__state['price']" + print(str(node)) \ No newline at end of file diff --git a/tests/frontend/ast_visitors/test_simplify_returns.py b/tests/frontend/ast_visitors/test_simplify_returns.py new file mode 100644 index 0000000..02199b7 --- /dev/null +++ b/tests/frontend/ast_visitors/test_simplify_returns.py @@ -0,0 +1,60 @@ +from cascade.frontend.ast_visitors.simplify_returns import SimplifyReturns, simplify_returns +from cascade.frontend.generator.unparser import unparse +from cascade.preprocessing import setup_cfg +from klara.core import nodes +from klara.core.tree_rewriter import AstBuilder +from klara.core.cfg import Cfg + +def setup_cfg_no_ssa(code: str) -> Cfg: + as_tree = AstBuilder().string_build(code) + cfg = Cfg(as_tree) + return cfg, as_tree + +def test_simplify_return_state(): + code = "return self.balance" + cfg, tree = setup_cfg_no_ssa(code) + for s in tree.get_statements(): + print(repr(s)) + sr = SimplifyReturns.replace(tree) + simplify_returns(tree) + + for s in tree.get_statements(): + print(repr(s)) + +def test_simplify_return_name(): + code = "return cat" + cfg, tree = setup_cfg_no_ssa(code) + for s in tree.get_statements(): + print(repr(s)) + sr = SimplifyReturns.replace(tree) + simplify_returns(tree) + + for s in tree.get_statements(): + print(repr(s)) + +def test_simplify_return_binop(): + code = """a = 1 +return 4+1""" + cfg, tree = setup_cfg_no_ssa(code) + + for s in tree.get_statements(): + print(repr(s)) + simplify_returns(tree) + + for s in tree.get_statements(): + print(repr(s)) + +def test_simplify_return_multiple(): + code = """a = 1 +if a == 1: + return 3 + 2 +else: + return a""" + cfg, tree = setup_cfg_no_ssa(code) + + for b in tree.get_statements(): + print(repr(b)) + simplify_returns(tree) + + for b in tree.get_statements(): + print(repr(b)) \ No newline at end of file diff --git a/tests/frontend/ast_visitors/test_variable_getter.py b/tests/frontend/ast_visitors/test_variable_getter.py index f59fa5d..ea168f4 100644 --- a/tests/frontend/ast_visitors/test_variable_getter.py +++ b/tests/frontend/ast_visitors/test_variable_getter.py @@ -1,10 +1,10 @@ -from cascade.frontend.util import setup_cfg +from cascade.preprocessing import setup_cfg from cascade.frontend.ast_visitors.variable_getter import VariableGetter def test_variable_getter(): code = "item_price = item.get_price()" - cfg = setup_cfg(code) + cfg, _ = setup_cfg(code) ssa_code = cfg.block_list[1].ssa_code node, = ssa_code.code_list variable_getter = VariableGetter.get_variable(node) @@ -16,7 +16,7 @@ def test_variable_getter(): def test_variable_getter_attr(): code = "self.balance = self.balance + 1" - cfg = setup_cfg(code) + cfg, _ = setup_cfg(code, preprocess=False) ssa_code = cfg.block_list[1].ssa_code node, = ssa_code.code_list variable_getter = VariableGetter.get_variable(node) diff --git a/tests/frontend/dataflow_analysis/test_branches.py b/tests/frontend/dataflow_analysis/test_branches.py index a5f54cc..f2bc08e 100644 --- a/tests/frontend/dataflow_analysis/test_branches.py +++ b/tests/frontend/dataflow_analysis/test_branches.py @@ -2,7 +2,7 @@ from cascade.dataflow.dataflow import DataFlow, DataflowRef, IfNode from cascade.frontend.generator.dataflow_builder import DataflowBuilder -from cascade.frontend.util import setup_cfg +from cascade.preprocessing import setup_cfg from klara.core 
import nodes @@ -17,7 +17,7 @@ def buy_item(self, item: 'Item') -> int: else: x = 10 return self.balance""") - cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -32,7 +32,7 @@ def buy_item(self, item: 'Item') -> int: df = sf.build(dataflows, "User") print(df.to_dot()) - assert len(df.nodes) == 6 + assert len(df.nodes) == 7 ifnode = None for node in df.nodes.values(): if isinstance(node, IfNode): @@ -53,7 +53,7 @@ def buy_item(self, item: 'Item') -> int: else: x = 10 return self.balance""") - cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -85,7 +85,7 @@ def buy_item(self, item: 'Item') -> int: item_price = item.get_price() msg = str(item_price) + " is too expensive!" return msg""") - cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -112,7 +112,7 @@ def buy_item(self, item: 'Item') -> int: self.balance = self.balance - item_price x = 0 return item_price""") - cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -149,7 +149,7 @@ def buy_item(self, item: 'Item') -> int: item_price = item.get_price() msg = "item is too expensive!" return msg""") - cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] diff --git a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py index 350a1c5..68c5a9b 100644 --- a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py +++ b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py @@ -5,7 +5,7 @@ from cascade.frontend.cfg.cfg_builder import ControlFlowGraphBuilder from cascade.frontend.cfg import Statement, ControlFlowGraph -from cascade.frontend.util import setup_cfg +from cascade.preprocessing import setup_cfg def test_linear_program(): @@ -18,7 +18,7 @@ def get_total(item1: Stock, item2: Stock): total = Adder.add(q1, q2) return total""") - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class: nodes.Block = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -41,7 +41,7 @@ def get_total(item1: Stock, item2: Stock): total = Adder.add(item1.get_quantity(), item2.get_quantity()) return total""") - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class: nodes.Block = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -67,7 +67,7 @@ def test_branches(item1: Stock, item2: Stock): a = 0 return a""") - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list print(blocks) test_class: nodes.Block = blocks[2] diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py index 99cd82d..6a3ffca 100644 --- a/tests/frontend/dataflow_analysis/test_entities.py +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -8,7 +8,7 @@ from cascade.dataflow.dataflow import CallRemote, CallLocal, DataFlow, DataflowRef from 
cascade.frontend.generator.dataflow_builder import DataflowBuilder -from cascade.frontend.util import setup_cfg +from cascade.preprocessing import setup_cfg def test_call_entity(): program: str = dedent(""" @@ -18,7 +18,7 @@ def get_total(item1: Stock, item2: Stock): a = item1.get_quantity() b = item2.get_quantity() return a+b""") - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -34,7 +34,7 @@ def get_total(item1: Stock, item2: Stock): df = sf.build(dataflows, "Test") ## TODO: check blocks/df - assert len(df.nodes) == 3 + assert len(df.nodes) == 4 assert len(df.entry) == 1 entry = df.entry[0] assert isinstance(entry, CallRemote) @@ -54,7 +54,7 @@ class Test: def add(x: int, y: int): return x+y""") - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -68,7 +68,11 @@ def add(x: int, y: int): df = sf.build(dataflows, "Test") assert len(df.blocks) == 1 - assert list(df.blocks.values())[0].call_block({"x_0": 3, "y_0":5 }, None) == 8 + block = list(df.blocks.values())[0] + print(block.function_string) + var_map = {"x_0": 3, "y_0":5 } + block.call_block(var_map, None) + assert sorted(list(var_map.values())) == [3, 5, 8] def test_state(): @@ -80,7 +84,7 @@ def buy_item(self, item: 'Item') -> bool: return self.balance >= 0 """) - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list user_class = blocks[2] buy_item: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] @@ -119,7 +123,7 @@ def upload_unique_id(self, review_id: int): self.review_data["review_id"] = review_id """) - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list user_class = blocks[2] upload_unique: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] @@ -161,7 +165,7 @@ def rand(): return r """) - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list user_class = blocks[2] upload_unique: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] @@ -181,5 +185,12 @@ def rand(): for block in df.blocks.values(): print(block.function_string) - rands = {df.blocks['rand_0'].call_block(variable_map={}, state=None) for x in range(10)} + rands = set() + for _ in range(10): + var_map = {} + df.blocks['rand_0'].call_block(variable_map=var_map, __state=None) + assert len(var_map) == 1 + r = var_map['r_0'] + rands.add(r) + assert len(rands) == 10 \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py b/tests/frontend/dataflow_analysis/test_split_functions.py index c5a68dc..8f25dfb 100644 --- a/tests/frontend/dataflow_analysis/test_split_functions.py +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -8,7 +8,7 @@ from cascade.dataflow.dataflow import DataFlow, DataflowRef from cascade.frontend.generator.dataflow_builder import DataflowBuilder, blocked_cfg, split_cfg from cascade.frontend.cfg.control_flow_graph import ControlFlowGraph -from cascade.frontend.util import setup_cfg +from cascade.preprocessing import setup_cfg def test_entity_calls(): program: str = dedent(""" @@ -23,7 +23,7 @@ def get_total(item1: Stock, item2: Stock, y: int): total = total + a + b total = total - 23 return total""") - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class: nodes.Block = 
blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -46,7 +46,7 @@ def get_total(item1: Stock, item2: Stock, y: int): print(block.function_string) # TODO: Check # entity calls, # of local calls - assert len(df.nodes) == 5 + assert len(df.nodes) == 6 assert len(df.blocks) == 2 def test_branching(): @@ -62,7 +62,7 @@ def test_branching(self) -> int: orelser = 30 post = 40 return 50""") - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class: nodes.Block = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -87,7 +87,7 @@ def test_branching(self) -> int: print(df.to_dot()) for block in df.blocks.values(): print(block.function_string) - assert len(df.nodes) == 5 + assert len(df.nodes) == 6 assert len(df.blocks) == 4 def print_digraph(graph: nx.DiGraph): @@ -119,7 +119,7 @@ def test_branching(self) -> int: x = 10 post = 40 return 50""") - cfg: Cfg = setup_cfg(program) + cfg, _ = setup_cfg(program) blocks = cfg.block_list test_class: nodes.Block = blocks[2] get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] @@ -130,8 +130,10 @@ def test_branching(self) -> int: new = blocked_cfg(sf.cfg.graph, sf.cfg.get_single_source()) assert len(list(new.nodes)) == 5 + print(new.nodes) new_split = split_cfg(new) - assert len(list(new_split.nodes)) == 7 + print(new_split.nodes) + assert len(list(new_split.nodes)) == 8 dataflows = { DataflowRef("Test", "test_branching"): DataFlow("test_branching", "Test", []), @@ -146,7 +148,7 @@ def test_branching(self) -> int: for block in df.blocks.values(): print(block.function_string) - assert len(df.nodes) == 7 + assert len(df.nodes) == 8 assert len(df.blocks) == 5 def test_block_merging(): diff --git a/tests/integration/flink/test_collect_operator.py b/tests/integration/flink/test_collect_operator.py index b29d426..df9b089 100644 --- a/tests/integration/flink/test_collect_operator.py +++ b/tests/integration/flink/test_collect_operator.py @@ -2,7 +2,7 @@ from pyflink.datastream.data_stream import CloseableIterator from cascade.dataflow.dataflow import DataflowRef -from cascade.dataflow.optimization.parallelization import parallelize_until_if +from cascade.dataflow.optimization.parallelization import parallelize, parallelize_until_if from cascade.runtime.flink_runtime import FlinkClientSync import tests.integration.flink.utils as utils @@ -25,7 +25,7 @@ def test_collect_operator(): user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] - df_parallel, _ = parallelize_until_if(user_buy_2) + df_parallel = parallelize(user_buy_2) df_parallel.name = "buy_2_parallel" cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] = df_parallel print(df_parallel.to_dot()) diff --git a/tests/integration/pyruntime/test_programs.py b/tests/integration/pyruntime/test_programs.py index faa1ce1..d1fbd91 100644 --- a/tests/integration/pyruntime/test_programs.py +++ b/tests/integration/pyruntime/test_programs.py @@ -1,10 +1,8 @@ import cascade -import sys from cascade.dataflow.dataflow import DataflowRef -from cascade.dataflow.operator import StatefulOperator, StatelessOperator -from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime +from cascade.dataflow.optimization.parallelization import parallelize from tests.integration.pyruntime.utils import init_python_runtime @@ -17,6 +15,8 @@ def test_checkout_item(): user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] user_buy_item = 
cascade.core.dataflows[DataflowRef("User", "buy_item")] item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + item_get_price = cascade.core.dataflows[DataflowRef("Item", "get_price")] + event = item_init.generate_event({"item_name": "fork", "price": 10}, key="fork") result = client.send(event) @@ -28,20 +28,31 @@ def test_checkout_item(): assert result["price"] == 20 assert result["item_name"] == "spoon" + print(list(item_get_price.blocks.values())[0].function_string) + + event = item_get_price.generate_event({}, key="spoon") + result = client.send(event) + assert result == 20 + + event = item_get_price.generate_event({}, key="fork") + result = client.send(event) + assert result == 10 + event = user_init.generate_event({"username": "test", "balance": 15}, key="test") user = client.send(event) assert user["balance"] == 15 assert user["username"] == "test" + print(user_buy_item.to_dot()) event = user_buy_item.generate_event({"item_0": "fork"}, key=user["username"] ) result = client.send(event) assert runtime.statefuloperators["User"].states["test"]["balance"] == 5 assert result - event = user_buy_item.generate_event({"item_0": "spoon"}, key=user["username"] ) - result = client.send(event) - assert runtime.statefuloperators["User"].states["test"]["balance"] == -15 - assert not result + # event = user_buy_item.generate_event({"item_0": "spoon"}, key=user["username"] ) + # result = client.send(event) + # assert runtime.statefuloperators["User"].states["test"]["balance"] == -15 + # assert not result def test_operator_chaining(): file_name = "tests.integration.pyruntime.operator_chaining" @@ -92,14 +103,14 @@ def test_branching_integration(): branch = cascade.core.dataflows[DataflowRef("Brancher", "branch")] print(branch.to_dot()) - event = branch.generate_event({"cond_0": True}) - result = client.send(event) - assert result == 33 - event = branch.generate_event({"cond_0": False}) result = client.send(event) assert result == 42 + event = branch.generate_event({"cond_0": True}) + result = client.send(event) + assert result == 33 + branch = cascade.core.dataflows[DataflowRef("Brancher", "branch_insta")] print(branch.to_dot()) @@ -109,4 +120,49 @@ def test_branching_integration(): event = branch.generate_event({"cond_0": False}) result = client.send(event) - assert result == 42 \ No newline at end of file + assert result == 42 + +def test_collect_with_return(): + file_name = "tests.integration.common" + + runtime, client = init_python_runtime(file_name) + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + + df_parallel = parallelize(user_buy_2) + df_parallel.name = "buy_2_parallel" + cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] = df_parallel + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 2 + + user_op = cascade.core.operators["User"] + item_op = cascade.core.operators["Item"] + + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + print(user_buy_2.to_dot()) + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + user_get_balance = cascade.core.dataflows[DataflowRef("User", "get_balance")] + df_parallel = cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] + + event = user_init.generate_event({"key": "foo", "balance": 100}, key="foo") + result = client.send(event) + + + event = item_init.generate_event({"key": "fork", "price": 5}, key="fork") + client.send(event) + + event = item_init.generate_event({"key": 
"spoon", "price": 3}, key="spoon") + result = client.send(event) + + + # Buy a fork and spoon + print("sending buy 2") + event = df_parallel.generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") + print(event) + result = client.send(event) + assert result == True + + # Check the balance + event = user_get_balance.generate_event({}, key="foo") + result = client.send(event) + assert result == (100 - 5 - 3) \ No newline at end of file diff --git a/tests/optimizations/deathstar_entities.py b/tests/optimizations/deathstar_entities.py new file mode 100644 index 0000000..dc43e58 --- /dev/null +++ b/tests/optimizations/deathstar_entities.py @@ -0,0 +1,42 @@ +from cascade import cascade + +@cascade +class ComposeReview: + def __init__(self, req_id: str, **kwargs): # **args is a temporary hack to allow for creation of composereview on the fly + self.req_id = req_id + self.review_data = {} + + def upload_unique_id(self, review_id: int): + self.review_data["review_id"] = review_id + + # could use the User class instead? + def upload_user_id(self, user_id: str): + self.review_data["userId"] = user_id + + def upload_movie_id(self, movie_id: str): + self.review_data["movieId"] = movie_id + + def upload_rating(self, rating: int): + self.review_data["rating"] = rating + + def upload_text(self, text: str): + self.review_data["text"] = text + + def get_data(self): + x = self.review_data + return x + +@cascade +class MovieId: + # key: 'title' + def __init__(self, title: str, movie_id: str): + self.title = title + self.movie_id = movie_id + + def upload_movie_prefetch(self, review: ComposeReview, rating: int): + cond = rating is not None + movie_id = self.movie_id + review.upload_rating(rating) + review.upload_movie_id(movie_id) + return cond + \ No newline at end of file diff --git a/tests/optimizations/test_parallelize.py b/tests/optimizations/test_parallelize.py index 578f2d7..e5fd833 100644 --- a/tests/optimizations/test_parallelize.py +++ b/tests/optimizations/test_parallelize.py @@ -7,7 +7,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../src"))) from cascade.dataflow.dataflow import DataflowRef -from cascade.dataflow.optimization.parallelization import parallelize_until_if +from cascade.dataflow.optimization.parallelization import parallelize, parallelize_until_if from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime import cascade @@ -164,3 +164,51 @@ def test_code_motion(): +def test_a(): + cascade.core.clear() # clear cascadeds registerd classes. 
+    assert not cascade.core.registered_classes, "Registered classes should be empty before importing a Cascade \
+        Module"
+    # import the module
+    import_module_name: str = 'deathstar_entities'
+    exec(f'import tests.optimizations.{import_module_name}')
+
+    cascade.core.init()
+
+    prefetch = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch")]
+    compose_init = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")]
+    movie_init = cascade.core.dataflows[DataflowRef("MovieId", "__init__")]
+
+    print(prefetch.to_dot())
+    prefetch_parallel = parallelize(prefetch)
+    print(prefetch_parallel.to_dot())
+    cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")] = prefetch_parallel
+
+    compose_op = cascade.core.operators["ComposeReview"]
+    movie_op = cascade.core.operators["MovieId"]
+
+
+    runtime = PythonRuntime()
+    runtime.add_operator(compose_op)
+    runtime.add_operator(movie_op)
+    runtime.run()
+    client = PythonClientSync(runtime)
+
+
+
+    e = compose_init.generate_event({"req_id": "1"}, key="1")
+    r = client.send(e)
+    print(r)
+
+    e = movie_init.generate_event({"title": "cars", "movie_id": 1}, key="cars")
+    r = client.send(e)
+    print(r)
+
+    print("---")
+    e = prefetch.generate_event({"review_0": "1", "rating_0": 2}, key="cars")
+    r = client.send(e)
+    print(r)
+
+    print("---")
+    e = prefetch_parallel.generate_event({"review_0": "1", "rating_0": 2}, key="cars")
+    r = client.send(e)
+    print(r)
\ No newline at end of file