diff --git a/.gitignore b/.gitignore index 9a91afa..842e1e7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,11 @@ __pycache__ *.egg-info build +.vscode/ + # Experiment artifacts *.png -*.pkl \ No newline at end of file +*.pkl +*.csv +nohup.out +*.zip \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index bdfc6f1..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Python Debugger: Current File", - "type": "debugpy", - "request": "launch", - "program": "${file}", - "console": "integratedTerminal", - "justMyCode": false, - - } - ] -} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 544eb77..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "python.testing.pytestArgs": [ - "tests", - "-s" - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true -} \ No newline at end of file diff --git a/deathstar_hotel_reservation/demo.py b/deathstar_hotel_reservation/demo.py index b54d643..63a6024 100644 --- a/deathstar_hotel_reservation/demo.py +++ b/deathstar_hotel_reservation/demo.py @@ -268,7 +268,7 @@ def user_login_workload_generator(): def benchmark_runner(proc_num) -> dict[int, dict]: print(f'Generator: {proc_num} starting') client = FlinkClientSync("deathstar", "ds-out", "localhost:9092", True) - deathstar_generator = user_login_workload_generator() + deathstar_generator = deathstar_workload_generator() start = timer() for _ in range(bursts): diff --git a/deathstar_hotel_reservation/test_demo.py b/deathstar_hotel_reservation/test_demo.py index dea227f..05302ce 100644 --- a/deathstar_hotel_reservation/test_demo.py +++ b/deathstar_hotel_reservation/test_demo.py @@ -1,100 +1,100 @@ -import os -import sys - -# import cascade -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) - -from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime -from cascade.runtime.flink_runtime import FlinkClientSync, FlinkRuntime -from deathstar_hotel_reservation.demo import DeathstarDemo, recommend, reserve, search_hotel, user_login -import time -import pytest - -@pytest.mark.integration -def test_deathstar_demo(): - ds = DeathstarDemo() - ds.init_runtime(FlinkRuntime("deathstardemo-test", "dsd-out")) - ds.runtime.run(run_async=True) - print("Populating, press enter to go to the next step when done") - ds.populate() - - client = FlinkClientSync("deathstardemo-test", "dsd-out") - input() - print("testing user login") - event = user_login() - client.send(event) - - input() - print("testing reserve") - event = reserve() - client.send(event) - - input() - print("testing search") - event = search_hotel() - client.send(event) - - input() - print("testing recommend (distance)") - time.sleep(0.5) - event = recommend(req_param="distance") - client.send(event) - - input() - print("testing recommend (price)") - time.sleep(0.5) - event = recommend(req_param="price") - client.send(event) - - print(client._futures) - input() - print("done!") - print(client._futures) - -def test_deathstar_demo_python(): - ds = DeathstarDemo() - ds.init_runtime(PythonRuntime()) - ds.runtime.run() - print("Populating, press enter to go to the next step when done") 
- ds.populate() - - time.sleep(0.1) - - client = PythonClientSync(ds.runtime) - print("testing user login") - event = user_login() - result = client.send(event) - assert result == True - event = user_login(succesfull=False) - result = client.send(event) - assert result == False - - print("testing reserve") - event = reserve() - result = client.send(event) - assert result == True - - return - print("testing search") - event = search_hotel() - result = client.send(event) - print(result) - - print("testing recommend (distance)") - time.sleep(0.5) - event = recommend(req_param="distance") - result = client.send(event) - print(result) - - print("testing recommend (price)") - time.sleep(0.5) - event = recommend(req_param="price") - result = client.send(event) - print(result) - - print("done!") - - -if __name__ == "__main__": - test_deathstar_demo() \ No newline at end of file +# import os +# import sys + +# # import cascade +# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) + +# from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime +# from cascade.runtime.flink_runtime import FlinkClientSync, FlinkRuntime +# from deathstar_hotel_reservation.demo import DeathstarDemo, recommend, reserve, search_hotel, user_login +# import time +# import pytest + +# @pytest.mark.integration +# def test_deathstar_demo(): +# ds = DeathstarDemo() +# ds.init_runtime(FlinkRuntime("deathstardemo-test", "dsd-out")) +# ds.runtime.run(run_async=True) +# print("Populating, press enter to go to the next step when done") +# ds.populate() + +# client = FlinkClientSync("deathstardemo-test", "dsd-out") +# input() +# print("testing user login") +# event = user_login() +# client.send(event) + +# input() +# print("testing reserve") +# event = reserve() +# client.send(event) + +# input() +# print("testing search") +# event = search_hotel() +# client.send(event) + +# input() +# print("testing recommend (distance)") +# time.sleep(0.5) +# event = recommend(req_param="distance") +# client.send(event) + +# input() +# print("testing recommend (price)") +# time.sleep(0.5) +# event = recommend(req_param="price") +# client.send(event) + +# print(client._futures) +# input() +# print("done!") +# print(client._futures) + +# def test_deathstar_demo_python(): +# ds = DeathstarDemo() +# ds.init_runtime(PythonRuntime()) +# ds.runtime.run() +# print("Populating, press enter to go to the next step when done") +# ds.populate() + +# time.sleep(0.1) + +# client = PythonClientSync(ds.runtime) +# print("testing user login") +# event = user_login() +# result = client.send(event) +# assert result == True +# event = user_login(succesfull=False) +# result = client.send(event) +# assert result == False + +# print("testing reserve") +# event = reserve() +# result = client.send(event) +# assert result == True + +# return +# print("testing search") +# event = search_hotel() +# result = client.send(event) +# print(result) + +# print("testing recommend (distance)") +# time.sleep(0.5) +# event = recommend(req_param="distance") +# result = client.send(event) +# print(result) + +# print("testing recommend (price)") +# time.sleep(0.5) +# event = recommend(req_param="price") +# result = client.send(event) +# print(result) + +# print("done!") + + +# if __name__ == "__main__": +# test_deathstar_demo() \ No newline at end of file diff --git a/deathstar_movie_review/demo.py b/deathstar_movie_review/demo.py index 60a623b..6cadb8e 100644 --- a/deathstar_movie_review/demo.py +++ b/deathstar_movie_review/demo.py @@ -1,14 
+1,12 @@ from typing import Literal +import cascade +from cascade.dataflow.dataflow import DataflowRef from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination +from cascade.dataflow.optimization.parallelization import parallelize_until_if from cascade.runtime.flink_runtime import FlinkRuntime - -from .entities.user import user_op -from .entities.compose_review import compose_review_op -from .entities.frontend import frontend_df_parallel, frontend_df_serial, frontend_op, text_op, unique_id_op -from .entities.movie import movie_id_op, movie_info_op, plot_op +from tests.integration.flink.utils import create_topics, init_flink_runtime import os -from confluent_kafka.admin import AdminClient, NewTopic KAFKA_BROKER = "localhost:9092" KAFKA_FLINK_BROKER = "kafka:9093" # If running a flink cluster and kafka inside docker, the broker url might be different @@ -17,67 +15,32 @@ OUT_TOPIC = "ds-movie-out" INTERNAL_TOPIC = "ds-movie-internal" -EXPERIMENT: Literal["baseline", "pipelined", "parallel"] = os.getenv("EXPERIMENT", "baseline") - -def create_topics(*required_topics): - conf = { - "bootstrap.servers": KAFKA_BROKER - } - - admin_client = AdminClient(conf) - - # Fetch existing topics - existing_topics = admin_client.list_topics(timeout=5).topics.keys() - - # Find missing topics - missing_topics = [topic for topic in required_topics if topic not in existing_topics] - - if missing_topics: - print(f"Creating missing topics: {missing_topics}") - - # Define new topics (default: 1 partition, replication factor 1) - new_topics = [NewTopic(topic, num_partitions=1, replication_factor=1) for topic in missing_topics] - - # Create topics - futures = admin_client.create_topics(new_topics) - - # Wait for topic creation to complete - for topic, future in futures.items(): - try: - future.result() # Block until the operation is complete - print(f"Topic '{topic}' created successfully") - except Exception as e: - print(f"Failed to create topic '{topic}': {e}") - else: - print("All required topics exist.") +EXPERIMENT: Literal["baseline", "parallel"] = os.getenv("EXPERIMENT", "baseline") def main(): create_topics(IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) - runtime = FlinkRuntime(IN_TOPIC, OUT_TOPIC, internal_topic=INTERNAL_TOPIC) - runtime.init(kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10) - - if EXPERIMENT == "baseline": - frontend_op.dataflow = frontend_df_serial() - elif EXPERIMENT == "pipelined": - frontend_op.dataflow = frontend_df_serial() - dead_node_elimination([], [frontend_op]) - elif EXPERIMENT == "parallel": - frontend_op.dataflow = frontend_df_parallel() - - print(frontend_op.dataflow.to_dot()) + runtime = init_flink_runtime("deathstar_movie_review.entities.entities", IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC, kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=True, parallelism=None) + print(f"Creating dataflow [{EXPERIMENT}]") - runtime.add_operator(compose_review_op) - runtime.add_operator(user_op) - runtime.add_operator(movie_info_op) - runtime.add_operator(movie_id_op) - runtime.add_operator(plot_op) - runtime.add_stateless_operator(frontend_op) - runtime.add_stateless_operator(unique_id_op) - runtime.add_stateless_operator(text_op) - + # for parallel experiment + df_baseline = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + df_parallel, _ = parallelize_until_if(df_baseline) + df_parallel.name = "compose_parallel" + cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel + runtime.add_dataflow(df_parallel) + 
+ # for prefetch experiment + df_baseline = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch")] + df_parallel, _ = parallelize_until_if(df_baseline) + df_parallel.name = "upload_movie_prefetch_parallel" + cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")] = df_parallel + runtime.add_dataflow(df_parallel) + + print(cascade.core.dataflows.keys()) + runtime.run() if __name__ == "__main__": diff --git a/deathstar_movie_review/entities/compose_review.py b/deathstar_movie_review/entities/compose_review.py deleted file mode 100644 index 853e34b..0000000 --- a/deathstar_movie_review/entities/compose_review.py +++ /dev/null @@ -1,59 +0,0 @@ -from typing import Any - -from cascade.dataflow.operator import StatefulOperator - - -class ComposeReview: - def __init__(self, req_id: str, *args): # *args is a temporary hack to allow for creation of composereview on the fly - self.req_id = req_id - self.review_data = {} - - def upload_unique_id(self, review_id: int): - self.review_data["review_id"] = review_id - - # could use the User class instead? - def upload_user_id(self, user_id: str): - self.review_data["userId"] = user_id - - def upload_movie_id(self, movie_id: str): - self.review_data["movieId"] = movie_id - - def upload_rating(self, rating: int): - self.review_data["rating"] = rating - - def upload_text(self, text: str): - self.review_data["text"] = text - - def get_data(self): - return self.review_data - -def upload_unique_id_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["review_id"] = variable_map["review_id"] - -def upload_user_id_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["userId"] = variable_map["user_id"] - -def upload_movie_id_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["movieId"] = variable_map["movie_id"] - -def upload_rating_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["rating"] = variable_map["rating"] - -def upload_text_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - state.review_data["text"] = variable_map["text"] - -def get_data_compiled(variable_map: dict[str, Any], state: ComposeReview) -> Any: - return state.review_data - -compose_review_op = StatefulOperator( - ComposeReview, - { - "upload_unique_id": upload_unique_id_compiled, - "upload_user_id": upload_user_id_compiled, - "upload_movie_id": upload_movie_id_compiled, - "upload_rating": upload_rating_compiled, - "upload_text": upload_text_compiled, - "get_data": get_data_compiled, - }, - {} -) \ No newline at end of file diff --git a/deathstar_movie_review/entities/entities.py b/deathstar_movie_review/entities/entities.py new file mode 100644 index 0000000..6d05439 --- /dev/null +++ b/deathstar_movie_review/entities/entities.py @@ -0,0 +1,121 @@ +import uuid +from cascade import cascade + +@cascade +class ComposeReview: + def __init__(self, req_id: str, **kwargs): # **args is a temporary hack to allow for creation of composereview on the fly + self.req_id = req_id + self.review_data = {} + + def upload_unique_id(self, review_id: int): + self.review_data["review_id"] = review_id + + # could use the User class instead? 
+ def upload_user_id(self, user_id: str): + self.review_data["userId"] = user_id + + def upload_movie_id(self, movie_id: str): + self.review_data["movieId"] = movie_id + + def upload_rating(self, rating: int): + self.review_data["rating"] = rating + + def upload_text(self, text: str): + self.review_data["text"] = text + + def get_data(self): + x = self.review_data + return x + +@cascade +class User: + def __init__(self, username: str, user_data: dict): + self.username = username + self.user_data = user_data + + def upload_user(self, review: ComposeReview): + user_id = self.user_data["userId"] + review.upload_user_id(user_id) + +@cascade +class MovieId: + # key: 'title' + def __init__(self, title: str, movie_id: str): + self.title = title + self.movie_id = movie_id + + def upload_movie(self, review: ComposeReview, rating: int): + cond = rating is not None + if cond: + review.upload_rating(rating) + movie_id = self.movie_id + review.upload_movie_id(movie_id) + return True + else: + movie_id = self.movie_id + review.upload_movie_id(movie_id) + return False + + # if without else isn't invented yet, otherwise this would be + # cond = rating is not None + # if cond: + # review.upload_rating(rating) + # movie_id = self.movie_id + # review.upload_movie_id(movie_id) + + def upload_movie_prefetch(self, review: ComposeReview, rating: int): + cond = rating is not None + movie_id = self.movie_id + + review.upload_rating(rating) + review.upload_movie_id(movie_id) + return cond + + +@cascade +class Frontend(): + @staticmethod + def compose(review: ComposeReview, user: User, title: MovieId, rating: int, text: str): + UniqueId.upload_unique_id_2(review) + user.upload_user(review) + title.upload_movie(review, rating) + # text = text[:CHAR_LIMIT] # an operation like this could be reorderd for better efficiency! 
+ Text.upload_text_2(review, text) + + # TODO: promise pipelining + # uuid = UniqueId.generate() + # review.upload_unique_id(uuid) + + +class Uuid: + @staticmethod + def gen_uuid(): + x = uuid.uuid1().int >> 64 + return x + +@cascade(globals={'Uuid': Uuid}) +class UniqueId(): + @staticmethod + def upload_unique_id_2(review: ComposeReview): + # TODO: support external libraries + review_id = Uuid.gen_uuid() + review.upload_unique_id(review_id) + +@cascade +class Text(): + @staticmethod + def upload_text_2(review: ComposeReview, text: str): + review.upload_text(text) + + +@cascade +class Plot: + def __init__(self, movie_id: str, plot: str): + self.movie_id = movie_id + self.plot = plot + +@cascade +class MovieInfo: + def __init__(self, movie_id: str, info: dict): + self.movie_id = movie_id + self.info = info \ No newline at end of file diff --git a/deathstar_movie_review/entities/frontend.py b/deathstar_movie_review/entities/frontend.py deleted file mode 100644 index db75bc2..0000000 --- a/deathstar_movie_review/entities/frontend.py +++ /dev/null @@ -1,188 +0,0 @@ -from typing import Any -import uuid - -from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, Edge, InvokeMethod, OpNode, StatelessOpNode -from cascade.dataflow.operator import StatelessOperator -from deathstar_movie_review.entities.compose_review import ComposeReview -from deathstar_movie_review.entities.movie import MovieId -from deathstar_movie_review.entities.user import User - - -# unique_id is stateless -class UniqueId(): - @staticmethod - def upload_unique_id_2(review: ComposeReview): - review_id = uuid.uuid1().int >> 64 - review.upload_unique_id(review_id) - -# text is stateless -class Text(): - @staticmethod - def upload_text_2(review: ComposeReview, text: str): - review.upload_text(text) - -CHAR_LIMIT = 50 - -# frontend is made stateless -class Frontend(): - @staticmethod - def compose(review: ComposeReview, user: User, title: MovieId, rating: int, text: str): - - # dead node elimination will remove "returning back" to the original function - # - # cascade could theoritically allow for more advanced analysis, - # that would enable all these to run in parallel. However, this is only - # possible because - # 1. the individual functions don't depend on each other - # 2. the ordering of side-effects does not matter - UniqueId.upload_unique_id_2(review) - user.upload_user(review) - title.upload_movie(review, rating) - - text = text[:CHAR_LIMIT] # an operation like this could be reorderd for better efficiency! - Text.upload_text_2(review, text) - -###### COMPILED FUNCTIONS ###### - -### UPLOAD UNIQUE ### - -def upload_unique_compiled_0(variable_map: dict[str, Any]): - variable_map["review_id"] = uuid.uuid1().int >> 64 - -unique_id_op = StatelessOperator( - { - "upload_unique": upload_unique_compiled_0, - }, - None -) - -df = DataFlow("upload_unique_id") -n0 = StatelessOpNode(unique_id_op, InvokeMethod("upload_unique")) -n1 = OpNode(ComposeReview, InvokeMethod("upload_unique_id"), read_key_from="review") -df.entry = n0 -unique_id_op.dataflow = df - -### TEXT ### - -text_op = StatelessOperator( - {}, - None -) - -df = DataFlow("upload_text") -n0 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review") -df.entry = n0 -text_op.dataflow = df - -### FRONTEND ### - -def compose_compiled_0(variable_map: dict[str, Any]): - pass - - -frontend_op = StatelessOperator( - { - "empty": compose_compiled_0, - }, - None -) - -def frontend_df_serial(): - # This dataflow calls many other dataflows. 
- # It could be more useful to have a "Dataflow" node - df = DataFlow("compose") - n0 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - # Upload Unique DF - n1_a = StatelessOpNode(unique_id_op, InvokeMethod("upload_unique")) - n1_b = OpNode(ComposeReview, InvokeMethod("upload_unique_id"), read_key_from="review") - - n2 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - # Upload User DF - n3_a = OpNode(User, InvokeMethod("upload_user_compiled_0"), read_key_from="user") - n3_b = OpNode(ComposeReview, InvokeMethod("upload_user_id"), read_key_from="review") - - n4 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - # Upload Movie DF - n5_a = OpNode(MovieId, InvokeMethod("upload_movie_cond"), read_key_from="title", is_conditional=True) - n5_b = OpNode(ComposeReview, InvokeMethod("upload_movie_id"), read_key_from="review") - n5_c = OpNode(ComposeReview, InvokeMethod("upload_rating"), read_key_from="review") - - n6 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - # Upload Text DF - n7 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review") - - n8 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - - df.add_edge(Edge(n0, n1_a)) - df.add_edge(Edge(n1_a, n1_b)) - df.add_edge(Edge(n1_b, n2)) - - df.add_edge(Edge(n2, n3_a)) - df.add_edge(Edge(n3_a, n3_b)) - df.add_edge(Edge(n3_b, n4)) - - df.add_edge(Edge(n4, n5_a)) - df.add_edge(Edge(n5_a, n5_b, if_conditional=True)) - df.add_edge(Edge(n5_a, n5_c, if_conditional=False)) - df.add_edge(Edge(n5_b, n6)) - df.add_edge(Edge(n5_c, n6)) - - df.add_edge(Edge(n6, n7)) - df.add_edge(Edge(n7, n8)) - - df.entry = n0 - return df - -def frontend_df_parallel(): - # This dataflow calls many other dataflows. - # It could be more useful to have a "Dataflow" node - df = DataFlow("compose") - # n0 = StatelessOpNode(frontend_op, InvokeMethod("empty")) - ct = CollectNode(assign_result_to="results", read_results_from="dummy") - - # Upload Unique DF - n1_a = StatelessOpNode(unique_id_op, InvokeMethod("upload_unique")) - n1_b = OpNode(ComposeReview, InvokeMethod("upload_unique_id"), read_key_from="review", collect_target=CollectTarget(ct, 4, 0)) - - - # Upload User DF - n3_a = OpNode(User, InvokeMethod("upload_user_compiled_0"), read_key_from="user") - n3_b = OpNode(ComposeReview, InvokeMethod("upload_user_id"), read_key_from="review", collect_target=CollectTarget(ct, 4, 1)) - - - # Upload Movie DF - n5_a = OpNode(MovieId, InvokeMethod("upload_movie_cond"), read_key_from="title", is_conditional=True) - n5_b = OpNode(ComposeReview, InvokeMethod("upload_movie_id"), read_key_from="review", collect_target=CollectTarget(ct, 4, 2)) - n5_c = OpNode(ComposeReview, InvokeMethod("upload_rating"), read_key_from="review", collect_target=CollectTarget(ct, 4, 2)) - - - # Upload Text DF - n7 = OpNode(ComposeReview, InvokeMethod("upload_text"), read_key_from="review",collect_target=CollectTarget(ct, 4, 3)) - - - # df.add_edge(Edge(n0, n1_a)) - df.add_edge(Edge(n1_a, n1_b)) - df.add_edge(Edge(n1_b, ct)) - - # df.add_edge(Edge(n0, n3_a)) - df.add_edge(Edge(n3_a, n3_b)) - df.add_edge(Edge(n3_b, ct)) - - # df.add_edge(Edge(n0, n5_a)) - df.add_edge(Edge(n5_a, n5_b, if_conditional=True)) - df.add_edge(Edge(n5_a, n5_c, if_conditional=False)) - df.add_edge(Edge(n5_b, ct)) - df.add_edge(Edge(n5_c, ct)) - - # df.add_edge(Edge(n0, n7)) - df.add_edge(Edge(n7, ct)) - - df.entry = [n1_a, n3_a, n5_a, n7] - return df - -frontend_op.dataflow = frontend_df_parallel() - diff --git a/deathstar_movie_review/entities/movie.py 
b/deathstar_movie_review/entities/movie.py deleted file mode 100644 index ade4d19..0000000 --- a/deathstar_movie_review/entities/movie.py +++ /dev/null @@ -1,72 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode -from cascade.dataflow.operator import StatefulOperator -from deathstar_movie_review.entities.compose_review import ComposeReview -from deathstar_movie_review.entities.user import User - - -class MovieId: - # key: 'title' - def __init__(self, title: str, movie_id: str): - self.title = title - self.movie_id = movie_id - - def upload_movie(self, review: ComposeReview, rating: int): - if self.movie_id is not None: - review.upload_movie_id(self.movie_id) - else: - review.upload_rating(rating) - - -def upload_movie_compiled_cond_0(variable_map: dict[str, Any], state: MovieId) -> Any: - variable_map["movie_id"] = state.movie_id # SSA - return variable_map["movie_id"] is not None - -movie_id_op = StatefulOperator( - MovieId, - { - "upload_movie_cond": upload_movie_compiled_cond_0 - }, - {} -) - -def upload_movie_df(): - df = DataFlow("movieId_upload_movie") - n0 = OpNode(MovieId, InvokeMethod("upload_movie_cond"), read_key_from="title", is_conditional=True) - n1 = OpNode(ComposeReview, InvokeMethod("upload_movie_id"), read_key_from="review") - n2 = OpNode(ComposeReview, InvokeMethod("upload_rating"), read_key_from="review") - - df.add_edge(Edge(n0, n1, if_conditional=True)) - df.add_edge(Edge(n0, n2, if_conditional=False)) - df.entry = n0 - return df - -movie_id_op.dataflows["upload_movie"] = upload_movie_df() - - - -### Other movie-related operators - -# key: movie_id - -class Plot: - def __init__(self, movie_id: str, plot: str): - self.movie_id = movie_id - self.plot = plot - -class MovieInfo: - def __init__(self, movie_id: str, info: dict): - self.movie_id = movie_id - self.info = info - -movie_info_op = StatefulOperator( - MovieInfo, - {}, - {} -) - -plot_op = StatefulOperator( - Plot, - {}, - {} -) \ No newline at end of file diff --git a/deathstar_movie_review/entities/user.py b/deathstar_movie_review/entities/user.py deleted file mode 100644 index e883277..0000000 --- a/deathstar_movie_review/entities/user.py +++ /dev/null @@ -1,36 +0,0 @@ -from typing import Any -from deathstar_movie_review.entities.compose_review import ComposeReview -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode -from cascade.dataflow.operator import StatefulOperator - - -class User: - def __init__(self, username: str, user_data: dict): - self.username = username - self.user_data = user_data - - def upload_user(self, review: ComposeReview): - review.upload_user_id(self.user_data["userId"]) - - -def upload_user_compiled_0(variable_map: dict[str, Any], state: User) -> Any: - variable_map["user_id"] = state.user_data["userId"] - -user_op = StatefulOperator( - User, - { - "upload_user_compiled_0": upload_user_compiled_0, - }, - {} -) - -def upload_df(): - df = DataFlow("user_upload_user") - n0 = OpNode(User, InvokeMethod("upload_user_compiled_0"), read_key_from="username") - n1 = OpNode(ComposeReview, InvokeMethod("upload_user_id"), read_key_from="review") - - df.add_edge(Edge(n0, n1)) - df.entry = n0 - return df - -user_op.dataflows["upload_user"] = upload_df() \ No newline at end of file diff --git a/deathstar_movie_review/start_benchmark.py b/deathstar_movie_review/start_benchmark.py index 7664b86..270c9e2 100644 --- a/deathstar_movie_review/start_benchmark.py +++ b/deathstar_movie_review/start_benchmark.py @@ -1,8 +1,12 @@ 
import hashlib +from multiprocessing import Pool import time +from typing import Literal import uuid import pandas as pd import random + + from .movie_data import movie_data from .workload_data import movie_titles, charset import sys @@ -13,12 +17,11 @@ # import cascade sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) -from cascade.dataflow.dataflow import Event, EventResult, InitClass, OpNode +from tests.integration.flink.utils import init_cascade_from_module, init_flink_runtime +import cascade +from cascade.dataflow.optimization.parallelization import parallelize_until_if +from cascade.dataflow.dataflow import DataflowRef,EventResult from cascade.runtime.flink_runtime import FlinkClientSync - -from .entities.user import User -from .entities.frontend import frontend_op -from .entities.movie import MovieInfo, Plot, MovieId IN_TOPIC = "ds-movie-in" OUT_TOPIC = "ds-movie-out" @@ -30,7 +33,7 @@ # bursts = 100 def populate_user(client: FlinkClientSync): - init_user = OpNode(User, InitClass(), read_key_from="username") + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] for i in range(1000): user_id = f'user{i}' username = f'username_{i}' @@ -50,69 +53,77 @@ def populate_user(client: FlinkClientSync): "Password": password_hash, "Salt": salt } - event = Event(init_user, {"username": username, "user_data": user_data}, None) + event = user_init.generate_event({"username": username, "user_data": user_data}, key=username) client.send(event) def populate_movie(client: FlinkClientSync): - init_movie_info = OpNode(MovieInfo, InitClass(), read_key_from="movie_id") - init_plot = OpNode(Plot, InitClass(), read_key_from="movie_id") - init_movie_id = OpNode(MovieId, InitClass(), read_key_from="title") - + movieinfo_init = cascade.core.dataflows[DataflowRef("MovieInfo", "__init__")] + plot_init = cascade.core.dataflows[DataflowRef("Plot", "__init__")] + movieid_init = cascade.core.dataflows[DataflowRef("MovieId", "__init__")] + for movie in movie_data: movie_id = movie["MovieId"] # movie info -> write `movie` - event = Event(init_movie_info, {"movie_id": movie_id, "info": movie}, None) + event = movieinfo_init.generate_event({"movie_id": movie_id, "info": movie}, key=movie_id) client.send(event) # plot -> write "plot" - event = Event(init_plot, {"movie_id": movie_id, "plot": "plot"}, None) + event = plot_init.generate_event({"movie_id": movie_id, "plot": "plot"}, key=movie_id) client.send(event) # movie_id_op -> register movie id - event = Event(init_movie_id, {"title": movie["Title"], "movie_id": movie_id}, None) + event = movieid_init.generate_event({"title": movie["Title"], "movie_id": movie_id}, key=movie["Title"]) client.send(event) -def compose_review(req_id): +def compose_review(req_id, parallel=False): user_index = random.randint(0, 999) username = f"username_{user_index}" password = f"password_{user_index}" title = random.choice(movie_titles) - rating = random.randint(0, 10) + rating = None + # rating = random.randint(0, 10) text = ''.join(random.choice(charset) for _ in range(256)) - return frontend_op.dataflow.generate_event({ - "review": req_id, - "user": username, - "title": title, - "rating": rating, - "text": text + if parallel: + compose = cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] + else: + compose = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + + return compose.generate_event({ + "req_id": req_id, # hacky way to create the compose review object when it doesn't exist + "review_0": req_id, + "user_0": 
username, + "title_0": title, + "rating_0": rating, + "text_0": text }) -def deathstar_workload_generator(): +def deathstar_workload_generator(parallel=False): c = 1 while True: - yield compose_review(c) + yield compose_review(c, parallel) c += 1 -def benchmark_runner(proc_num, messages_per_burst, sleeps_per_burst, sleep_time, seconds_per_burst, bursts) -> dict[int, dict]: +def benchmark_runner(args) -> dict[int, dict]: + proc_num, requests_per_second, sleep_time, bursts, parallel = args print(f'Generator: {proc_num} starting') client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) - deathstar_generator = deathstar_workload_generator() + deathstar_generator = deathstar_workload_generator(parallel) start = timer() for b in range(bursts): sec_start = timer() # send burst of messages - for i in range(messages_per_burst): + for i in range(requests_per_second): # sleep sometimes between messages - if i % (messages_per_burst // sleeps_per_burst) == 0: - time.sleep(sleep_time) + # if i % (messages_per_burst // sleeps_per_burst) == 0: + time.sleep(sleep_time) event = next(deathstar_generator) client.send(event) @@ -121,13 +132,16 @@ def benchmark_runner(proc_num, messages_per_burst, sleeps_per_burst, sleep_time, # wait out the second lps = sec_end - sec_start - if lps < seconds_per_burst: + if lps < 1: time.sleep(1 - lps) sec_end2 = timer() print(f'Latency per burst: {sec_end2 - sec_start} ({b+1}/{bursts})') end = timer() - print(f'Average latency per burst: {(end - start) / bursts} ({seconds_per_burst})') + avg_send_latency = (end - start) / bursts + print(f'Average send latency per burst for generator {proc_num} was: {avg_send_latency}') + if avg_send_latency > 1.1: + print(f'This is higher than expected (1). Maybe increase the number of threads?') futures = wait_for_futures(client) client.close() return futures @@ -179,38 +193,60 @@ def write_dict_to_pkl(futures_dict, filename): # Multiply flink_time by 1000 to convert to milliseconds df['flink_time'] = df['flink_time'] * 1000 - df.to_pickle(filename) return df def main(): parser = argparse.ArgumentParser(description="Run the benchmark and save results.") parser.add_argument("-o", "--output", type=str, default="benchmark_results.pkl", help="Output file name for the results") - parser.add_argument("--messages_per_burst", type=int, default=10, help="Number of messages per burst") - parser.add_argument("--sleeps_per_burst", type=int, default=10, help="Number of sleep cycles per burst") - parser.add_argument("--sleep_time", type=float, default=0.08, help="Sleep time between messages") - parser.add_argument("--seconds_per_burst", type=int, default=1, help="Seconds per burst") - parser.add_argument("--bursts", type=int, default=100, help="Number of bursts") + parser.add_argument("--requests_per_second", type=int, default=10, help="Number of messages per burst") + parser.add_argument("--seconds", type=int, default=100, help="Number of seconds to benchmark for") + parser.add_argument("--threads", type=int, default=1, help="Number of concurrent threads") + parser.add_argument("--experiment", type=str, default="baseline", help="Experiment type") + parser.add_argument("--no_init", action="store_true", help="Don't populate") args = parser.parse_args() + rps_per_thread = int(args.requests_per_second / args.threads) + sleep_time = 0.95 / rps_per_thread + + EXPERIMENT = args.experiment + + print(f"Experiment [{EXPERIMENT}]") print(f"Starting with args:\n{args}") + print(f"Actual requests per second is {int(rps_per_thread * args.threads)} (due to rounding)") + + 
init_cascade_from_module("deathstar_movie_review.entities.entities") init_client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) - print("Populating...") - populate_user(init_client) - populate_movie(init_client) - init_client.producer.flush() - wait_for_futures(init_client) - print("Done.") - time.sleep(1) + df_baseline = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + print(df_baseline.to_dot()) + df_parallel, _ = parallelize_until_if(df_baseline) + df_parallel.name = "compose_parallel" + cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel + print(cascade.core.dataflows.keys()) + + for df in cascade.core.dataflows.values(): + print(df.to_dot()) + for block in df.blocks.values(): + print(block.function_string) + + if not args.no_init: + print("Populating...") + populate_user(init_client) + populate_movie(init_client) + init_client.producer.flush() + wait_for_futures(init_client) + print("Done.") + time.sleep(1) print("Starting benchmark") + parallel = args.experiment == "parallel" - # with Pool(threads) as p: - # results = p.map(benchmark_runner, range(threads)) + func_args = [(t, rps_per_thread, sleep_time, args.seconds, parallel) for t in range(args.threads)] + with Pool(args.threads) as p: + results = p.map(benchmark_runner, func_args) - # results = {k: v for d in results for k, v in d.items()} - results = benchmark_runner(0, args.messages_per_burst, args.sleeps_per_burst, args.sleep_time, args.seconds_per_burst, args.bursts) + results = {k: v for d in results for k, v in d.items()} print("last result:") print(list(results.values())[-1]) @@ -218,7 +254,6 @@ def main(): r = 0 for result in results.values(): if result["ret"] is not None: - # print(result) r += 1 print(f"{r}/{t} results recieved.") @@ -233,5 +268,50 @@ def main(): print(f"Median Flink time : {flink_time:.2f} ms ({flink_prct:.2f}%)") init_client.close() + df = preprocess(args.output, df) + df.to_pickle(args.output) + + +import re + +def preprocess(name, df, warmup_time_s=3) -> pd.DataFrame: + # Extract parallelism and mps from the name using regex + match = re.search(r'(.+)_p-(\d+)_rps-(\d+)', name) + if match: + experiment = match.group(1) + parallelism = int(match.group(2)) + mps = int(match.group(3)) + else: + raise Exception() + + # Ignore the first warmup_time seconds of events + warmup_events = int(warmup_time_s * mps) + df = df.iloc[warmup_events:] + + # Calculate the additional Kafka overhead + # df['kafka_overhead'] = df['latency'] - df['flink_time'] + + # Extract median values from df + flink_time_median = df['flink_time'].median() + latency_median = df['latency'].median() + flink_time_99_percentile = df['flink_time'].quantile(0.99) + latency_99_percentile = df['latency'].quantile(0.99) + flink_time_95_percentile = df['flink_time'].quantile(0.95) + latency_95_percentile = df['latency'].quantile(0.95) + + data = { + 'experiment': experiment, + 'parallelism': parallelism, + 'mps': mps, + 'flink_time_median': flink_time_median, + 'latency_median': latency_median, + 'latency_99_percentile': latency_99_percentile, + 'latency_95_percentile': latency_95_percentile, + 'flink_time_99_percentile': flink_time_99_percentile, + 'flink_time_95_percentile': flink_time_95_percentile + } + data = {k:[v] for k,v in data.items()} + return pd.DataFrame(data) + if __name__ == "__main__": main() \ No newline at end of file diff --git a/deathstar_movie_review/start_prefetch_experiment.py b/deathstar_movie_review/start_prefetch_experiment.py new file mode 100644 index 0000000..96d2b6f --- /dev/null +++ 
b/deathstar_movie_review/start_prefetch_experiment.py @@ -0,0 +1,308 @@ +from collections import Counter +import hashlib +from multiprocessing import Pool +import time +from typing import Literal, Optional +import uuid +import pandas as pd +import random + + +from .movie_data import movie_data +from .workload_data import movie_titles, charset +import sys +import os +from timeit import default_timer as timer +import argparse + +# import cascade +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) + +from tests.integration.flink.utils import init_cascade_from_module, init_flink_runtime +import cascade +from cascade.dataflow.optimization.parallelization import parallelize_until_if +from cascade.dataflow.dataflow import DataflowRef,EventResult +from cascade.runtime.flink_runtime import FlinkClientSync + +IN_TOPIC = "ds-movie-in" +OUT_TOPIC = "ds-movie-out" +# threads = 1 +# messages_per_burst = 10 +# sleeps_per_burst = 10 +# sleep_time = 0.08 +# seconds_per_burst = 1 +# bursts = 100 + +def populate_compose_review(client: FlinkClientSync): + cr_init = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")] + for i in range(1000): + event = cr_init.generate_event({"req_id": str(i)}, key=str(i)) + client.send(event) + + +def populate_movie(client: FlinkClientSync): + movieinfo_init = cascade.core.dataflows[DataflowRef("MovieInfo", "__init__")] + plot_init = cascade.core.dataflows[DataflowRef("Plot", "__init__")] + movieid_init = cascade.core.dataflows[DataflowRef("MovieId", "__init__")] + + for movie in movie_data: + movie_id = movie["MovieId"] + + # movie info -> write `movie` + event = movieinfo_init.generate_event({"movie_id": movie_id, "info": movie}, key=movie_id) + client.send(event) + + # plot -> write "plot" + event = plot_init.generate_event({"movie_id": movie_id, "plot": "plot"}, key=movie_id) + client.send(event) + + # movie_id_op -> register movie id + event = movieid_init.generate_event({"title": movie["Title"], "movie_id": movie_id}, key=movie["Title"]) + client.send(event) + + +def upload_movie(rating_chance: float, prefetch=False): + assert 0 <= rating_chance <= 1 + + if random.random() < rating_chance: + rating = random.randint(0, 10) + else: + rating = None + title = random.choice(movie_titles) + req_id = random.randint(0, 999) + + if prefetch: + movie_id = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")] + else: + movie_id = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie")] + + return movie_id.generate_event({ + "review_0": str(req_id), + "rating_0": rating + }, key=title) + +def deathstar_workload_generator(rating_chance: float, prefetch=False): + c = 1 + while True: + yield upload_movie(rating_chance, prefetch) + c += 1 + + +def benchmark_runner(args) -> dict[int, dict]: + proc_num, requests_per_second, sleep_time, bursts, prefetch, rating_chance = args + print(f'Generator: {proc_num} starting') + client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) + deathstar_generator = deathstar_workload_generator(rating_chance, prefetch) + start = timer() + + for b in range(bursts): + sec_start = timer() + + # send burst of messages + for i in range(requests_per_second): + + # sleep sometimes between messages + # if i % (messages_per_burst // sleeps_per_burst) == 0: + time.sleep(sleep_time) + event = next(deathstar_generator) + client.send(event) + + client.flush() + sec_end = timer() + + # wait out the second + lps = sec_end - sec_start + if lps < 1: + time.sleep(1 - lps) + sec_end2 = timer() + 
print(f'Latency per burst: {sec_end2 - sec_start} ({b+1}/{bursts})') + + end = timer() + avg_send_latency = (end - start) / bursts + print(f'Average send latency per burst for generator {proc_num} was: {avg_send_latency}') + if avg_send_latency > 1.1: + print(f'This is higher than expected (1). Maybe increase the number of threads?') + futures = wait_for_futures(client) + client.close() + return futures + +def wait_for_futures(client: FlinkClientSync): + done = False + while not done: + num_done = 0 + done = True + for event_id, fut in client._futures.items(): + result = fut["ret"] + if result is None: + done = False + else: + num_done += 1 + + if not done: + print(f"{num_done}/{len(client._futures)}") + time.sleep(0.5) + futures = client._futures + return futures + + +def write_dict_to_pkl(futures_dict, filename): + """ + Writes a dictionary of event data to a pickle file. + + Args: + futures_dict (dict): A dictionary where each key is an event ID and the value is another dict. + filename (str): The name of the pickle file to write to. + """ + + # Prepare the data for the DataFrame + data = [] + for event_id, event_data in futures_dict.items(): + ret: EventResult = event_data.get("ret") + row = { + "event_id": event_id, + "sent": str(event_data.get("sent")), + "sent_t": event_data.get("sent_t"), + "ret": str(event_data.get("ret")), + "ret_t": event_data.get("ret_t"), + "roundtrip": ret.metadata["roundtrip"] if ret else None, + "flink_time": ret.metadata["flink_time"] if ret else None, + "deser_times": ret.metadata["deser_times"] if ret else None, + "loops": ret.metadata["loops"] if ret else None, + "latency": event_data["ret_t"][1] - event_data["sent_t"][1] if ret else None + } + data.append(row) + + # Create a DataFrame and save it as a pickle file + df = pd.DataFrame(data) + + # Multiply flink_time by 1000 to convert to milliseconds + df['flink_time'] = df['flink_time'] * 1000 + + return df +import logging +def main(): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + parser = argparse.ArgumentParser(description="Run the benchmark and save results.") + parser.add_argument("-o", "--output", type=str, default="benchmark_results.pkl", help="Output file name for the results") + parser.add_argument("--requests_per_second", type=int, default=10, help="Number of messages per burst") + parser.add_argument("--seconds", type=int, default=100, help="Number of seconds to benchmark for") + parser.add_argument("--threads", type=int, default=1, help="Number of concurrent threads") + parser.add_argument("--experiment", type=str, default="baseline", help="Experiment type") + parser.add_argument("--branch_chance", type=float, default=0.5, help="Brance chance") + parser.add_argument("--no_init", action="store_true", help="Don't populate") + args = parser.parse_args() + + rps_per_thread = int(args.requests_per_second / args.threads) + sleep_time = 0.95 / rps_per_thread + + EXPERIMENT = args.experiment + + print(f"Experiment [{EXPERIMENT}]") + print(f"Starting with args:\n{args}") + print(f"Actual requests per second is {int(rps_per_thread * args.threads)} (due to rounding)") + + init_cascade_from_module("deathstar_movie_review.entities.entities") + + init_client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) + + # for prefetch experiment + df_baseline = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch")] + df_parallel, _ = parallelize_until_if(df_baseline) + df_parallel.name = "upload_movie_prefetch_parallel" + cascade.core.dataflows[DataflowRef("MovieId", 
"upload_movie_prefetch_parallel")] = df_parallel + + for df in cascade.core.dataflows.values(): + print(df.to_dot()) + for block in df.blocks.values(): + print(block.function_string) + + if not args.no_init: + print("Populating...") + populate_compose_review(init_client) + populate_movie(init_client) + init_client.producer.flush() + wait_for_futures(init_client) + print("Done.") + time.sleep(1) + + print("Starting benchmark") + prefetch = args.experiment == "prefetch" + + func_args = [(t, rps_per_thread, sleep_time, args.seconds, prefetch, args.branch_chance) for t in range(args.threads)] + with Pool(args.threads) as p: + results = p.map(benchmark_runner, func_args) + + results = {k: v for d in results for k, v in d.items()} + + print("last result:") + print(list(results.values())[-1]) + t = len(results) + r = 0 + for result in results.values(): + if result["ret"] is not None: + r += 1 + + print(f"{r}/{t} results recieved.") + print(f"Writing results to {args.output}") + + count = Counter([r["ret"].result for r in results.values()]) + print(count) + + df = write_dict_to_pkl(results, args.output) + + flink_time = df['flink_time'].median() + latency = df['latency'].median() + flink_prct = float(flink_time) * 100 / latency + print(f"Median latency : {latency:.2f} ms") + print(f"Median Flink time : {flink_time:.2f} ms ({flink_prct:.2f}%)") + init_client.close() + + df = preprocess(args.output, df) + df.to_pickle(args.output) + + +import re + +def preprocess(name, df, warmup_time_s=3) -> pd.DataFrame: + # Extract parallelism and mps from the name using regex + match = re.search(r'(.+)_p-(\d+)_rps-(\d+)', name) + if match: + experiment = match.group(1) + parallelism = int(match.group(2)) + mps = int(match.group(3)) + else: + raise Exception() + + # Ignore the first warmup_time seconds of events + warmup_events = int(warmup_time_s * mps) + df = df.iloc[warmup_events:] + + # Calculate the additional Kafka overhead + # df['kafka_overhead'] = df['latency'] - df['flink_time'] + + # Extract median values from df + flink_time_median = df['flink_time'].median() + latency_median = df['latency'].median() + flink_time_99_percentile = df['flink_time'].quantile(0.99) + latency_99_percentile = df['latency'].quantile(0.99) + flink_time_95_percentile = df['flink_time'].quantile(0.95) + latency_95_percentile = df['latency'].quantile(0.95) + + data = { + 'experiment': experiment, + 'parallelism': parallelism, + 'mps': mps, + 'flink_time_median': flink_time_median, + 'latency_median': latency_median, + 'latency_99_percentile': latency_99_percentile, + 'latency_95_percentile': latency_95_percentile, + 'flink_time_99_percentile': flink_time_99_percentile, + 'flink_time_95_percentile': flink_time_95_percentile + } + data = {k:[v] for k,v in data.items()} + return pd.DataFrame(data) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/deathstar_movie_review/test_movie_review_demo.py b/deathstar_movie_review/test_movie_review_demo.py index 27cc6f7..f11b59b 100644 --- a/deathstar_movie_review/test_movie_review_demo.py +++ b/deathstar_movie_review/test_movie_review_demo.py @@ -1,40 +1,82 @@ -from cascade.dataflow.dataflow import Event, InitClass, InvokeMethod, OpNode -from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination +import logging +import sys +import os + + + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) + +from cascade.runtime.flink_runtime import FlinkClientSync +from cascade.dataflow.dataflow import DataflowRef +from 
cascade.dataflow.optimization.parallelization import parallelize_until_if +from cascade.dataflow.operator import StatefulOperator, StatelessOperator from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime -from deathstar_movie_review.entities.compose_review import ComposeReview, compose_review_op -from deathstar_movie_review.entities.user import User, user_op -from deathstar_movie_review.entities.movie import MovieId, movie_id_op, movie_info_op, plot_op -from deathstar_movie_review.entities.frontend import frontend_op, text_op, unique_id_op, frontend_df_serial +import cascade +import pytest +import tests.integration.flink.utils as utils + +def init_python_runtime() -> tuple[PythonRuntime, PythonClientSync]: + runtime = PythonRuntime() + for op in cascade.core.operators.values(): + if isinstance(op, StatefulOperator): + runtime.add_operator(op) + elif isinstance(op, StatelessOperator): + runtime.add_stateless_operator(op) + + runtime.run() + return runtime, PythonClientSync(runtime) def test_deathstar_movie_demo_python(): print("starting") - runtime = PythonRuntime() - - # make sure we're running the serial version - prev_df = frontend_op.dataflow - frontend_op.dataflow = frontend_df_serial() + cascade.core.clear() + exec(f'import deathstar_movie_review.entities.entities') + cascade.core.init() + + compose_df = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + df_parallel, _ = parallelize_until_if(compose_df) + df_parallel.name = "compose_parallel" + cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 4 + + runtime, client = init_python_runtime() + deathstar_movie_demo(client) + +@pytest.mark.integration +def test_deathstar_movie_demo_flink(): + print("starting") + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + utils.create_topics() - print(frontend_op.dataflow.to_dot()) - dead_node_elimination([], [frontend_op]) - print(frontend_op.dataflow.to_dot()) + runtime = utils.init_flink_runtime("deathstar_movie_review.entities.entities") + compose_df = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + df_parallel, _ = parallelize_until_if(compose_df) + df_parallel.name = "compose_parallel" + cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] = df_parallel + runtime.add_dataflow(df_parallel) + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 4 - runtime.add_operator(compose_review_op) - runtime.add_operator(user_op) - runtime.add_operator(movie_info_op) - runtime.add_operator(movie_id_op) - runtime.add_operator(plot_op) - runtime.add_stateless_operator(frontend_op) - runtime.add_stateless_operator(unique_id_op) - runtime.add_stateless_operator(text_op) - runtime.run() - client = PythonClientSync(runtime) + client = FlinkClientSync() + runtime.run(run_async=True) - init_user = OpNode(User, InitClass(), read_key_from="username") - username = "username_1" + try: + deathstar_movie_demo(client) + finally: + client.close() + +def deathstar_movie_demo(client): + compose_df = cascade.core.dataflows[DataflowRef("Frontend", "compose")] + + for df in cascade.core.dataflows.values(): + print(df.to_dot()) + + username = "myUsername" user_data = { "userId": "user1", "FirstName": "firstname", @@ -43,27 +85,27 @@ def test_deathstar_movie_demo_python(): "Password": "****", "Salt": "salt" } + print("testing user create") - event = Event(init_user, {"username": username, "user_data": user_data}, None) - result = client.send(event) - 
assert isinstance(result, User) and result.username == username + + event = cascade.core.dataflows[DataflowRef("User", "__init__")].generate_event({"username": username, "user_data": user_data}, username) + result = client.send(event, block=True) + print(result) + assert result['username'] == username print("testing compose review") - req_id = 1 + req_id = "4242" movie_title = "Cars 2" movie_id = 1 # make the review - init_compose_review = OpNode(ComposeReview, InitClass(), read_key_from="req_id") - event = Event(init_compose_review, {"req_id": req_id}, None) - result = client.send(event) + event = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")].generate_event({"req_id": req_id}, req_id) + result = client.send(event, block=True) print("review made") - - # make the movie - init_movie = OpNode(MovieId, InitClass(), read_key_from="title") - event = Event(init_movie, {"title": movie_title, "movie_id": movie_id}, None) - result = client.send(event) + # # make the movie + event = cascade.core.dataflows[DataflowRef("MovieId", "__init__")].generate_event({"title": movie_title, "movie_id": movie_id}, movie_title) + result = client.send(event, block=True) print("movie made") # compose the review @@ -71,39 +113,123 @@ def test_deathstar_movie_demo_python(): "review": req_id, "user": username, "title": movie_title, - "rating": 5, + "rating": None, "text": "good movie!" } - event = Event( - frontend_op.dataflow.entry, - review_data, - frontend_op.dataflow) - result = client.send(event) + r_data = {r+"_0": v for r, v in review_data.items()} + + event = compose_df.generate_event(r_data) + result = client.send(event, block=True) print(result) print("review composed") - # read the review - get_review = OpNode(ComposeReview, InvokeMethod("get_data"), read_key_from="req_id") - event = Event( - get_review, - {"req_id": req_id}, - None - ) - result = client.send(event) + event = cascade.core.dataflows[DataflowRef("ComposeReview", "get_data")].generate_event({"req_id": req_id}, req_id) + result = client.send(event, block=True) + print(result) + expected = { "userId": user_data["userId"], "movieId": movie_id, "text": review_data["text"] } - print(result, expected) + assert "review_id" in result del result["review_id"] # randomly generated assert result == expected - print("Success!") - # put the df back - frontend_op.dataflow = prev_df - \ No newline at end of file + + ### PARALLEL ### + df_parallel = cascade.core.dataflows[DataflowRef("Frontend", "compose_parallel")] + + + # make the review + new_req_id = "43" + event = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")].generate_event({"req_id": new_req_id}, new_req_id) + result = client.send(event, block=True) + print("review made (parallel)") + + # compose the review + review_data = { + "review": req_id, + "user": username, + "title": movie_title, + "rating": None, + "text": "bad movie!" + } + + r_data = {r+"_0": v for r, v in review_data.items()} + + event = df_parallel.generate_event(r_data) + result = client.send(event, block=True) + print(result) + print("review composed (parallel)") + + event = cascade.core.dataflows[DataflowRef("ComposeReview", "get_data")].generate_event({"req_id": req_id}, req_id) + result = client.send(event, block=True) + print(result) + + expected = { + "userId": user_data["userId"], + "movieId": movie_id, + "text": "bad movie!" 
+ } + + assert "review_id" in result + del result["review_id"] # randomly generated + assert result == expected + + +@pytest.mark.integration +def test_deathstar_movie_demo_prefetch_flink(): + print("starting") + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + + + runtime = utils.init_flink_runtime("deathstar_movie_review.entities.entities") + + # for prefetch experiment + df_baseline = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch")] + df_parallel, _ = parallelize_until_if(df_baseline) + df_parallel.name = "upload_movie_prefetch_parallel" + cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")] = df_parallel + + runtime.add_dataflow(df_parallel) + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 2 + + + client = FlinkClientSync() + runtime.run(run_async=True) + + try: + deathstar_prefetch(client) + finally: + client.close() + +def deathstar_prefetch(client): + event = cascade.core.dataflows[DataflowRef("MovieId", "__init__")].generate_event({"title": "cars", "movie_id": 1}, "cars") + result = client.send(event, block=True) + print("movie made") + + + # make the review + event = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")].generate_event({"req_id": "100"}, "100") + result = client.send(event, block=True) + print("review made") + + + event = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie")].generate_event({"review_0": "100", "rating_0": 3}, "cars") + result = client.send(event, block=True) + print("movie uploaded") + + event = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")].generate_event({"review_0": "100", "rating_0": 3}, "cars") + result = client.send(event, block=True) + print("movie uploaded w/ prefetch") + print(result) \ No newline at end of file diff --git a/docker-compose.kafka.yml b/docker-compose.kafka.yml new file mode 100644 index 0000000..33335f5 --- /dev/null +++ b/docker-compose.kafka.yml @@ -0,0 +1,48 @@ +version: '3.1' + +# https://docs.docker.com/guides/kafka/ + +services: + kafka: + image: apache/kafka-native + ports: + - "9092:9092" # for HOST connections + expose: + - "9093" # for DOCKER connections + environment: + # Configure listeners for both docker and host communication + KAFKA_LISTENERS: CONTROLLER://localhost:9091,HOST://0.0.0.0:9092,DOCKER://0.0.0.0:9093 + KAFKA_ADVERTISED_LISTENERS: HOST://localhost:9092,DOCKER://kafka:9093 + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,DOCKER:PLAINTEXT,HOST:PLAINTEXT + + # Settings required for KRaft mode + KAFKA_NODE_ID: 1 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@localhost:9091 + KAFKA_INTER_BROKER_LISTENER_NAME: DOCKER + + # Required for a single node cluster + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + + # Low Latency Tuning + KAFKA_NUM_NETWORK_THREADS: 8 + KAFKA_NUM_IO_THREADS: 16 + KAFKA_LOG_FLUSH_INTERVAL_MESSAGES: 1000 + KAFKA_LOG_FLUSH_INTERVAL_MS: 1000 + KAFKA_SOCKET_SEND_BUFFER_BYTES: 1024000 + KAFKA_SOCKET_RECEIVE_BUFFER_BYTES: 102400 + + # Change timestamp type for benchmark measurements + KAFKA_LOG_MESSAGE_TIMESTAMP_TYPE: LogAppendTime + + kafka-ui: + image: ghcr.io/kafbat/kafka-ui:latest + ports: + - 8080:8080 + environment: + DYNAMIC_CONFIG_ENABLED: "true" + KAFKA_CLUSTERS_0_NAME: local + KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:9093 + depends_on: + - kafka \ No newline at end of file diff --git a/docker-compose.monitoring.yml 
b/docker-compose.monitoring.yml new file mode 100644 index 0000000..3566235 --- /dev/null +++ b/docker-compose.monitoring.yml @@ -0,0 +1,114 @@ +version: '3.1' + +# https://docs.docker.com/guides/kafka/ + +services: + kafka: + image: apache/kafka-native + ports: + - "9092:9092" # for HOST connections + expose: + - "9093" # for DOCKER connections + environment: + # Configure listeners for both docker and host communication + KAFKA_LISTENERS: CONTROLLER://localhost:9091,HOST://0.0.0.0:9092,DOCKER://0.0.0.0:9093 + KAFKA_ADVERTISED_LISTENERS: HOST://localhost:9092,DOCKER://kafka:9093 + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,DOCKER:PLAINTEXT,HOST:PLAINTEXT + + # Settings required for KRaft mode + KAFKA_NODE_ID: 1 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@localhost:9091 + KAFKA_INTER_BROKER_LISTENER_NAME: DOCKER + + # Required for a single node cluster + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + + # Low Latency Tuning + KAFKA_NUM_NETWORK_THREADS: 16 + KAFKA_NUM_IO_THREADS: 32 + KAFKA_LOG_FLUSH_INTERVAL_MESSAGES: 1000 + KAFKA_LOG_FLUSH_INTERVAL_MS: 1000 + KAFKA_SOCKET_SEND_BUFFER_BYTES: 1024000 + KAFKA_SOCKET_RECEIVE_BUFFER_BYTES: 102400 + + # Change timestamp type for benchmark measurements + KAFKA_LOG_MESSAGE_TIMESTAMP_TYPE: LogAppendTime + + kafka-ui: + image: ghcr.io/kafbat/kafka-ui:latest + ports: + - 8080:8080 + environment: + DYNAMIC_CONFIG_ENABLED: "true" + KAFKA_CLUSTERS_0_NAME: local + KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:9093 + depends_on: + - kafka + + # https://nightlies.apache.org/flink/flink-docs-release-1.20/docs/deployment/resource-providers/standalone/docker/#flink-with-docker-compose + + jobmanager: + build: + context: . + dockerfile: Dockerfile.pyflink + ports: + - "8081:8081" + expose: + - "9250" # Metrics port + command: jobmanager + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter + metrics.reporter.prom.port: 9250 + metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory + + taskmanager: + build: + context: . 
+ dockerfile: Dockerfile.pyflink + expose: + - "9250" # Metrics port + depends_on: + - jobmanager + command: taskmanager + scale: 1 + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} + metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter + metrics.reporter.prom.port: 9250 + metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory + + + # Monitoring stack + prometheus: + image: prom/prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + depends_on: + - jobmanager + - taskmanager + + grafana: + image: grafana/grafana + ports: + - "3000:3000" + volumes: + - grafana-storage:/var/lib/grafana + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + depends_on: + - prometheus + +volumes: + grafana-storage: \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 22b5bb8..bac450e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,13 +20,19 @@ services: KAFKA_PROCESS_ROLES: broker,controller KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER KAFKA_CONTROLLER_QUORUM_VOTERS: 1@localhost:9091 - - # Listener to use for broker-to-broker communication KAFKA_INTER_BROKER_LISTENER_NAME: DOCKER # Required for a single node cluster KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + # Low Latency Tuning + KAFKA_NUM_NETWORK_THREADS: 8 + KAFKA_NUM_IO_THREADS: 16 + KAFKA_LOG_FLUSH_INTERVAL_MESSAGES: 1000 + KAFKA_LOG_FLUSH_INTERVAL_MS: 1000 + KAFKA_SOCKET_SEND_BUFFER_BYTES: 1024000 + KAFKA_SOCKET_RECEIVE_BUFFER_BYTES: 102400 + # Change timestamp type for benchmark measurements KAFKA_LOG_MESSAGE_TIMESTAMP_TYPE: LogAppendTime @@ -53,7 +59,7 @@ services: environment: - | FLINK_PROPERTIES= - jobmanager.rpc.address: jobmanager + jobmanager.rpc.address: jobmanager taskmanager: build: @@ -67,4 +73,12 @@ services: - | FLINK_PROPERTIES= jobmanager.rpc.address: jobmanager - taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} \ No newline at end of file + taskmanager.numberOfTaskSlots: ${TASK_SLOTS:-16} + + deploy: + resources: + limits: + cpus: "4" + memory: "8G" + mem_limit: 8G + cpus: "4" \ No newline at end of file diff --git a/experiments/dynamic_prefetching/entities.py b/experiments/dynamic_prefetching/entities.py new file mode 100644 index 0000000..2bd4e73 --- /dev/null +++ b/experiments/dynamic_prefetching/entities.py @@ -0,0 +1,34 @@ +from cascade import cascade +import random +import time + +@cascade(globals={'time': time}) +class Oracle(): + @staticmethod + def get() -> int: + time.sleep(0.01) + return 42 + +@cascade(globals={'random': random}) +class Prefetcher: + @staticmethod + def prefetch(branch_chance: float): + prefetched_value = Oracle.get() + and_also = Oracle.get() + rand = random.random() + cond = rand < branch_chance + if cond: + return prefetched_value + else: + return -42 + + @staticmethod + def baseline(branch_chance: float): + and_also = Oracle.get() + cond = random.random() < branch_chance + if cond: + value = Oracle.get() + return value + else: + return -42 + diff --git a/experiments/dynamic_prefetching/run_experiments.py b/experiments/dynamic_prefetching/run_experiments.py new file mode 100644 index 0000000..3e64229 --- /dev/null +++ b/experiments/dynamic_prefetching/run_experiments.py @@ -0,0 +1,58 @@ +import subprocess +import time + + +# Define experiment parameters as a list of dictionaries +experiments = [ + 
{"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "baseline", "chance": 0.9}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "prefetch", "chance": 0.9}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "baseline", "chance": 0.5}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "prefetch", "chance": 0.5}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "baseline", "chance": 0.1}}, + {"parallelism": 4, "benchmark_args": {"requests_per_second": 1000, "seconds": 30, "threads": 20, "experiment": "prefetch", "chance": 0.1}}, +] + + + + +print("Tearing down docker containers") +subprocess.run(["docker", "compose", "down"], check=False) + +for exp in experiments: + print(f"Starting experiment {exp}") + + # Start docker compose + subprocess.run(["docker", "compose", "up", "-d", "--scale", f"taskmanager={exp['parallelism']}", "--force-recreate"], check=True, env={ + "TASK_SLOTS": "1" + }) + + time.sleep(10) + + # Run Flink job + + flink_cmd = [ + "flink", "run", "--pyFiles", "/home/lvanmol/cascade/src,/home/lvanmol/cascade", + "--pyModule", "experiments.dynamic_prefetching.submit_job", "-d", "-p", str(exp['parallelism']) + ] + subprocess.run(flink_cmd, check=True) + + # Start benchmark + # filename = f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['requests_per_second']}.pkl" + benchmark_cmd = [ + "python", "-u", "-m", "experiments.dynamic_prefetching.run_prefetcher", + ] + + for arg, val in exp['benchmark_args'].items(): + benchmark_cmd.append(f"--{arg}") + benchmark_cmd.append(str(val)) + subprocess.run(benchmark_cmd, check=True) + + # Sleep for experiment duration + # print(f"Sleeping for {exp['sleep']} seconds...") + # time.sleep(exp['sleep']) + + # Stop docker compose + subprocess.run(["docker", "compose", "down"], check=False) + + print(f"Experiment completed.") + diff --git a/experiments/dynamic_prefetching/run_prefetcher.py b/experiments/dynamic_prefetching/run_prefetcher.py new file mode 100644 index 0000000..91adad0 --- /dev/null +++ b/experiments/dynamic_prefetching/run_prefetcher.py @@ -0,0 +1,176 @@ +import argparse +import logging +from multiprocessing import Pool +import sys +import os +from typing import Counter, Literal +import pandas as pd + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../src"))) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + +import cascade +from cascade.dataflow.optimization.parallelization import parallelize +from cascade.runtime.flink_runtime import FlinkClientSync +from cascade.dataflow.dataflow import DataFlow, DataflowRef, EventResult +from tests.integration.flink.utils import create_topics, init_cascade_from_module, init_flink_runtime, wait_for_event_id +from timeit import default_timer as timer + + +KAFKA_BROKER = "localhost:9092" +KAFKA_FLINK_BROKER = "kafka:9093" # If running a flink cluster and kafka inside docker, the broker url might be different + +IN_TOPIC = "prefetcher-in" +OUT_TOPIC = "prefetcher-out" +INTERNAL_TOPIC = "prefetcher-internal" + +def main(): + init_cascade_from_module("experiments.dynamic_prefetching.entities") + + + + # logger = logging.getLogger("cascade") + # logger.setLevel("DEBUG") + # runtime = 
init_flink_runtime("experiments.dynamic_prefetching.entities", parallelism=4) + + print(cascade.core.dataflows.keys()) + + baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline")] + prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch")] + + + pre_par = parallelize(prefetch) + cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] = pre_par + + base_par = parallelize(baseline) + cascade.core.dataflows[DataflowRef("Prefetcher", "baseline_parallel")] = base_par + + print(base_par.to_dot()) + print(pre_par.to_dot()) + + run_test() + + +import time +def wait_for_futures(client: FlinkClientSync): + print("waiting") + done = False + while not done: + done = True + for event_id, fut in client._futures.items(): + result = fut["ret"] + if result is None: + done = False + time.sleep(0.5) + break + futures = client._futures + return futures + +def generate_event(exp: Literal["baseline", "prefetch"], chance: float): + baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline_parallel")] + prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] + df = prefetch if exp == "prefetch" else baseline + + return df.generate_event({"branch_chance_0": chance}) + +def runner(args): + chance, bursts, requests_per_second, exp = args + client = FlinkClientSync(IN_TOPIC, OUT_TOPIC) + sleep_time = 0.95 / requests_per_second + + start = timer() + for b in range(bursts): + sec_start = timer() + + # send burst of messages + for i in range(requests_per_second): + + # sleep sometimes between messages + # if i % (messages_per_burst // sleeps_per_burst) == 0: + time.sleep(sleep_time) + event = generate_event(exp, chance) + client.send(event) + + client.flush() + sec_end = timer() + + # wait out the second + lps = sec_end - sec_start + if lps < 1: + time.sleep(1 - lps) + + end = timer() + avg_send_latency = (end - start) / bursts + print(f'Average send latency per burst for generator was: {avg_send_latency}') + if avg_send_latency > 1.1: + print(f'This is higher than expected (1). 
Maybe increase the number of threads?') + futures = wait_for_futures(client) + client.close() + return futures + +def run_test(): + logger = logging.getLogger("cascade") + logger.setLevel("INFO") + + + + parser = argparse.ArgumentParser(description="Run the benchmark and save results.") + parser.add_argument("--requests_per_second", type=int, default=10, help="Number of messages per burst") + parser.add_argument("--seconds", type=int, default=100, help="Number of seconds to benchmark for") + parser.add_argument("--threads", type=int, default=1, help="Number of concurrent threads") + parser.add_argument("--chance", type=float, default=0.5, help="Chance") + parser.add_argument("--experiment", type=str, default="baseline", help="Experiment type") + args = parser.parse_args() + + assert args.experiment in ["baseline", "prefetch"] + rps_per_thread = int(args.requests_per_second / args.threads) + print(f"{args.chance} - {args.experiment}: {args.requests_per_second} rps for {args.seconds}s") + print(f"Actual requests per second is {int(rps_per_thread * args.threads)} (due to rounding)") + + + func_args = [(args.chance, args.seconds,rps_per_thread,args.experiment)] + with Pool(args.threads) as p: + results = p.map(runner, func_args) + + results = {k: v for d in results for k, v in d.items()} + + count = Counter([r["ret"].result for r in results.values()]) + print(count) + df = to_pandas(results) + df.to_csv(f"{args.experiment}_{args.chance}_{args.requests_per_second}.csv") + + + +def to_pandas(futures_dict): + # Prepare the data for the DataFrame + data = [] + for event_id, event_data in futures_dict.items(): + ret: EventResult = event_data.get("ret") + row = { + "event_id": event_id, + "result": ret.result if ret else None, + "flink_time": ret.metadata["flink_time"] if ret else None, + "loops": ret.metadata["loops"] if ret else None, + "latency": event_data["ret_t"][1] - event_data["sent_t"][1] if ret else None + } + data.append(row) + + # Create a DataFrame and save it as a pickle file + df = pd.DataFrame(data) + + # Multiply flink_time by 1000 to convert to milliseconds + df['flink_time'] = df['flink_time'] * 1000 + flink_time = df['flink_time'].median() + latency = df['latency'].median() + flink_prct = float(flink_time) * 100 / latency + print(f"Median latency : {latency:.2f} ms") + print(f"Median Flink time : {flink_time:.2f} ms ({flink_prct:.2f}%)") + + latency = df['latency'].mean() + print(f"Mean latency : {latency:.2f} ms") + + + return df + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/experiments/dynamic_prefetching/submit_job.py b/experiments/dynamic_prefetching/submit_job.py new file mode 100644 index 0000000..44a5982 --- /dev/null +++ b/experiments/dynamic_prefetching/submit_job.py @@ -0,0 +1,39 @@ +import cascade +from cascade.dataflow.dataflow import DataflowRef +from cascade.dataflow.optimization.parallelization import parallelize_until_if +from experiments.dynamic_prefetching.run_prefetcher import gen_parallel +from tests.integration.flink.utils import create_topics, init_flink_runtime + + +KAFKA_BROKER = "localhost:9092" +KAFKA_FLINK_BROKER = "kafka:9093" # If running a flink cluster and kafka inside docker, the broker url might be different + +IN_TOPIC = "prefetcher-in" +OUT_TOPIC = "prefetcher-out" +INTERNAL_TOPIC = "prefetcher-internal" + + + +def main(): + create_topics(IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) + + runtime = init_flink_runtime("experiments.dynamic_prefetching.entities", IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC, 
kafka_broker=KAFKA_FLINK_BROKER,bundle_time=5, bundle_size=10, thread_mode=True, parallelism=None) + + + print(cascade.core.dataflows.keys()) + + baseline = cascade.core.dataflows[DataflowRef("Prefetcher", "baseline")] + prefetch = cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch")] + + pre_par = gen_parallel(prefetch) + cascade.core.dataflows[DataflowRef("Prefetcher", "prefetch_parallel")] = pre_par + runtime.add_dataflow(pre_par) + + base_par = gen_parallel(baseline) + cascade.core.dataflows[DataflowRef("Prefetcher", "baseline_parallel")] = base_par + runtime.add_dataflow(base_par) + + runtime.run() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/notebooks/dataflow_example.ipynb b/notebooks/dataflow_example.ipynb index 099343e..1fd2525 100644 --- a/notebooks/dataflow_example.ipynb +++ b/notebooks/dataflow_example.ipynb @@ -42,7 +42,7 @@ "from cascade.frontend.dataflow_analysis.class_list_builder import ClassListBuilder\n", "from cascade.frontend.dataflow_analysis.class_wrapper import ClassWrapper\n", "from cascade.frontend.util import setup_cfg, plot_graph_with_color, plot_dataflow_graph\n", - "from cascade.frontend.intermediate_representation import StatementDataflowGraph, DataflowGraph" + "from cascade.frontend.cfg import ControlFlowGraph, DataflowGraph" ] }, { @@ -97,7 +97,7 @@ "cfg = setup_cfg(example)\n", "class_list: ClassList = ClassListBuilder.build(cfg)\n", "entity_1: ClassWrapper = class_list.get_class_by_name('User')\n", - "dataflow_graph: StatementDataflowGraph = entity_1.methods['checkout']\n", + "dataflow_graph: ControlFlowGraph = entity_1.methods['checkout']\n", "G = dataflow_graph.graph\n", "grouper: GroupDataflowNodes = GroupDataflowNodes(G)\n", "groups = grouper.group_nodes()\n", @@ -125,7 +125,7 @@ "\n", "class_list: ClassList = ClassListBuilder.build(cfg)\n", "entity_1: ClassWrapper = class_list.get_class_by_name('User')\n", - "dataflow_graph: StatementDataflowGraph = entity_1.methods['checkout']\n", + "dataflow_graph: ControlFlowGraph = entity_1.methods['checkout']\n", "G = dataflow_graph.graph\n", "grouper: GroupDataflowNodes = GroupDataflowNodes(G)\n", "groups = grouper.group_nodes()\n", @@ -313,13 +313,13 @@ }, { "cell_type": "code", - "execution_count": 389, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from textwrap import indent\n", - "from cascade.frontend.generator.generate_split_functions import GenerateSplittFunctions\n", - "from cascade.frontend.intermediate_representation import Block\n", + "from cascade.frontend.generator.generate_split_functions import GenerateSplitFunctions\n", + "from cascade.frontend.cfg import Block\n", "\n", "compiled_functions, df = GenerateSplittFunctions.generate_split_function_string(block_level_dataflow_graph)" ] @@ -440,7 +440,7 @@ " cfg = setup_cfg(example)\n", " class_list: ClassList = ClassListBuilder.build(cfg)\n", " entity_1: ClassWrapper = class_list.get_class_by_name('User')\n", - " dataflow_graph: StatementDataflowGraph = entity_1.methods['buy_item']\n", + " dataflow_graph: ControlFlowGraph = entity_1.methods['buy_item']\n", " return dataflow_graph" ] }, @@ -485,7 +485,7 @@ }, { "cell_type": "code", - "execution_count": 456, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -504,7 +504,7 @@ } ], "source": [ - "split_functions = GenerateSplittFunctions.generate(dataflow_graph)\n", + "split_functions = GenerateSplitFunctions.generate(dataflow_graph)\n", "\n", "\n", "for i, split in enumerate(split_functions):\n", @@ -617,7 +617,7 @@ }, { 
"cell_type": "code", - "execution_count": 452, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -639,7 +639,7 @@ } ], "source": [ - "split_functions = GenerateSplittFunctions.generate(dataflow_graph)\n", + "split_functions = GenerateSplitFunctions.generate(dataflow_graph)\n", "\n", "\n", "for i, split in enumerate(split_functions):\n", diff --git a/prometheus.yml b/prometheus.yml new file mode 100644 index 0000000..6503113 --- /dev/null +++ b/prometheus.yml @@ -0,0 +1,13 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'flink-jobmanager' + static_configs: + - targets: ['jobmanager:9250'] + + - job_name: 'flink-taskmanagers' + dns_sd_configs: + - names: ['taskmanager'] + type: A + port: 9250 \ No newline at end of file diff --git a/run_experiments.py b/run_experiments.py index 3cf327e..58934c0 100755 --- a/run_experiments.py +++ b/run_experiments.py @@ -5,43 +5,55 @@ args = { "messages_per_burst": 10, "sleeps_per_burst": 10, - "sleep_time": 0.08, + "sleep_time": 0.09, "seconds_per_burst": 1, "bursts": 100 } +mps_1 = { + **args, + "messages_per_burst": 1, + "sleeps_per_burst": 1, + "sleep_time": 0.9, +} + mps_20 = { **args, "messages_per_burst": 20, "sleeps_per_burst": 20, - "sleep_time": 0.08/2, + "sleep_time": 0.09/2, +} + +mps_30 = { + **args, + "messages_per_burst": 30, + "sleeps_per_burst": 30, + "sleep_time": 0.09/3, } mps_50 = { **args, "messages_per_burst": 50, "sleeps_per_burst": 50, - "sleep_time": 0.08/5, + "sleep_time": 0.09/5, } + # Define experiment parameters as a list of dictionaries experiments = [ - {"parallelism": 16, "benchmark_args": {**args}}, - {"parallelism": 16, "benchmark_args": {**mps_20}}, - {"parallelism": 16, "benchmark_args": {**mps_50}}, - - {"parallelism": 8, "benchmark_args": {**args}}, - {"parallelism": 8, "benchmark_args": {**mps_20}}, - - {"parallelism": 4, "benchmark_args": {**mps_20}}, - {"parallelism": 4, "benchmark_args": {**args}}, - - {"parallelism": 2, "benchmark_args": {**args}}, - {"parallelism": 2, "benchmark_args": {**mps_20}}, - - {"parallelism": 1, "benchmark_args": {**args}}, - {"parallelism": 1, "benchmark_args": {**mps_20}}, + # {"parallelism": 16, "benchmark_args": {**args}}, + # {"parallelism": 8, "benchmark_args": {**args}}, + # {"parallelism": 4, "benchmark_args": {**args}}, + # {"parallelism": 2, "benchmark_args": {**args}}, + # {"parallelism": 1, "benchmark_args": {**args}}, + + # {"parallelism": 16, "benchmark_args": {**mps_20}}, + # {"parallelism": 8, "benchmark_args": {**mps_20}}, + # {"parallelism": 4, "benchmark_args": {**mps_20}}, + # {"parallelism": 2, "benchmark_args": {**mps_20}}, + # {"parallelism": 1, "benchmark_args": {**mps_20}}, + {"parallelism": 16, "benchmark_args": {**mps_50}}, {"parallelism": 8, "benchmark_args": {**mps_50}}, {"parallelism": 4, "benchmark_args": {**mps_50}}, {"parallelism": 2, "benchmark_args": {**mps_50}}, @@ -54,7 +66,7 @@ print("Tearing down docker containers") subprocess.run(["docker", "compose", "down"], check=True) -for e in ["parallel", "base", "piplined"]: +for e in ["pipelined", "parallel", "baseline"]: for exp in experiments: print(f"Starting experiment {exp}") @@ -74,9 +86,9 @@ subprocess.run(flink_cmd, check=True, env=env) # Start benchmark - filename = f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['messages_per_burst']}.plk" + filename = f"{e}_p-{exp['parallelism']}_mps-{exp['benchmark_args']['messages_per_burst']}.pkl" benchmark_cmd = [ - "python", "-u", "-m", "deathstar_movie_review.start_benchmark", "--output", filename + "python", "-u", "-m", 
"deathstar_movie_review.start_benchmark", "--output", filename, "--experiment", e ] for arg, val in exp['benchmark_args'].items(): diff --git a/run_experiments_gil_workaround copy.py b/run_experiments_gil_workaround copy.py new file mode 100755 index 0000000..4ab412f --- /dev/null +++ b/run_experiments_gil_workaround copy.py @@ -0,0 +1,74 @@ +import os +import subprocess +import time + +def rps(num, producer_threads=1): + return { + "threads": producer_threads, + "requests_per_second": num, + "seconds": 100, + } + + +# Define experiment parameters as a list of dictionaries +experiments = [ + {"parallelism": 24, "benchmark_args": {**rps(500, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(1000, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(1500, producer_threads=20)}}, + {"parallelism": 24, "benchmark_args": {**rps(2000, producer_threads=20)}}, + {"parallelism": 24, "benchmark_args": {**rps(2500, producer_threads=20)}}, + {"parallelism": 24, "benchmark_args": {**rps(250, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(750, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(1250, producer_threads=10)}}, + {"parallelism": 24, "benchmark_args": {**rps(1750, producer_threads=25)}}, + {"parallelism": 24, "benchmark_args": {**rps(2250, producer_threads=25)}}, +] + + + + +print("Tearing down docker containers") +subprocess.run(["docker", "compose", "down"], check=False) + +for e in ["baseline", "parallel"]: + for exp in experiments: + print(f"Starting experiment {exp}") + + # Start docker compose + subprocess.run(["docker", "compose", "up", "-d", "--scale", f"taskmanager={exp['parallelism']}", "--force-recreate"], check=True, env={ + "TASK_SLOTS": "1" + }) + + time.sleep(10) + + # Run Flink job + + flink_cmd = [ + "flink", "run", "--pyFiles", "/home/lvanmol/cascade/src,/home/lvanmol/cascade", + "--pyModule", "deathstar_movie_review.demo", "-d", "-p", str(exp['parallelism']) + ] + env = os.environ + env["EXPERIMENT"] = e + subprocess.run(flink_cmd, check=True, env=env) + + # Start benchmark + filename = f"{e}_p-{exp['parallelism']}_rps-{exp['benchmark_args']['requests_per_second']}.pkl" + benchmark_cmd = [ + "python", "-u", "-m", "deathstar_movie_review.start_benchmark", "--output", filename, "--experiment", e + ] + + for arg, val in exp['benchmark_args'].items(): + benchmark_cmd.append(f"--{arg}") + benchmark_cmd.append(str(val)) + subprocess.run(benchmark_cmd, check=True) + + # Sleep for experiment duration + # print(f"Sleeping for {exp['sleep']} seconds...") + # time.sleep(exp['sleep']) + + # Stop docker compose + subprocess.run(["docker", "compose", "down"], check=False) + + print(f"Experiment completed.") + +print("All experiments completed.") diff --git a/run_prefetch_exp.py b/run_prefetch_exp.py new file mode 100755 index 0000000..b2879e8 --- /dev/null +++ b/run_prefetch_exp.py @@ -0,0 +1,68 @@ +import os +import subprocess +import time + +def rps(num, branch_chance, producer_threads=1): + return { + "threads": producer_threads, + "requests_per_second": num, + "seconds": 50, + "branch_chance": branch_chance + } + + +# Define experiment parameters as a list of dictionaries +experiments = [ + {"parallelism": 4, "benchmark_args": {**rps(500, 0.1, producer_threads=10)}}, + {"parallelism": 4, "benchmark_args": {**rps(500, 0.5, producer_threads=10)}}, + {"parallelism": 4, "benchmark_args": {**rps(500, 0.9, producer_threads=10)}}, +] + + + + +print("Tearing down docker containers") +subprocess.run(["docker", 
"compose", "down"], check=False) + +for e in ["baseline", "prefetch"]: + for exp in experiments: + print(f"Starting experiment {exp}") + + # Start docker compose + subprocess.run(["docker", "compose", "up", "-d", "--scale", f"taskmanager={exp['parallelism']}", "--force-recreate"], check=True, env={ + "TASK_SLOTS": "1" + }) + + time.sleep(10) + + # Run Flink job + + flink_cmd = [ + "flink", "run", "--pyFiles", "/home/lvanmol/cascade/src,/home/lvanmol/cascade", + "--pyModule", "deathstar_movie_review.demo", "-d", "-p", str(exp['parallelism']) + ] + env = os.environ + env["EXPERIMENT"] = e + subprocess.run(flink_cmd, check=True, env=env) + + # Start benchmark + filename = f"{e}_p-{exp['parallelism']}_rps-{exp['benchmark_args']['requests_per_second']}_chance-{exp['benchmark_args']['branch_chance']}.pkl" + benchmark_cmd = [ + "python", "-u", "-m", "deathstar_movie_review.start_prefetch_experiment", "--output", filename, "--experiment", e + ] + + for arg, val in exp['benchmark_args'].items(): + benchmark_cmd.append(f"--{arg}") + benchmark_cmd.append(str(val)) + subprocess.run(benchmark_cmd, check=True) + + # Sleep for experiment duration + # print(f"Sleeping for {exp['sleep']} seconds...") + # time.sleep(exp['sleep']) + + # Stop docker compose + subprocess.run(["docker", "compose", "down"], check=False) + + print(f"Experiment completed.") + +print("All experiments completed.") diff --git a/src/cascade/core.py b/src/cascade/core.py index ae53c65..9f20e36 100644 --- a/src/cascade/core.py +++ b/src/cascade/core.py @@ -2,84 +2,105 @@ from typing import Dict from klara.core import nodes -from klara.core.tree_rewriter import AstBuilder -from klara.core.cfg import Cfg - +from cascade.dataflow.operator import StatefulOperator, StatelessOperator, Operator +from cascade.preprocessing import setup_cfg from cascade.wrappers import ClassWrapper -from cascade.descriptors import ClassDescriptor, MethodDescriptor -from cascade.frontend.generator.generate_split_functions import GenerateSplittFunctions -from cascade.frontend.generator.generate_dataflow import GenerateDataflow -from cascade.dataflow.dataflow import DataFlow -from cascade.frontend.intermediate_representation import StatementDataflowGraph -from cascade.frontend.generator.build_compiled_method_string import BuildCompiledMethodsString -from cascade.frontend.ast_visitors import ExtractTypeVisitor +from cascade.descriptors import ClassDescriptor +from cascade.frontend.generator.dataflow_builder import DataflowBuilder +from cascade.dataflow.dataflow import CallLocal, DataFlow, DataflowRef, InitClass -def setup_cfg(code: str) -> Cfg: - as_tree = AstBuilder().string_build(code) - cfg = Cfg(as_tree) - cfg.convert_to_ssa() - return cfg, as_tree parse_cache: Dict[str, nodes.Module] = {} registered_classes: list[ClassWrapper] = [] - -def cascade(cls, parse_file=True): - if not isclass(cls): - raise AttributeError(f"Expected a class but got an {cls}.") - - # Parse source. - if parse_file: - class_file_name = getfile(cls) - if class_file_name not in parse_cache: - with open(class_file_name, "r") as file: - to_parse_file = file.read() - # parsed_cls = AstBuilder().string_build(to_parse_file) - parsed_cls, tree = setup_cfg(to_parse_file) - parse_cache[class_file_name] = (parsed_cls, tree) +operators: dict[str, Operator] = {} +dataflows: dict[DataflowRef, DataFlow] = {} + +def cascade(cls=None, *, parse_file=True, globals=None): + + def decorator(cls): + if not isclass(cls): + raise AttributeError(f"Expected a class but got an {cls}.") + + # Parse source. 
+ if parse_file: + class_file_name = getfile(cls) + if class_file_name not in parse_cache: + with open(class_file_name, "r") as file: + to_parse_file = file.read() + # parsed_cls = AstBuilder().string_build(to_parse_file) + parsed_cls, tree = setup_cfg(to_parse_file) + parse_cache[class_file_name] = (parsed_cls, tree) + else: + parsed_cls, tree = parse_cache[class_file_name] else: - parsed_cls, tree = parse_cache[class_file_name] - else: - class_source = getsource(cls) - parsed_cls, tree = setup_cfg(class_source) + class_source = getsource(cls) + parsed_cls, tree = setup_cfg(class_source) - # Create class descripter for class - class_desc: ClassDescriptor = ClassDescriptor.from_module(cls.__name__, tree) - class_wrapper: ClassWrapper = ClassWrapper(cls, class_desc) - registered_classes.append(class_wrapper) + # Create class descripter for class + class_desc: ClassDescriptor = ClassDescriptor.from_module(cls.__name__, tree, globals) + class_wrapper: ClassWrapper = ClassWrapper(cls, class_desc) + registered_classes.append(class_wrapper) + + # Support both @cascade and @cascade(globals={...}) + if cls is None: + return decorator + return decorator(cls) def init(): + # First pass: register operators/classes for cls in registered_classes: - for method in cls.class_desc.methods_dec: - method.build_dataflow() + op_name = cls.class_desc.class_name + if cls.class_desc.is_stateless: + op = StatelessOperator(cls.cls, {}, {}) + else: + op = StatefulOperator(cls.cls, {}, {}) -def get_entity_names() -> str: - """Returns a list with the names of all registered entities""" - return [cls.class_desc.class_name for cls in registered_classes] - + op: Operator = op -def get_compiled_methods() -> str: - """Returns a list with the compiled methods as string""" - compiled_methods: list[str] = [] - entities: list[str] = get_entity_names() + # generate split functions + for method in cls.class_desc.methods_dec: + df_ref = DataflowRef(op_name, method.method_name) + # Add version number manually + args = [f"{str(arg)}_0" for arg in method.method_node.args.args] + # TODO: cleaner solution that checks if the function is stateful or not + if len(args) > 0 and args[0] == "self_0": + args = args[1:] + dataflows[df_ref] = DataFlow(method.method_name, op_name, args) + + operators[op_name] = op + + # Second pass: build dataflows for cls in registered_classes: - cls_desc: ClassDescriptor = cls.class_desc - for method_desc in cls_desc.methods_dec: - if method_desc.method_name == '__init__': - continue - dataflow_graph: StatementDataflowGraph = method_desc.dataflow - instance_type_map: dict[str, str] = ExtractTypeVisitor.extract(method_desc.method_node) - split_functions = GenerateSplittFunctions.generate(dataflow_graph, cls_desc.class_name, entities, instance_type_map) - df: DataFlow = GenerateDataflow.generate(split_functions, instance_type_map) - class_compiled_methods: str = BuildCompiledMethodsString.build(split_functions) - compiled_methods.append(class_compiled_methods) - - return '\n\n'.join(compiled_methods) + op_name = cls.class_desc.class_name + op = operators[op_name] + + # generate split functions + for method in cls.class_desc.methods_dec: + if method.method_name == "__init__": + df = DataFlow("__init__", op_name) + n0 = CallLocal(InitClass()) + df.entry = [n0] + blocks = [] + else: + df = DataflowBuilder(method.method_node, cls.class_desc.globals).build(dataflows, op_name) + + dataflows[df.ref()] = df + op.dataflows[df.ref()] = df + for name, b in df.blocks.items(): + op.methods[name] = b + + +def 
get_operator(op_name: str): + return operators[op_name] + +def get_dataflow(ref: DataflowRef): + return dataflows[ref] def clear(): diff --git a/src/cascade/dataflow/dataflow.py b/src/cascade/dataflow/dataflow.py index bb5704b..1d49674 100644 --- a/src/cascade/dataflow/dataflow.py +++ b/src/cascade/dataflow/dataflow.py @@ -1,15 +1,15 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Any, Callable, List, Optional, Type, Union +import logging +from typing import Any, Iterable, List, Mapping, Optional, Union from typing import TYPE_CHECKING +import uuid + if TYPE_CHECKING: - # Prevent circular imports - from cascade.dataflow.operator import StatelessOperator - + from cascade.frontend.generator.local_block import CompiledLocalBlock + from cascade.dataflow.operator import Operator -class Operator(ABC): - pass @dataclass class InitClass: @@ -24,10 +24,6 @@ class InvokeMethod: def __repr__(self) -> str: return f"{self.__class__.__name__}('{self.method_name}')" -@dataclass -class Filter: - """Filter by this function""" - filter_fn: Callable @dataclass class Node(ABC): @@ -45,146 +41,155 @@ def __post_init__(self): Node._id_counter += 1 @abstractmethod - def propogate(self, event: 'Event', targets: list['Node'], result: Any, **kwargs) -> list['Event']: + def propogate(self, event: 'Event', targets: list['Node'], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> list['Event']: pass @dataclass -class OpNode(Node): - """A node in a `Dataflow` corresponding to a method call of a `StatefulOperator`. - - A `Dataflow` may reference the same entity multiple times. - The `StatefulOperator` that this node belongs to is referenced by `entity`.""" - entity: Type - method_type: Union[InitClass, InvokeMethod, Filter] - read_key_from: str - """Which variable to take as the key for this StatefulOperator""" - - assign_result_to: Optional[str] = field(default=None) - """What variable to assign the result of this node to, if any.""" - is_conditional: bool = field(default=False) - """Whether or not the boolean result of this node dictates the following path.""" - collect_target: Optional['CollectTarget'] = field(default=None) - """Whether the result of this node should go to a CollectNode.""" - - def propogate(self, event: 'Event', targets: List[Node], result: Any) -> list['Event']: - return OpNode.propogate_opnode(self, event, targets, result) - - @staticmethod - def propogate_opnode(node: Union['OpNode', 'StatelessOpNode'], event: 'Event', targets: list[Node], - result: Any) -> list['Event']: - num_targets = 1 if node.is_conditional else len(targets) - - if event.collect_target is not None: - # Assign new collect targets - collect_targets = [ - event.collect_target for i in range(num_targets) - ] - else: - # Keep old collect targets - collect_targets = [node.collect_target for i in range(num_targets)] - - if node.is_conditional: - edges = event.dataflow.nodes[event.target.id].outgoing_edges - true_edges = [edge for edge in edges if edge.if_conditional] - false_edges = [edge for edge in edges if not edge.if_conditional] - if not (len(true_edges) == len(false_edges) == 1): - print(edges) - assert len(true_edges) == len(false_edges) == 1 - target_true = true_edges[0].to_node - target_false = false_edges[0].to_node - - assert len(collect_targets) == 1, "num targets should be 1" - ct = collect_targets[0] - - return [Event( - target_true if result else target_false, - event.variable_map, +class Return(Node): + return_var: str + """The name of the local variable to 
return.""" + + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: + events = [] + for target in targets: + ev = Event( + target, + event.variable_map, event.dataflow, + call_stack=event.call_stack, _id=event._id, - collect_target=ct, - metadata=event.metadata) - ] + metadata=event.metadata, + key=event.key) + + events.append(ev) + return events + +@dataclass +class IfNode(Node): + predicate_var: str + """The name of the local (boolean) variable to use as predicate.""" - else: - return [Event( - target, - event.variable_map, - event.dataflow, - _id=event._id, - collect_target=ct, - metadata=event.metadata) - - for target, ct in zip(targets, collect_targets)] + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: + + if_cond = event.variable_map[self.predicate_var] + targets = [] + for edge in event.target.outgoing_edges: + assert edge.if_conditional is not None + if edge.if_conditional == if_cond: + targets.append(edge.to_node) + + + events = [] + for target in targets: + ev = Event( + target, + event.variable_map, + event.dataflow, + call_stack=event.call_stack, + _id=event._id, + metadata=event.metadata, + key=event.key) + + events.append(ev) + return events - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self.entity.__name__}, {self.method_type})" + def __str__(self) -> str: + return f"IF {self.predicate_var}" @dataclass -class StatelessOpNode(Node): - """A node in a `Dataflow` corresponding to a method call of a `StatelessOperator`. +class DataflowRef: + operator_name: str + dataflow_name: str + + # def get_dataflow(self) -> 'DataFlow': + # try: + # return cascade.core.dataflows[self] + # except KeyError as e: + # raise KeyError(f"DataflowRef {self} not found in cascade.core.dataflows") - A `Dataflow` may reference the same `StatefulOperator` multiple times. 
- The `StatefulOperator` that this node belongs to is referenced by `cls`.""" - operator: 'StatelessOperator' - method_type: InvokeMethod - """Which variable to take as the key for this StatefulOperator""" + def __repr__(self) -> str: + return f"{self.operator_name}.{self.dataflow_name}" - assign_result_to: Optional[str] = None - """What variable to assign the result of this node to, if any.""" - is_conditional: bool = False - """Whether or not the boolean result of this node dictates the following path.""" - collect_target: Optional['CollectTarget'] = None - """Whether the result of this node should go to a CollectNode.""" + def __hash__(self) -> int: + return hash(repr(self)) + - def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['Event']: - return OpNode.propogate_opnode(self, event, targets, result) - @dataclass -class DataflowNode(Node): +class CallRemote(Node): """A node in a `DataFlow` corresponding to the call of another dataflow""" - dataflow: 'DataFlow' + dataflow: 'DataflowRef' + """The dataflow to call.""" + variable_rename: dict[str, str] - + """A mapping of input variables (to the dataflow) to variables in the variable map""" + assign_result_to: Optional[str] = None """What variable to assign the result of this node to, if any.""" - is_conditional: bool = False - """Whether or not the boolean result of this node dictates the following path.""" - collect_target: Optional['CollectTarget'] = None - """Whether the result of this node should go to a CollectNode.""" - def propogate(self, event: 'Event', targets: List[Node], result: Any) -> List['Event']: + keyby: Optional[str] = None + """The key, for calls to Stateful Entities""" + + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow']) -> List['Event']: # remap the variable map of event into the new event + new_var_map = {key: event.variable_map[value] for key, value in self.variable_rename.items()} + if self.keyby: + new_key = event.variable_map[self.keyby] + else: + new_key = None + df = df_map[self.dataflow] + new_targets = df.entry + + # Tail call elimination: + # "targets" corresponds to where to go after this CallRemote finishes + # the call to self.dataflow + # + # If this CallRemote is a terminal node in event.dataflow, then we don't + # need to go back to event.dataflow, so we don't add it to the call stack. + # This node is terminal in event.dataflow iff len(targets) == 0 + new_call_stack = event.call_stack + if len(targets) > 0: + new_call_stack = event.call_stack.copy() + call = CallStackItem(event.dataflow, self.assign_result_to, event.variable_map, targets, key=event.key) + new_call_stack.append(call) - # add the targets as some sort of dataflow "exit nodes" - return self.dataflow + return [Event( + target, + new_var_map, + self.dataflow, + _id=event._id, + metadata=event.metadata, + call_stack=new_call_stack, + key=new_key) + + for target in new_targets] + + def __str__(self) -> str: + return f"CALL {self.dataflow}" @dataclass -class SelectAllNode(Node): - """A node type that will yield all items of an entity filtered by - some function. 
- - Think of this as executing `SELECT * FROM cls`""" - cls: Type - collect_target: 'CollectNode' - assign_key_to: str - - def propogate(self, event: 'Event', targets: List[Node], result: Any, keys: list[str]): - targets = event.dataflow.get_neighbors(event.target) - assert len(targets) == 1 - n = len(keys) - collect_targets = [ - CollectTarget(self.collect_target, n, i) - for i in range(n) - ] - return [Event( - targets[0], - event.variable_map | {self.assign_key_to: key}, - event.dataflow, - _id=event._id, - collect_target=ct, - metadata=event.metadata) - for ct, key in zip(collect_targets, keys)] +class CallLocal(Node): + method: Union[InvokeMethod, InitClass] + + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: + # For simple calls, we only need to change the target. + # Multiple targets results in multiple events + events = [] + for target in targets: + ev = Event( + target, + event.variable_map, + event.dataflow, + call_stack=event.call_stack, + _id=event._id, + metadata=event.metadata, + key=event.key) + + events.append(ev) + return events + + def __str__(self) -> str: + return f"LOCAL {self.method}" @dataclass class CollectNode(Node): @@ -192,29 +197,29 @@ class CollectNode(Node): It will aggregate incoming edges and output them as a list to the outgoing edge. Their actual implementation is runtime-dependent.""" - assign_result_to: str - """The variable name in the variable map that will contain the collected result.""" - read_results_from: str - """The variable name in the variable map that the individual items put their result in.""" + num_events: int - def propogate(self, event: 'Event', targets: List[Node], result: Any, **kwargs) -> List['Event']: - collect_targets = [event.collect_target for i in range(len(targets))] + def propogate(self, event: 'Event', targets: List[Node], df_map: dict['DataflowRef', 'DataFlow'], **kwargs) -> List['Event']: return [Event( target, event.variable_map, event.dataflow, _id=event._id, - collect_target=ct, - metadata=event.metadata) + call_stack=event.call_stack, + # collect_target=ct, + metadata=event.metadata, + key=event.key) - for target, ct in zip(targets, collect_targets)] + for target in targets] + + def __str__(self) -> str: + return f"COLLECT {self.num_events}" @dataclass class Edge(): """An Edge in the Dataflow graph.""" from_node: Node to_node: Node - variable_map: dict[str, Any] = field(default_factory=dict) if_conditional: Optional[bool] = None class DataFlow: @@ -240,18 +245,34 @@ class DataFlow: collect-- [item1_price, item2_price] -->user2; ``` """ - def __init__(self, name: str): + + def __init__(self, name: str, op_name: str, args: Optional[list[str]]=None): self.name: str = name self.adjacency_list: dict[int, list[int]] = {} self.nodes: dict[int, Node] = {} - self.entry: Union[Node, List[Node]] = None + self.edges: list[Edge] = [] + self.entry: List[Node] = [] + self.operator_name = op_name + if args: + self.args: list[str] = args + else: + self.args = [] + self.blocks: dict[str, 'CompiledLocalBlock'] = {} + + def ref(self) -> DataflowRef: + return DataflowRef(self.operator_name, self.name) def add_node(self, node: Node): """Add a node to the Dataflow graph if it doesn't already exist.""" if node.id not in self.adjacency_list: + node.outgoing_edges = [] self.adjacency_list[node.id] = [] self.nodes[node.id] = node + def add_block(self, block: 'CompiledLocalBlock'): + self.blocks[block.get_method_name()] = block + + def add_edge(self, edge: Edge): """Add 
an edge to the Dataflow graph. Nodes that don't exist will be added to the graph automatically.""" self.add_node(edge.from_node) @@ -259,7 +280,13 @@ def add_edge(self, edge: Edge): if edge.to_node.id not in self.adjacency_list[edge.from_node.id]: self.adjacency_list[edge.from_node.id].append(edge.to_node.id) edge.from_node.outgoing_edges.append(edge) + self.edges.append(edge) + def add_edge_refs(self, u: int, v: int, if_conditional=None): + """Add an edge using node IDs""" + from_node = self.nodes[u] + to_node = self.nodes[v] + self.add_edge(Edge(from_node, to_node, if_conditional=if_conditional)) def remove_edge(self, from_node: Node, to_node: Node): """Remove an edge from the Dataflow graph.""" @@ -271,28 +298,33 @@ def remove_edge(self, from_node: Node, to_node: Node): edge for edge in from_node.outgoing_edges if edge.to_node.id != to_node.id ] + # TODO: replace self.edges with a better algorithm for removal. + # probably by adding edge information (like edge.if_conditional, or future things) + # to self.adjacencylist + for i, edge in enumerate(self.edges): + if edge.from_node == from_node and edge.to_node == to_node: + break + self.edges.pop(i) + def remove_node(self, node: Node): + return self.remove_node_by_id(node.id) + + def remove_node_by_id(self, node_id: int): """Remove a node from the DataFlow graph and reconnect its parents to its children.""" - if node.id not in self.nodes: + if node_id not in self.nodes: return # Node doesn't exist in the graph - if isinstance(node, OpNode) or isinstance(node, StatelessOpNode): - assert not node.is_conditional, "there's no clear way to remove a conditional node" - assert not node.assign_result_to, "can't delete node whose result is used" - assert not node.collect_target, "can't delete node which has a collect_target" - # Find parents (nodes that have edges pointing to this node) - parents = [parent_id for parent_id, children in self.adjacency_list.items() if node.id in children] + parents = [parent_id for parent_id, children in self.adjacency_list.items() if node_id in children] # Find children (nodes that this node points to) - children = self.adjacency_list[node.id] + children = self.adjacency_list[node_id] # Set df entry - if self.entry == node: - print(children) - assert len(children) == 1, "cannot remove entry node if it doesn't exactly one child" - self.entry = self.nodes[children[0]] + if len(self.entry) == 1 and self.entry[0].id == node_id: + assert len(children) <= 1, "cannot remove entry node if it has more than two children" + self.entry = [self.nodes[id] for id in children] # Connect each parent to each child for parent_id in parents: @@ -305,57 +337,72 @@ def remove_node(self, node: Node): # Remove edges from parents to the node for parent_id in parents: parent_node = self.nodes[parent_id] - self.remove_edge(parent_node, node) + self.remove_edge(parent_node, self.nodes[node_id]) # Remove outgoing edges from the node for child_id in children: child_node = self.nodes[child_id] - self.remove_edge(node, child_node) - + self.remove_edge(self.nodes[node_id], child_node) - # Remove the node from the adjacency list and nodes dictionary - del self.adjacency_list[node.id] - del self.nodes[node.id] + del self.adjacency_list[node_id] + del self.nodes[node_id] def get_neighbors(self, node: Node) -> List[Node]: """Get the outgoing neighbors of this `Node`""" + return [edge.to_node for edge in node.outgoing_edges] + # TODO: there is a bug with the underlying adjacency_list: + # it doesn't get updated properly sometimes (during 
parallelization), + # but seemingly only when modifiying when running flink without minicluster + # mode. return [self.nodes[id] for id in self.adjacency_list.get(node.id, [])] + def get_predecessors(self, node: Node) -> List[Node]: + """Get the predeccors of this node by following incoming edges""" + return [self.nodes[id] for id, adj in self.adjacency_list.items() if node.id in adj] + + def to_dot(self) -> str: """Output the DataFlow graph in DOT (Graphviz) format.""" - lines = [f"digraph {self.name} {{"] + lines = [f"digraph {self.operator_name}_{self.name} {{"] # Add nodes for node in self.nodes.values(): - lines.append(f' {node.id} [label="{node}"];') + lines.append(f' {node.id} [label="{str(node)}"];') # Add edges - for from_id, to_ids in self.adjacency_list.items(): - for to_id in to_ids: - lines.append(f" {from_id} -> {to_id};") + for edge in self.edges: + line = f" {edge.from_node.id} -> {edge.to_node.id}" + if edge.if_conditional is not None: + line += f' [label="{edge.if_conditional}"]' + line += ";" + lines.append(line) lines.append("}") return "\n".join(lines) - def generate_event(self, variable_map: dict[str, Any]) -> Union['Event', list['Event']]: - if isinstance(self.entry, list): - assert len(self.entry) != 0 - first_event = Event(self.entry[0], variable_map, self) - id = first_event._id - return [first_event] + [Event(entry, variable_map, self, _id=id) for entry in self.entry[1:]] - else: - return Event(self.entry, variable_map, self) - -@dataclass -class CollectTarget: - target_node: CollectNode - """Target node""" - total_items: int - """How many items the merge node needs to wait on (including this one).""" - result_idx: int - """The index this result should be in the collected array.""" + def generate_event(self, variable_map: dict[str, Any], key: Optional[str] = None) -> list['Event']: + import cascade + assert len(self.entry) != 0 + # give all the events the same id + first_event = Event(self.entry[0], variable_map, self.ref(), key=key) + id = first_event._id + events = [first_event] + [Event(entry, variable_map, self.ref(), _id=id, key=key) for entry in self.entry[1:]] + + # TODO: propogate at "compile time" instead of doing this every time + local_events = [] + for ev in events: + if isinstance(ev.target, CallRemote) or isinstance(ev.target, IfNode): + local_events.extend(ev.propogate(None, cascade.core.dataflows)) + else: + local_events.append(ev) + + return local_events + + + def __str__(self) -> str: + return f"{self.operator_name}.{self.name}" def metadata_dict() -> dict: return { @@ -364,6 +411,16 @@ def metadata_dict() -> dict: "flink_time": 0 } +@dataclass +class CallStackItem: + dataflow: DataflowRef + assign_result_to: Optional[str] + var_map: dict[str, str] + """Variables are saved in the call stack""" + targets: Union[Node, List[Node]] + key: Optional[str] = None + """The key to use when coming back""" + @dataclass class Event(): """An Event is an object that travels through the Dataflow graph.""" @@ -375,46 +432,75 @@ class Event(): """A mapping of variable identifiers to values. If `target` is an `OpNode` this map should include the variables needed for that method.""" - dataflow: Optional['DataFlow'] + dataflow: DataflowRef """The Dataflow that this event is a part of. If None, it won't propogate. This might be remove in the future in favour of a routing operator.""" _id: int = field(default=None) # type: ignore (will get updated in __post_init__ if unset) """Unique ID for this event. 
Except in `propogate`, this `id` should not be set.""" - collect_target: Optional[CollectTarget] = field(default=None) - """Tells each mergenode (key) how many events to merge on""" - _id_counter: int = field(init=False, default=0, repr=False) + call_stack: List[CallStackItem] = field(default_factory=list) + """Target used when dataflow is done, used for recursive dataflows.""" metadata: dict = field(default_factory=metadata_dict) """Event metadata containing, for example, timestamps for benchmarking""" + + key: Optional[str] = None + """If on a Stateful Operator, the key of the state""" def __post_init__(self): if self._id is None: - # Assign a unique ID from the class-level counter - self._id = Event._id_counter - Event._id_counter += 1 + # Assign a unique ID + self._id = uuid.uuid4().int - def propogate(self, result, select_all_keys: Optional[list[str]]=None) -> Union['EventResult', list['Event']]: + def propogate(self, result: Any, df_map: dict['DataflowRef','DataFlow']) -> Iterable[Union['EventResult', 'Event']]: """Propogate this event through the Dataflow.""" - - if self.dataflow is None: - return EventResult(self._id, result, self.metadata) + targets = df_map[self.dataflow].get_neighbors(self.target) - targets = self.dataflow.get_neighbors(self.target) - if len(targets) == 0: - return EventResult(self._id, result, self.metadata) + events = [] + + if len(targets) == 0 and not isinstance(self.target, CallRemote): + if len(self.call_stack) > 0: + caller = self.call_stack.pop() + + new_df = caller.dataflow + new_targets = caller.targets + if not isinstance(new_targets, list): + new_targets = [new_targets] + var_map = caller.var_map + if (x := caller.assign_result_to): + assert isinstance(self.target, Return), type(self.target) + var_map[x] = self.variable_map[self.target.return_var] + + for target in new_targets: + ev = Event( + target, + var_map, + new_df, + _id=self._id, + call_stack=self.call_stack, + metadata=self.metadata, + key=caller.key + ) + events.append(ev) + else: + if isinstance(self.target, Return): + yield EventResult(self._id, self.variable_map[self.target.return_var], self.metadata) + else: + yield EventResult(self._id, result, self.metadata) + return else: current_node = self.target + events = current_node.propogate(self, targets, df_map) - if isinstance(current_node, SelectAllNode): - assert select_all_keys - return current_node.propogate(self, targets, result, select_all_keys) + for event in events: + if isinstance(event.target, CallRemote) or isinstance(event.target, IfNode) or isinstance(event.target, Return): + # recursively propogate CallRemote events + yield from event.propogate(None, df_map) else: - return current_node.propogate(self, targets, result) - + yield event @dataclass class EventResult(): event_id: int diff --git a/src/cascade/dataflow/operator.py b/src/cascade/dataflow/operator.py index 56d3e45..acca252 100644 --- a/src/cascade/dataflow/operator.py +++ b/src/cascade/dataflow/operator.py @@ -1,10 +1,25 @@ -from abc import ABC -from typing import Any, Generic, Protocol, Type, TypeVar -from cascade.dataflow.dataflow import DataFlow, InvokeMethod, Operator +from abc import ABC, abstractmethod +from typing import Any, Generic, Mapping, Protocol, Type, TypeVar, TYPE_CHECKING + + +if TYPE_CHECKING: + from cascade.frontend.generator.local_block import CompiledLocalBlock + from cascade.dataflow.dataflow import DataFlow, InvokeMethod, DataflowRef T = TypeVar('T') +class Operator(ABC): + dataflows: dict['DataflowRef', 'DataFlow'] + methods: Mapping[str, 
'CompiledLocalBlock'] + + @abstractmethod + def name(self) -> str: + pass + def get_method_rw_set(self, method_name: str) -> tuple[set[str], set[str]]: + method = self.methods[method_name] + return method.reads, method.writes + class MethodCall(Generic[T], Protocol): """A helper class for type-safety of method signature for compiled methods. @@ -26,6 +41,11 @@ def __call__(self, variable_map: dict[str, Any], state: T) -> Any: ... """@private""" +class StatelessMethodCall(Protocol): + def __call__(self, variable_map: dict[str, Any]) -> Any: ... + """@private""" + + class StatefulOperator(Generic[T], Operator): """An abstraction for a user-defined python class. @@ -38,7 +58,7 @@ class StatefulOperator(Generic[T], Operator): methods, instead reading and modifying the underlying class `T` through a state variable, see `handle_invoke_method`. """ - def __init__(self, entity: Type[T], methods: dict[str, MethodCall[T]], dataflows: dict[str, DataFlow]): + def __init__(self, entity: Type[T], methods: dict[str, 'CompiledLocalBlock'], dataflows: dict['DataflowRef', 'DataFlow']): """Create the StatefulOperator from a class and its compiled methods. Typically, a class could be comprised of split and non-split methods. Take the following example: @@ -87,18 +107,16 @@ def user_buy_item_1(variable_map: dict[str, Any], state: User): ``` """ - # methods maps function name to a function. Ideally this is done once in the object - self._methods = methods + self.methods = methods self.entity = entity self.dataflows = dataflows """A mapping from method names to DataFlows""" - def handle_init_class(self, *args, **kwargs) -> T: """Create an instance of the underlying class. Equivalent to `T.__init__(*args, **kwargs)`.""" return self.entity(*args, **kwargs) - def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any], state: T) -> dict[str, Any]: + def handle_invoke_method(self, method: 'InvokeMethod', variable_map: dict[str, Any], state: T): """Invoke the method of the underlying class. The `cascade.dataflow.dataflow.InvokeMethod` object must contain a method identifier @@ -106,22 +124,27 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any The state `T` is passed along to the function, and may be modified. """ - return self._methods[method.method_name](variable_map=variable_map, state=state) + return self.methods[method.method_name].call_block(variable_map=variable_map, __state=state) + + def get_method_rw_set(self, method_name: str): + return super().get_method_rw_set(method_name) + def name(self): + return self.entity.__name__ -class StatelessMethodCall(Protocol): - def __call__(self, variable_map: dict[str, Any]) -> Any: ... - """@private""" class StatelessOperator(Operator): """A StatelessOperator refers to a stateless function and therefore only has one dataflow.""" - def __init__(self, methods: dict[str, StatelessMethodCall], dataflow: DataFlow): - self._methods = methods - self.dataflow = dataflow + def __init__(self, entity: Type, methods: dict[str, 'CompiledLocalBlock'], dataflows: dict['DataflowRef', 'DataFlow']): + self.entity = entity + # TODO: extract this from dataflows.blocks + self.methods = methods + self.dataflows = dataflows + pass - def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any]) -> dict[str, Any]: + def handle_invoke_method(self, method: 'InvokeMethod', variable_map: dict[str, Any]): """Invoke the method of the underlying class. 
The `cascade.dataflow.dataflow.InvokeMethod` object must contain a method identifier @@ -129,5 +152,13 @@ def handle_invoke_method(self, method: InvokeMethod, variable_map: dict[str, Any The state `T` is passed along to the function, and may be modified. """ - return self._methods[method.method_name](variable_map=variable_map) + return self.methods[method.method_name].call_block(variable_map=variable_map, __state=None) + + def get_method_rw_set(self, method_name: str): + return super().get_method_rw_set(method_name) + + def name(self) -> str: + # return "SomeStatelessOp" + return self.entity.__name__ + diff --git a/src/cascade/dataflow/optimization/dead_node_elim.py b/src/cascade/dataflow/optimization/dead_node_elim.py index d1a9d06..414bd6d 100644 --- a/src/cascade/dataflow/optimization/dead_node_elim.py +++ b/src/cascade/dataflow/optimization/dead_node_elim.py @@ -18,7 +18,10 @@ def is_no_op(func): return body in ("pass", "return") +# DEPRECATED as dead nodes are not commonly generated. +# However, some logic could be done for "flattening" calls in calls def dead_node_elimination(stateful_ops: list[StatefulOperator], stateless_ops: list[StatelessOperator]): + # Find dead functions dead_func_names = set() for op in stateful_ops: diff --git a/src/cascade/dataflow/optimization/parallelization.py b/src/cascade/dataflow/optimization/parallelization.py index 79e3ea4..95a7bf5 100644 --- a/src/cascade/dataflow/optimization/parallelization.py +++ b/src/cascade/dataflow/optimization/parallelization.py @@ -1,191 +1,194 @@ -""" -When is it safe to parallize nodes? - --> When they don't affect each other --> The simpelest way of doing it could be to run individual dataflows in parallel -(e.g. item.get_price() can run in parallel) --> must convey that we assume no side-affects, so the actual order of execution -does not matter. could go deeper and give a spec. --> some instructions from the same dataflow could also be completed in parallel? -maybe? like ILP. but might need to think of more contrived examples/do more -advanced program analyis. - -From Control Flow to Dataflow -3. Parallelizing Memory Operations -- operations on different memory locatiosn need not be sequentialized -- circulate a set of access tokens for each variable (=split function?) - - assume that every variable denotes a unique memory location (no aliasing) - -We have to be careful about certain types of parallelization. Consider the example: - -``` -# Calculate the average item price in basket: List[Item] -n = 0 -p = 0 -for item in basket: - n += 1 - p += item.price() -return p / n -``` - -In this example we would want to parallelize the calls to item.price(). -But we have to make sure the calls to `n += 1` remains bounded to the number of -items, even though there is no explicit data dependency. - - ----- - - -There is another type of optimization we could look at. -Suppose the following: - -``` -n = self.basket_size - -prices = [item.price() for item in self.basket] -total_price = sum(prices) - -return total_price / n -``` - -In this case, the variable n is not needed in the list comprehension - unoptimized -versions would generate an extra function instead of having the line be re-ordered -into the bottom function. Instead, analyis of the variables each function needs -access to would be a way to optimize these parts! - ---> Ask Soham about this! - -from "From control flow to dataflow" - -Consider the portion of control-flow graph between a node N and its *immediate -postdominator* P. 
Every control-flow path starting at N ultimately ends up at P. -Suppose that there is no reference to a variable x in any node on any path between -N and P. It is clear that an access token for x that enters N may bypass this -region of the graph altogether and go directly to P. - - ----- - -"Dataflow-Based Parallelization of Control-Flow Algorithms" - -loop invariant hoisting - -``` -i = 0 -while i < n: - x = y + z - a[i] = 6 * i + x * x - i += 1 -``` - -can be transformed in - -``` -i = 0 -if i < n: - x = y + z # loop invariant 1 - t1 = x * x # loop invariant 2 - do { # do while loop needed in case the conditional has side effects - a[i] = 6 * i + t1 - i += 1 - } while i < n -``` - -this is achieved using reaching definitions analysis. In the paper: -"It is a common optimization to pull those parts of a loop body -that depend on only static datasets outside of the loop, and thus -execute these parts only once [7 , 13 , 15 , 32 ]. However, launching -new dataflow jobs for every iteration step prevents this optimiza- -tion in the case of such binary operators where only one input is -static. For example, if a static dataset is used as the build-side of -a hash join, then the system should not rebuild the hash table at -every iteration step. Labyrinth operators can keep such a hash -table in their internal states between iteration steps. This is made -possible by implementing iterations as a single cyclic dataflow -job, where the lifetimes of operators span all the steps." -Is there a similair example we could leverage for cascade? one with a "static dataset" as loop invariant? -in spark, it's up to the programmer to .cache it - - -In this paper, they also use an intermediate representation of one "basic block" per node. -A "basic block" is a sequence of instructions that always execute one after the other, -in other words contains no control flow. Control flow is defined by the edges in the -dataflow graph that connect the nodes. - -There's also a slightly different focus of this paper. The focus is not on stateful -dataflows, and obviously the application is still focused on bigdata-like applications, -not ones were latency is key issue. - - -Basic Blocks - Aho, A. V., Sethi, R., and Ullman, J. D. Compilers: principles, techniques, and -tools, vol. 2. Addison-wesley Reading, 2007. -SSA - Rastello, F. SSA-based Compiler Design. Springer Publishing Company, -Incorporated, 2016. - - ----- - -ideas from "optimization of dataflows with UDFs:" - -we are basically making a DSL (integrated with python) which would allow for optimization -of UDFs!! this optimization is inside the intermediate representation, and not directly in -the target machine (similair to Emma, which uses a functional style *but* is a DSL (does it -allow for arbitrary scala code?)) - ---- - -our program is essentially a compiler. this allows to take inspiration from existing -works on compilation (which has existed for much longer than work on dataflows (?) - -actually, dataflows were more popular initially when people didn't settle on the von Neumann architecture yet, -see e.g. Monsoon (1990s) or the original control flow to dataflow paper. the popularisation and efficiency of tools -such as drayadlinq, apache spark, apache flink has reinvigorated the attention towards dataflows). -BUT compilers are often have hardware specific optimizations, based on the hardware instruction sets, or hardware-specifics -such as optimization of register allocation, cache line considerations etc etc. 
-The compiler in Cascade/other cf to df systems do not necessarily have the same considerations. This is because the backend -is software rather than hardware (e.g. we use flink + kafka). Since software is generally a lot more flexible than hardware, -we can instead impose certain considerations on the execution engine (which is now software, instead of a chip) rather than -the other way around (e.g. SIMD introduced --> compiler optimizations introduced). (to be fair, compiler design has had major influences [citation needed] on CPU design, but the point is that hardware iteration -is generally slower and more expensive than software iteration). - - ---- - -for certain optimizations, cascade assumes order of any side effects (such as file IO) does not matter. -otherwise a lot of parallelization operations would become much more costly due to the necessary synchronization issues. - ---- - -other optimization: code duplication - -this would remove nodes (assumption that less nodes = faster) at the cost of more computation per node. -a common example is something like this: - -``` -cost = item.price() -if cost > 30: - shipping_discount = discount_service.get_shipping_discount() - price = cost * shipping_discount -else: - price = cost - -return price -``` - -in this case the "return price" could be duplicated accross the two branches, -such that they don't need to return back to the function body. - ---- - -other ideas: - https://en.wikipedia.org/wiki/Optimizing_compiler#Specific_techniques -""" - -from cascade.dataflow.operator import StatefulOperator, StatelessOperator - - -def node_parallelization(stateful_ops: list[StatefulOperator], stateless_ops: list[StatelessOperator]): - # Find parallelizable nodes - for op in stateful_ops: - for dataflow in op.dataflows.values(): - pass - # Parallize them \ No newline at end of file +import copy +from dataclasses import dataclass +from typing import Any, Tuple +from cascade.dataflow.dataflow import CallRemote, CallLocal, CollectNode, DataFlow, Edge, IfNode, Node, Return +import cascade + +@dataclass +class AnnotatedNode: + node: Node + reads: set[str] + writes: set[str] + +def parallelize(df): + par, rest = parallelize_until_if(df) + + # join the two dataflows + par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] + for node in rest.nodes.values(): + par.add_node(node) + for edge in rest.edges: + par.add_edge(edge) + assert len(rest.entry) == 1 + assert len(par_exit) == 1 + par.add_edge_refs(par_exit[0], rest.entry[0].id, None) + + par.name = df.name + "_parallel" + return par + +import networkx as nx +def parallelize_until_if(df: DataFlow) -> Tuple[DataFlow, DataFlow]: + """Parallelize df, stopping at the first if node. + The first dataflow returned is the parallelized dataflow up until the first if node. 
The second dataflow is the rest of the dataflow""" + # create the dependency graph + ans = [] + # since we use SSA, every variable has exactly one node that writes it + write_nodes = {} + graph = nx.DiGraph() + for node in df.nodes.values(): + if isinstance(node, CallRemote): + reads = set(node.variable_rename.values()) + writes = {result} if (result := node.assign_result_to) else set() + elif isinstance(node, CallLocal): + method = df.blocks[node.method.method_name] + reads = method.reads + writes = method.writes + elif isinstance(node, Return): + break + elif isinstance(node, IfNode): + break + else: + raise ValueError(f"unsupported node type: {type(node)}") + + write_nodes.update({var: node.id for var in writes}) + + ans.append(AnnotatedNode(node, reads, writes)) + graph.add_node(node.id) + + # Add the edges in the dependency graph + nodes_with_indegree_0 = set(graph.nodes) + n_map = copy.deepcopy(df.nodes) + for node in ans: + for read in node.reads: + if read in write_nodes: + # "read" will not be in write nodes if it is part of the arguments + # a more thorough implementation would not need the if check, + # and add the arguments as writes to some function entry node + graph.add_edge(write_nodes[read], node.node.id) + try: + nodes_with_indegree_0.remove(node.node.id) + except KeyError: + pass + + + + updated = DataFlow(df.name, df.operator_name) + # updated.blocks = df.blocks + updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0] + + rest = copy.deepcopy(df) + + collectors = {} + finishers = set() + for u in graph.nodes: + updated.add_node(n_map[u]) + rest.remove_node_by_id(u) + if graph.in_degree(u) > 1: + c = CollectNode(0) + updated.add_node(c) + collectors[u] = c.id + updated.add_edge_refs(c.id, u) + elif graph.out_degree(u) == 0: + finishers.add(u) + + if len(finishers) > 1: + c = CollectNode(0) + updated.add_node(c) + for f in finishers: + c.num_events += 1 + updated.add_edge_refs(f, c.id) + + + for u, v in graph.edges: + if v in collectors: + v = collectors[v] + updated.nodes[v].num_events += 1 + + updated.add_edge_refs(u, v, None) + + + return updated, rest + +import networkx as nx +def parallelize_until_if_DEPRECATED(df: DataFlow) -> Tuple[DataFlow, DataFlow]: + """Parallelize df, stopping at the first if node. + The first dataflow returned is the parallelized dataflow up until the first if node. 
The second dataflow is the rest of the dataflow""" + # create the dependency graph + ans = [] + # since we use SSA, every variable has exactly one node that writes it + write_nodes = {} + graph = nx.DiGraph() + for node in df.nodes.values(): + if isinstance(node, CallRemote): + reads = set(node.variable_rename.values()) + writes = {result} if (result := node.assign_result_to) else set() + elif isinstance(node, CallLocal): + operator = cascade.core.operators[df.operator_name] + method = df.blocks[node.method.method_name] + reads = method.reads + writes = method.writes + elif isinstance(node, IfNode): + break + else: + raise ValueError(f"unsupported node type: {type(node)}") + + write_nodes.update({var: node.id for var in writes}) + + ans.append(AnnotatedNode(node, reads, writes)) + graph.add_node(node.id) + + # Add the edges in the dependency graph + # & generate the set of indegree 0 nodes + nodes_with_indegree_0 = set(graph.nodes) + n_map = copy.deepcopy(df.nodes) + for node in ans: + for read in node.reads: + if read in write_nodes: + # "read" will not be in write nodes if it is part of the arguments + # a more thorough implementation would not need the if check, + # and add the arguments as writes to some function entry node + graph.add_edge(write_nodes[read], node.node.id) + try: + nodes_with_indegree_0.remove(node.node.id) + except KeyError: + pass + + updated = DataFlow(df.name, df.operator_name) + updated.entry = [n_map[node_id] for node_id in nodes_with_indegree_0] + prev_node = None + + rest = copy.deepcopy(df) + + while len(nodes_with_indegree_0) > 0: + # remove nodes from graph + children = set() + for node_id in nodes_with_indegree_0: + children.update(graph.successors(node_id)) + graph.remove_node(node_id) + rest.remove_node(n_map[node_id]) + updated.add_node(n_map[node_id]) + + + # check for new indegree 0 nodes + next_nodes = set() + for child in children: + if graph.in_degree(child) == 0: + next_nodes.add(child) + + if len(nodes_with_indegree_0) > 1: + collect_node = CollectNode(len(nodes_with_indegree_0)) + for node_id in nodes_with_indegree_0: + if prev_node: + updated.add_edge(Edge(prev_node, n_map[node_id])) + updated.add_edge(Edge(n_map[node_id], collect_node)) + prev_node = collect_node + else: + node_id = nodes_with_indegree_0.pop() + if prev_node: + updated.add_edge(Edge(prev_node, n_map[node_id])) + + prev_node = n_map[node_id] + + nodes_with_indegree_0 = next_nodes + + return updated, rest diff --git a/src/cascade/dataflow/optimization/test_dead_node_elim.py b/src/cascade/dataflow/optimization/test_dead_node_elim.py index 94b30af..18dbe5e 100644 --- a/src/cascade/dataflow/optimization/test_dead_node_elim.py +++ b/src/cascade/dataflow/optimization/test_dead_node_elim.py @@ -1,6 +1,6 @@ from typing import Any -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode +from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod from cascade.dataflow.operator import StatefulOperator from cascade.dataflow.optimization.dead_node_elim import dead_node_elimination from cascade.dataflow.optimization.dead_node_elim import is_no_op @@ -66,10 +66,10 @@ def user_order_df(): df.entry = n0 return df -df = user_order_df() -user_op.dataflows[df.name] = df +# df = user_order_df() +# user_op.dataflows[df.name] = df -def test_dead_node_elim(): +def DEPRECATED_test_dead_node_elim(): print(user_op.dataflows[df.name].to_dot()) dead_node_elimination([user_op], []) diff --git a/src/cascade/dataflow/test_dataflow.py b/src/cascade/dataflow/test_dataflow.py deleted 
file mode 100644 index a5b42af..0000000 --- a/src/cascade/dataflow/test_dataflow.py +++ /dev/null @@ -1,132 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, Edge, Event, EventResult, InvokeMethod, OpNode -from cascade.dataflow.operator import StatefulOperator - -class DummyUser: - def __init__(self, key: str, balance: int): - self.key: str = key - self.balance: int = balance - - def buy_item(self, item: 'DummyItem') -> bool: - item_price = item.get_price() # SSA - self.balance -= item_price - return self.balance >= 0 - -def buy_item_0_compiled(variable_map: dict[str, Any], state: DummyUser): - return - -def buy_item_1_compiled(variable_map: dict[str, Any], state: DummyUser): - state.balance -= variable_map["item_price"] - return state.balance >= 0 - -class DummyItem: - def __init__(self, key: str, price: int): - self.key: str = key - self.price: int = price - - def get_price(self) -> int: - return self.price - -def get_price_compiled(variable_map: dict[str, Any], state: DummyItem): - return state.price - -################## TESTS ####################### - -user = DummyUser("user", 100) -item = DummyItem("fork", 5) - -user_sop = StatefulOperator(DummyUser, - {"buy_item_0": buy_item_0_compiled, - "buy_item_1": buy_item_1_compiled}, None) - - -def test_simple_df_propogation(): - df = DataFlow("user.buy_item") - n1 = OpNode(DummyUser, InvokeMethod("buy_item_0_compiled"), read_key_from="user_key") - n2 = OpNode(DummyItem, InvokeMethod("get_price"), read_key_from="item_key", assign_result_to="item_price") - n3 = OpNode(DummyUser, InvokeMethod("buy_item_1"), read_key_from="user_key") - df.add_edge(Edge(n1, n2)) - df.add_edge(Edge(n2, n3)) - - user.buy_item(item) - event = Event(n1, {"user_key": "user", "item_key":"fork"}, df) - - # Manually propogate - item_key = buy_item_0_compiled(event.variable_map, state=user) - next_event = event.propogate(event, item_key) - - assert isinstance(next_event, list) - assert len(next_event) == 1 - assert next_event[0].target == n2 - event = next_event[0] - - # manually add the price to the variable map - item_price = get_price_compiled(event.variable_map, state=item) - assert n2.assign_result_to - event.variable_map[n2.assign_result_to] = item_price - - next_event = event.propogate(item_price) - - assert isinstance(next_event, list) - assert len(next_event) == 1 - assert next_event[0].target == n3 - event = next_event[0] - - positive_balance = buy_item_1_compiled(event.variable_map, state=user) - next_event = event.propogate(None) - assert isinstance(next_event, EventResult) - - -def test_merge_df_propogation(): - df = DataFlow("user.buy_2_items") - n0 = OpNode(DummyUser, InvokeMethod("buy_2_items_0"), read_key_from="user_key") - n3 = CollectNode(assign_result_to="item_prices", read_results_from="item_price") - n1 = OpNode( - DummyItem, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 0), - read_key_from="item_1_key" - ) - n2 = OpNode( - DummyItem, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 1), - read_key_from="item_2_key" - ) - n4 = OpNode(DummyUser, InvokeMethod("buy_2_items_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n0, n2)) - df.add_edge(Edge(n1, n3)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - - # User with key "foo" buys items with keys "fork" and "spoon" - event = Event(n0, {"user_key": "foo", "item_1_key": "fork", "item_2_key": "spoon"}, df) 
- - # Propogate the event (without actually doing any calculation) - # Normally, the key_stack should've been updated by the runtime here: - next_event = event.propogate(None) - - assert isinstance(next_event, list) - assert len(next_event) == 2 - assert next_event[0].target == n1 - assert next_event[1].target == n2 - - event1, event2 = next_event - next_event = event1.propogate(None) - - assert isinstance(next_event, list) - assert len(next_event) == 1 - assert next_event[0].target == n3 - - next_event = event2.propogate(None) - - assert isinstance(next_event, list) - assert len(next_event) == 1 - assert next_event[0].target == n3 - - final_event = next_event[0].propogate(None) - assert isinstance(final_event, list) - assert final_event[0].target == n4 diff --git a/src/cascade/descriptors/class_descriptor.py b/src/cascade/descriptors/class_descriptor.py index 40271a4..efd6277 100644 --- a/src/cascade/descriptors/class_descriptor.py +++ b/src/cascade/descriptors/class_descriptor.py @@ -1,8 +1,10 @@ +from typing import Any, Optional from klara.core import nodes -from cascade.frontend.ast_visitors import ExtractClassDefNode, ExtractMethodVisitor from cascade.descriptors.method_descriptor import MethodDescriptor +from cascade.frontend.ast_visitors.extract_class_def_node import ExtractClassDefNode +from cascade.frontend.ast_visitors.extract_class_methods import ExtractMethodVisitor class ClassDescriptor: """A description of a class.""" @@ -13,18 +15,26 @@ def __init__( module_node: nodes.Module, class_node: nodes.ClassDef, methods_dec: list[MethodDescriptor], + globals: Optional[dict[str, Any]] ): self.class_name: str = class_name self.module_node: nodes.Module = module_node self.class_node: nodes.ClassDef = class_node self.methods_dec: list[MethodDescriptor] = methods_dec + self.globals = globals + + self.is_stateless = True + for method in methods_dec: + if method.method_name == "__init__": + self.is_stateless = False + break def get_method_by_name(self, name: str): return next(m for m in self.methods_dec if m.method_name == name) @classmethod - def from_module(cls, class_name: str, module_node: nodes.Module): + def from_module(cls, class_name: str, module_node: nodes.Module, globals): class_node: nodes.ClassDef = ExtractClassDefNode.extract(module_node, class_name) method_dec: list[MethodDescriptor] = ExtractMethodVisitor.extract(class_node) - c = cls(class_name, module_node, class_node, method_dec) + c = cls(class_name, module_node, class_node, method_dec, globals) return c diff --git a/src/cascade/descriptors/method_descriptor.py b/src/cascade/descriptors/method_descriptor.py index 9f4b4aa..9c367f5 100644 --- a/src/cascade/descriptors/method_descriptor.py +++ b/src/cascade/descriptors/method_descriptor.py @@ -1,7 +1,7 @@ from klara.core import nodes -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder -from cascade.frontend.intermediate_representation import StatementDataflowGraph +from cascade.frontend.cfg.cfg_builder import ControlFlowGraphBuilder +from cascade.frontend.cfg import ControlFlowGraph class MethodDescriptor: @@ -14,11 +14,11 @@ def __init__( ): self.method_name: str = method_name self.method_node: nodes.FunctionDef = method_node - self.dataflow: StatementDataflowGraph = None + self.dataflow: ControlFlowGraph = None def build_dataflow(self): statements = [self.method_node] + self.method_node.body - dataflow_graph: StatementDataflowGraph = DataflowGraphBuilder.build(statements) + dataflow_graph: ControlFlowGraph = 
ControlFlowGraphBuilder.build(statements) dataflow_graph.set_name(self.method_name) self.dataflow = dataflow_graph diff --git a/src/cascade/frontend/ast_visitors/__init__.py b/src/cascade/frontend/ast_visitors/__init__.py index 32feb97..e69de29 100644 --- a/src/cascade/frontend/ast_visitors/__init__.py +++ b/src/cascade/frontend/ast_visitors/__init__.py @@ -1,5 +0,0 @@ -from .extract_type_visitor import ExtractTypeVisitor -from .contains_attribute_visitor import ContainsAttributeVisitor -from .variable_getter import VariableGetter -from .extract_class_def_node import ExtractClassDefNode -from .extract_class_methods import ExtractMethodVisitor \ No newline at end of file diff --git a/src/cascade/frontend/ast_visitors/extract_type_visitor.py b/src/cascade/frontend/ast_visitors/extract_type_visitor.py index 3634910..3be142c 100644 --- a/src/cascade/frontend/ast_visitors/extract_type_visitor.py +++ b/src/cascade/frontend/ast_visitors/extract_type_visitor.py @@ -1,7 +1,7 @@ from klara.core.ssa_visitors import AstVisitor from klara.core.nodes import AnnAssign, Arg from klara.core import nodes - +from klara.core.node_classes import Name class ExtractTypeVisitor(AstVisitor): @@ -23,11 +23,11 @@ def visit_annassign(self, node: AnnAssign): def visit_arg(self, arg: Arg): annotation = arg.annotation var_type = type(annotation) + # TODO: Find a better way to get the SSA version from Arg + id: str = arg.arg + "_0" if var_type == nodes.Const: - id: str = arg.arg self.type_map[id] = annotation.value elif annotation != None: - id: str = arg.arg self.type_map[id] = str(annotation.id) def get_type_map(self) -> dict[str, str]: diff --git a/src/cascade/frontend/ast_visitors/replace_name.py b/src/cascade/frontend/ast_visitors/replace_name.py index 26061ea..2b6a4a7 100644 --- a/src/cascade/frontend/ast_visitors/replace_name.py +++ b/src/cascade/frontend/ast_visitors/replace_name.py @@ -1,21 +1,82 @@ +from typing import Union from klara.core.ssa_visitors import AstVisitor from klara.core import nodes -class ReplaceName(AstVisitor): - """get all variables (ast.name) from given node, separate by targets and values - +class ReplaceSelfWithState(AstVisitor): + """Replace attributes with "self" into "__state", and remove SSA versioning. 
+ + e.g.: + self_0.balance_0 -> __state['balance'] """ - def __init__(self, target: str, new: str): - self.target: str = target - self.new: str = new + def __init__(self): + self.target: str = "self" + self.new: str = "__state" @classmethod - def replace(cls, node, target: str, new: str): - c = cls(target, new) + def replace(cls, node): + c = cls() c.visit(node) return c - def visit_name(self, node: nodes.Name): - if node.id == self.target: - node.id = self.new \ No newline at end of file + def replace_name(self, node: nodes.Name): + node.id = self.new + node.version = -1 + + def replace_node(self, parent: nodes.BaseNode, old_node: nodes.BaseNode, new_node: nodes.BaseNode): + # get node children + for field in parent._fields: + attr = getattr(parent, field) + if isinstance(attr, (tuple, list)): + to_change = None + for i, n in enumerate(attr): + if n == old_node: + to_change = i + + if to_change is not None: + if isinstance(attr[i], tuple): + new_attr = list(attr) + new_attr[i] = new_node + attr = tuple(new_attr) + else: + attr[i] = new_node + setattr(parent, field, attr) + else: + if attr is not None: + if attr == old_node: + setattr(parent, field, new_node) + else: + continue + + + def replace_attribute(self, node: Union[nodes.Attribute, nodes.AssignAttribute]): + # change self -> state + node.value.id = self.new + node.value.version = -1 + + # change attribute to subscript + new_node = nodes.Subscript(node.lineno, None, node.parent, node.links, version=-1) + slice = nodes.Index(new_node.lineno, None, new_node) + slice.postinit(nodes.Const(node.attr, slice.lineno, slice.col_offset, slice)) + new_node.postinit(node.value, slice, node.ctx) + assert isinstance(node.parent, nodes.BaseNode) + self.replace_node(node.parent, node, new_node) + + + def visit_subscript(self, node: nodes.Subscript): + # e.g. self_0.data["something"]_0 -> state.data["something"] + if isinstance(node.value, nodes.Attribute): + attr = node.value + if str(attr.value) == self.target: + self.replace_attribute(attr) + node.version = -1 + + def visit_assignattribute(self, node: nodes.AssignAttribute): + if str(node.value) == self.target: + self.replace_attribute(node) + + + def visit_attribute(self, node: nodes.Attribute): + if str(node.value) == self.target: + self.replace_attribute(node) + diff --git a/src/cascade/frontend/ast_visitors/simplify_returns.py b/src/cascade/frontend/ast_visitors/simplify_returns.py new file mode 100644 index 0000000..443d1b9 --- /dev/null +++ b/src/cascade/frontend/ast_visitors/simplify_returns.py @@ -0,0 +1,51 @@ +from klara.core.ssa_visitors import AstVisitor +from klara.core import nodes + +def simplify_returns(node): + sr = SimplifyReturns.replace(node) + for parent, n, target in sr.inserts: + try: + i = parent.body.index(n) + parent.body.insert(i, target) + except ValueError as e: + if isinstance(parent, nodes.If): + i = parent.orelse.index(n) + parent.orelse.insert(i, target) + else: + raise e + +class SimplifyReturns(AstVisitor): + """Replace attributes with "self" into "state", and remove SSA versioning. 
+ + e.g.: + self_0.balance_0 -> state.balance + """ + + def __init__(self): + self.temps = 0 + self.inserts = [] + + @classmethod + def replace(cls, node): + c = cls() + c.visit(node) + return c + + def replace_name(self, node: nodes.Return): + new_assign = nodes.Assign(parent=node.parent, lineno=node.lineno) + target = nodes.AssignName(parent=new_assign) + target.postinit(id=f"__ret_{self.temps}") + self.temps += 1 + new_assign.postinit(targets=[target], value=node.value) + node.value = nodes.Name() + node.value.postinit(target.id) + + assert hasattr(node.parent, "body"), type(node.parent) + print(f"replacing {node} in {node.parent} with {new_assign}") + self.inserts.append((node.parent, node, new_assign)) + + def visit_return(self, node: nodes.Return): + + if not isinstance(node.value, nodes.Name): + self.replace_name(node) + diff --git a/src/cascade/frontend/ast_visitors/variable_getter.py b/src/cascade/frontend/ast_visitors/variable_getter.py index 8a8300b..97b7b8c 100644 --- a/src/cascade/frontend/ast_visitors/variable_getter.py +++ b/src/cascade/frontend/ast_visitors/variable_getter.py @@ -1,4 +1,5 @@ from klara.core.ssa_visitors import AstVisitor +from klara.core import nodes class VariableGetter(AstVisitor): """get all variables (ast.name) from given node, separate by targets and values @@ -21,3 +22,6 @@ def visit_name(self, node): def visit_assignname(self, node): self.targets.append(node) + + def visit_assignattribute(self, node: nodes.AssignAttribute): + self.targets.append(node.value) diff --git a/src/cascade/frontend/cfg/__init__.py b/src/cascade/frontend/cfg/__init__.py new file mode 100644 index 0000000..5da7d36 --- /dev/null +++ b/src/cascade/frontend/cfg/__init__.py @@ -0,0 +1,2 @@ +from .control_flow_graph import ControlFlowGraph +from .statement import Statement \ No newline at end of file diff --git a/src/cascade/frontend/cfg/cfg_builder.py b/src/cascade/frontend/cfg/cfg_builder.py new file mode 100644 index 0000000..29aa63d --- /dev/null +++ b/src/cascade/frontend/cfg/cfg_builder.py @@ -0,0 +1,72 @@ +from klara.core.cfg import ModuleLabel, TempAssignBlock +from klara.core import nodes + +from cascade.frontend.ast_visitors.contains_attribute_visitor import ContainsAttributeVisitor +from cascade.frontend.ast_visitors.variable_getter import VariableGetter +from cascade.frontend.cfg import Statement, ControlFlowGraph + + +class ControlFlowGraphBuilder: + + def __init__(self, block_list: list, globals: list[str]): + self.block_list: list = block_list + self.globals = globals + + def make_cfg(self, blocks: list, i = 0) -> tuple[ControlFlowGraph, int]: + graph = ControlFlowGraph() + for b in blocks: + if type(b) in [ModuleLabel, TempAssignBlock]: + continue + elif type(b) == nodes.FunctionDef: + statement = Statement(i, b) + i += 1 + args = b.args + function_vars = [f'{a.arg}_0' for a in args.args] + statement.extend_targets(function_vars) + statement.extend_values(function_vars) + graph.append_statement(statement) + elif type(b) == nodes.If: + + # Make subgraph of both branches + subgraph_body, i = self.make_cfg(b.body, i) + subgraph_orelse, i = self.make_cfg(b.orelse, i) + cond = Statement(i, b.test, is_predicate=True) + i += 1 + + # Add condition & branches to graph + graph.append_statement(cond) + graph.append_subgraph(cond, subgraph_body, type=True) + graph.append_subgraph(cond, subgraph_orelse, type=False) + + if subgraph_orelse.graph.number_of_nodes() == 0: + raise NotImplementedError("dataflow structure for if without else is not correct yet") + + # The next node 
should connect to both subgraph + graph._last_node = subgraph_body._last_node + subgraph_orelse._last_node + else: + statement = Statement(i, b) + i += 1 + graph.append_statement(statement) + variable_getter = VariableGetter.get_variable(b) + targets, values = variable_getter.targets, variable_getter.values + statement.targets = [t.__repr__() for t in targets] + statement.values = [v.__repr__() for v in values] + contains_attribute, attribute = ContainsAttributeVisitor.check_return_attribute(b) + if contains_attribute: + if attribute.value.id in self.globals: + statement.values.remove(attribute.value.id) + elif attribute.value.id != 'self': + statement.set_remote() + + statement.set_attribute(attribute) + + return graph, i + + def construct_dataflow_graph(self) -> ControlFlowGraph: + graph, _ = self.make_cfg(self.block_list) + return graph + + @classmethod + def build(cls, block_list: list, globals: list[str]) -> ControlFlowGraph: + dataflow_graph_builder = cls(block_list, globals) + return dataflow_graph_builder.construct_dataflow_graph() diff --git a/src/cascade/frontend/cfg/control_flow_graph.py b/src/cascade/frontend/cfg/control_flow_graph.py new file mode 100644 index 0000000..af26c69 --- /dev/null +++ b/src/cascade/frontend/cfg/control_flow_graph.py @@ -0,0 +1,97 @@ +from dataclasses import dataclass +from typing import Iterable, Optional +import networkx as nx + +from cascade.frontend.generator.unparser import unparse +from cascade.frontend.cfg.statement import Statement + + +@dataclass +class ControlFlowGraph: + """Control Flow Graph represented as a directed graph. + + Nodes are Statements, and edges are either PO/True/False. + """ + graph: nx.DiGraph + instance_type_map: dict[str, str] = None # {"instance_name": "EntityType"} + method_name: str = None + _last_node: list[Statement] = None + _sources: list[Statement] = None + + def __init__(self): + self.graph = nx.DiGraph() + self._sources = [] + self._last_node = [] + + def set_name(self, name: str): + self.name = name + + def append_statement(self, node: Statement): + self.graph.add_node(node) + + if len(self._sources) == 0: + self._sources = [node] + + for ln in self._last_node: + self.graph.add_edge(ln, node) + self._last_node = [node] + + + def append_subgraph(self, to_node: Statement, subgraph: 'ControlFlowGraph', **edge_attr): + if subgraph.graph.number_of_nodes() == 0: + return + for node in subgraph.get_nodes(): + self.graph.add_node(node) + for edge in subgraph.graph.edges: + self.graph.add_edge(edge[0], edge[1]) + assert len((s:=subgraph.get_source_nodes())) == 1 + self.graph.add_edge(to_node, s[0], **edge_attr) + + def remove_node(self, node: Statement): + """Remove a node and it's adjacent edges""" + if node == self.get_single_source(): + succ = list(self.graph.successors(node)) + # assert len(succ) <= 1, "Can't remove node with more than one successor" + self._sources = succ + if node == self._last_node: + raise NotImplementedError("Update last node") + + self.graph.remove_node(node) + + def get_single_source(self,) -> Optional[Statement]: + """Get the source of this CFG. Returns None if there are 0 or 2+ sources.""" + if len(self._sources) == 1: + return self._sources[0] + else: + return None + + def get_single_successor(self, node: Statement) -> Optional[Statement]: + """Get the successor of this node. 
Returns None if there are 0 or 2+ successors.""" + succ = list(self.graph.successors(node)) + if len(succ) == 1: + return succ[0] + else: + return None + + def get_nodes(self) -> Iterable[Statement]: + return self.graph.nodes + + def get_edges(self) -> Iterable[tuple[int, int]]: + return [(u.block_num, v.block_num) for u, v in self.graph.edges] + + def get_source_nodes(self) -> list[Statement]: + return self._sources + + def to_dot(self) -> str: + dot_string = "digraph CFG {\n" + + # Add nodes + for node in self.get_nodes(): + dot_string += f' {node.block_num} [label="{unparse(node.block)}"];\n' + + # Add edges + for source, target, type in self.graph.edges.data('type', default='po'): + dot_string += f' {source.block_num} -> {target.block_num} [label="{type}"];\n' + + dot_string += "}" + return dot_string diff --git a/src/cascade/frontend/intermediate_representation/statement.py b/src/cascade/frontend/cfg/statement.py similarity index 84% rename from src/cascade/frontend/intermediate_representation/statement.py rename to src/cascade/frontend/cfg/statement.py index e20db42..8185e61 100644 --- a/src/cascade/frontend/intermediate_representation/statement.py +++ b/src/cascade/frontend/cfg/statement.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, field from klara.core.cfg import RawBasicBlock -from klara.core.nodes import Attribute +from klara.core.nodes import Attribute, Return @dataclass class Statement: @@ -10,6 +10,7 @@ class Statement: targets: list[str] = field(default_factory=list) values: list[str] = field(default_factory=list) remote_call: bool = False + is_predicate: bool = False attribute: Attribute = None def extend_targets(self, new_targets: list[str]): @@ -31,5 +32,8 @@ def set_attribute(self, attribute: Attribute): def is_remote(self) -> bool: return self.remote_call + def is_return(self) -> bool: + return isinstance(self.block, Return) + def __hash__(self): return hash(self.block_num) diff --git a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py b/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py deleted file mode 100644 index 51bd9dc..0000000 --- a/src/cascade/frontend/dataflow_analysis/dataflow_graph_builder.py +++ /dev/null @@ -1,64 +0,0 @@ -import networkx as nx - - -from klara.core.cfg import ModuleLabel, TempAssignBlock -from klara.core.nodes import Name, FunctionDef - -from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph -from cascade.frontend.ast_visitors import ContainsAttributeVisitor, VariableGetter - - -class DataflowGraphBuilder: - - def __init__(self, block_list: list): - self.block_list: list = block_list - - def extract_statment_list(self): - # TODO: This one should be extended with recursion to handle if/else branches - statements = [] - i = 0 - for b in self.block_list: - if type(b) in [ModuleLabel, TempAssignBlock]: - continue - elif type(b) == FunctionDef: - b: FunctionDef - statement = Statement(i, b) - i += 1 - args = b.args - function_vars = [Name.quick_build(f'{a.arg}_0') for a in args.args] - statement.extend_targets(function_vars) - statement.extend_values(function_vars) - statements.append(statement) - else: - statement = Statement(i, b) - i += 1 - statements.append(statement) - variable_getter = VariableGetter.get_variable(b) - targets, values = variable_getter.targets, variable_getter.values - statement.targets = targets - statement.values = values - contains_attribute, attribute = ContainsAttributeVisitor.check_return_attribute(b) - if contains_attribute: - if 
attribute.value.id != 'self': - statement.set_remote() - - statement.set_attribute(attribute) - return statements - - def construct_dataflow_graph(self) -> StatementDataflowGraph: - statements = self.extract_statment_list() - G = nx.DiGraph() - for b1 in statements: - G.add_node(b1) - for b2 in statements: - if b1.block_num != b2.block_num: - targets = set(repr(b) for b in b1.targets) - values = set(repr(b) for b in b2.values) - if targets.intersection(values): - G.add_edge(b1, b2) - return StatementDataflowGraph(G) - - @classmethod - def build(cls, block_list: list) -> StatementDataflowGraph: - dataflow_graph_builder = cls(block_list) - return dataflow_graph_builder.construct_dataflow_graph() diff --git a/src/cascade/frontend/generator/build_compiled_method_string.py b/src/cascade/frontend/generator/build_compiled_method_string.py deleted file mode 100644 index 2cd709f..0000000 --- a/src/cascade/frontend/generator/build_compiled_method_string.py +++ /dev/null @@ -1,19 +0,0 @@ -from cascade.frontend.generator.split_function import SplitFunction - - -class BuildCompiledMethodsString: - - def __init__(self, splits: list[SplitFunction]): - self.splits: list[SplitFunction] = splits - - def make_splitfunctions(self) -> list[str]: - bodies = [] - for split in self.splits: - body = split.to_string() - bodies.append(body) - return '\n\n'.join(bodies) - - @classmethod - def build(cls, splits: list[SplitFunction]): - cls = cls(splits) - return cls.make_splitfunctions() diff --git a/src/cascade/frontend/generator/dataflow_builder.py b/src/cascade/frontend/generator/dataflow_builder.py new file mode 100644 index 0000000..437d542 --- /dev/null +++ b/src/cascade/frontend/generator/dataflow_builder.py @@ -0,0 +1,251 @@ +from typing import Any, Optional +import networkx as nx + +from cascade.dataflow.dataflow import DataFlow, DataflowRef, IfNode, Return +from cascade.frontend.ast_visitors.extract_type_visitor import ExtractTypeVisitor +from cascade.frontend.ast_visitors.simplify_returns import SimplifyReturns +from cascade.frontend.cfg.cfg_builder import ControlFlowGraphBuilder +from cascade.frontend.cfg import Statement, ControlFlowGraph +from cascade.frontend.generator.local_block import LocalBlock, to_entity_call + + +from klara.core import nodes + +def split_statements_once(statements: list[Statement]) -> tuple[list[Statement], list[Statement]]: + """ + Split a list of statements, by grouping together statements that are not remote calls. + + As an example, suppose r and s are both statements, where r is a remote call and s is not. 
+ + Here is how the list gets split: + [r, s, s, r, s] -> [r] + [s, s, r, s] + [s, s, r, s, s] -> [s, s] + [r, s, s] + [s, s, s] -> [s, s, s] + [] + """ + assert len(statements) > 0 + + if statements[0].is_remote() or statements[0].is_return(): + return [statements[0]], statements[1:] + + # find the next remote call + i = 0 + first_half = [] + while i < len(statements) and not statements[i].is_remote() and not statements[i].is_return(): + first_half.append(statements[i]) + i += 1 + + continuation = statements[i:] + return first_half, continuation + +def split_statements(statements: list[Statement]) -> list[tuple[Statement,...]]: + grouped_statements = [] + continuation = statements + while len(continuation) > 0: + first_half, continuation = split_statements_once(continuation) + grouped_statements.append(tuple(first_half)) + + return grouped_statements + +def split_cfg(blocked_statement_graph: nx.DiGraph) -> nx.DiGraph: + split_graph: nx.DiGraph = blocked_statement_graph.copy() + for node in list(split_graph.nodes): + in_nodes = split_graph.predecessors(node) + in_edges = list(split_graph.in_edges(node, data=True)) + out_edges = list(split_graph.out_edges(node, data=True)) + out_nodes = split_graph.successors(node) + + # create the new nodes + new_nodes = split_statements(list(node)) + split_graph.remove_node(node) + split_graph.add_nodes_from(new_nodes) + + # connect the inner edges + u = new_nodes[0] + for v in new_nodes[1:]: + split_graph.add_edge(u, v) + u = v + + # connect the outer edges + for u, v, ddict in in_edges: + split_graph.add_edge(u, new_nodes[0], **ddict) + for u, v, ddict in out_edges: + split_graph.add_edge(new_nodes[-1], v, **ddict) + + return split_graph + + +def blocked_cfg(statement_graph: nx.DiGraph, entry: Statement) -> nx.DiGraph: + """Transform a cfg (digraph of Statements) into a blocked version, i.e. a + digraph of tuple(Statements). This pass blocks together the body and orelse + branches of if blocks, grouping them together. + This pass treats remote calls as any other statement. 
+ + For example, take the cfg of the following program: + + ``` + a = 10 + b = 20 + if x: + c = 30 + d = 20 + else: + e = 10 + f = 10 + ``` + + it will get split into the following blocks: + + ``` + block 1: + a = 10 + b = 20 + if x: + + block 2: + c = 30 + d = 20 + + block 3: + e = 10 + + block 4: + f = 10 + ``` + """ + + + grouped_statements = [entry] + + succ = list(statement_graph.successors(entry)) + while len(succ) == 1: + if len(list(statement_graph.predecessors(succ[0]))) > 1: + break + grouped_statements.append(succ[0]) + succ = list(statement_graph.successors(succ[0])) + + + graph = nx.DiGraph() + + if len(succ) == 0 or len(succ) == 1: + last_node = tuple(grouped_statements) + graph.add_node(last_node) + return graph + elif len(succ) == 2: + if len(grouped_statements) > 1: + before_if, last_node = tuple(grouped_statements[:-1]), tuple([grouped_statements[-1]]) + graph.add_edge(before_if, last_node) + else: + last_node = tuple(grouped_statements) + graph.add_node(last_node) + # TODO: check that then corresponds to "true" path + first_then, first_orelse = succ + then_blocked_graph = blocked_cfg(statement_graph, first_then) + orelse_blocked_graph = blocked_cfg(statement_graph, first_orelse) + last_then = list(then_blocked_graph.nodes)[-1] + last_orelse = list(orelse_blocked_graph.nodes)[-1] + + # check the first node after completed + succ_then = list(statement_graph.successors(last_then[-1])) + succ_orelse = list(statement_graph.successors(last_orelse[-1])) + + if len(succ_then) == 1 and len(succ_orelse) == 1: + assert succ_orelse[0] == succ_then[0] + + assert len(succ_then) <= 1 + assert len(succ_orelse) <= 1 + + + + # add then and orelse blocks + graph.add_edges_from(then_blocked_graph.edges()) + graph.add_edges_from(orelse_blocked_graph.edges()) + + # connect them to this node + first_then = list(then_blocked_graph.nodes)[0] + first_orelse = list(orelse_blocked_graph.nodes)[0] + graph.add_edge(last_node, first_then, type=True) + graph.add_edge(last_node, first_orelse, type=False) + + # connect the rest of the graph at the end (recursively) + if len(succ_then) == 1 or len(succ_orelse) == 1: + try: + first_finally = succ_orelse[0] + except IndexError: + first_finally = succ_then[0] + finally_graph = blocked_cfg(statement_graph, first_finally) + graph.add_edges_from(finally_graph.edges()) + first_finally = list(finally_graph.nodes)[0] + + graph.add_edge(last_then, first_finally) + graph.add_edge(last_orelse, first_finally) + + return graph + else: + raise ValueError(f"We expect a CFG node to have max 2 successors, got {succ}") + + + +class DataflowBuilder: + def __init__(self, function_def: nodes.FunctionDef, globals: Optional[dict[str, Any]] = None): + self.function_def = function_def + self.name = self.function_def.name + self.globals = globals + + + def build_cfg(self): + global_names = list(self.globals.keys()) if self.globals else [] + cfg: ControlFlowGraph = ControlFlowGraphBuilder.build([self.function_def] + self.function_def.body, global_names) + self.type_map = ExtractTypeVisitor.extract(self.function_def) + cfg.name = self.function_def.name + + entry_node: Statement = cfg.get_source_nodes()[0] + assert type(entry_node.block) == nodes.FunctionDef + cfg.remove_node(entry_node) + self.cfg = cfg + + self.blocked_cfg = split_cfg(blocked_cfg(cfg.graph, cfg.get_single_source())) + + def build_df(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> DataFlow: + df_ref = DataflowRef(op_name, self.name) + df = dataflows[df_ref] + + node_id_map = {} + + block_num = 0 + 
is_entry = True + for statement_block in self.blocked_cfg.nodes: + if len(statement_block) == 1 and statement_block[0].is_remote(): + node = to_entity_call(statement_block[0], self.type_map, dataflows) + elif len(statement_block) == 1 and statement_block[0].is_predicate: + rawblock = statement_block[0].block + assert isinstance(rawblock, nodes.Bool), type(rawblock) + node = IfNode(repr(rawblock.value)) + elif len(statement_block) == 1 and statement_block[0].is_return(): + rawblock = statement_block[0].block + assert isinstance(rawblock.value, nodes.Name), f"Return values must be simple names, not {type(rawblock.value)}: {repr(rawblock.value)}" + node = Return(repr(rawblock.value)) + else: + block = LocalBlock(list(statement_block), self.name, block_num, op_name, self.globals) + block_num += 1 + node = block.to_node() + df.add_block(block.compile()) + node_id_map[statement_block] = node.id + df.add_node(node) + + if is_entry: + df.entry = [node] + is_entry = False + + for source, target, if_result in self.blocked_cfg.edges.data('type', default=None): + source_id = node_id_map[source] + target_id = node_id_map[target] + df.add_edge_refs(source_id, target_id, if_result) + + return df + + + def build(self, dataflows: dict[DataflowRef, DataFlow], op_name: str) -> DataFlow: + self.build_cfg() + + return self.build_df(dataflows, op_name) + diff --git a/src/cascade/frontend/generator/generate_dataflow.py b/src/cascade/frontend/generator/generate_dataflow.py deleted file mode 100644 index 5bb1182..0000000 --- a/src/cascade/frontend/generator/generate_dataflow.py +++ /dev/null @@ -1,53 +0,0 @@ -from cascade.frontend.generator.split_function import SplitFunction -from cascade.dataflow.dataflow import DataFlow, OpNode, InvokeMethod, Edge - - -class GenerateDataflow: - """ Generates dataflow - """ - - def __init__(self, split_functions: list[SplitFunction], instance_type_map: dict[str, str]): - #TODO: add buildcontext that contains class name and target method - self.split_functions = split_functions - class_name = "class_name" # TODO: remove placeholder - self.df = DataFlow(class_name) - self.instance_type_map = instance_type_map - - def generate_dataflow(self): - self.extract_remote_method_calls() - self.build_dataflow() - - def build_dataflow(self): - """ Every remote function invocation should add the node - """ - nodes = [] - for split in self.split_functions: - node = OpNode(split.class_name, InvokeMethod(split.method_name)) - self.df.add_node(node) - nodes.append([node]) - - if split.remote_calls: - # TODO: instance_name -> correct entity (maybe using buildcontext/ instance type map) - next_nodes = [OpNode(self.instance_type_map[remote.instance_name], InvokeMethod(remote.attribute), assign_result_to=remote.target) - for remote in split.remote_calls] - nodes.append(next_nodes) - - self.df.entry = nodes[0][0] - for i in range(len(nodes)-1): - # TODO: add merge nodes - prev_nodes = nodes[i] - next_nodes = nodes[i+1] - for n in prev_nodes: - for v in next_nodes: - # TODO: Add variable map (think that should be the aggregation of the targets) - self.df.add_edge(Edge(n, v)) - - def extract_remote_method_calls(self): - for split in self.split_functions: - split.extract_remote_method_calls() - - @classmethod - def generate(cls, split_functions: list[SplitFunction], instance_type_map: dict[str, str]) -> DataFlow: - c = cls(split_functions, instance_type_map) - c.generate_dataflow() - return c.df \ No newline at end of file diff --git a/src/cascade/frontend/generator/generate_split_functions.py 
b/src/cascade/frontend/generator/generate_split_functions.py deleted file mode 100644 index c90a9a6..0000000 --- a/src/cascade/frontend/generator/generate_split_functions.py +++ /dev/null @@ -1,101 +0,0 @@ -from itertools import count - -import networkx as nx - -from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph -from cascade.frontend.generator.split_function import SplitFunction - - -from klara.core import nodes -from klara.core.cfg import RawBasicBlock - -class GenerateSplittFunctions: - - def __init__(self, dataflow_graph: StatementDataflowGraph, class_name: str, entities: list[str], instance_type_map: dict[str, str]): - self.dataflow_graph: StatementDataflowGraph = dataflow_graph - self.class_name: str = class_name - self.entities: list[str] = entities - self.instance_type_map: dict[str, str] = instance_type_map # {"instance_name": "EntityType"} - self.dataflow_node_map = dict() - self.counter = count() - self.split_functions = [] - - def generate_split_functions(self): - G = self.dataflow_graph.graph - entry_node: Statement = next(iter(G.nodes)) - assert type(entry_node.block) == nodes.FunctionDef - # targets = copy.copy(entry_node.targets) - continuation = list(G.nodes) - while self.invokes_remote_entity(continuation): - first_half, continuation = self.split_fuction(G) - self.add_split_function(first_half) - G = G.subgraph(continuation) - # TODO: Add a new source node to continuation - self.add_split_function(continuation) - - def add_split_function(self, statements: list[Statement]): - targets, values = set(), set() - for s in statements: - targets.update(repr(v) for v in s.targets) - if s.is_remote() or type(s.block) != nodes.FunctionDef: - values.update(repr(v) for v in s.values if not self.value_is_entity(v)) - i: int = next(self.counter) - method_name = f'{self.dataflow_graph.name}_{i}' - split_f: SplitFunction = SplitFunction(i, method_name, statements, targets=targets, values=values, class_name=self.class_name) - self.split_functions.append(split_f) - - def value_is_entity(self, value: nodes.Name) -> bool: - value_id = value.id - instance_type_map: dict[str,str] = self.instance_type_map - if not value_id in instance_type_map: - return False - entity_type: str = instance_type_map[value_id] - return entity_type in self.entities - - - def invokes_remote_entity(self, statments: list[Statement]) -> bool: - """Returns whether statements contains a remote invocation""" - return any(s.is_remote() for s in statments) - - def split_fuction(self, G: nx.DiGraph): - """ Produces split functions. Assumes that the runtime will always return to initial function call. - Therefore functions containing a remote function call (one to a remote entity) will be split into two functions: - one function adding the keys to the stack of the remote entities to call. And the continuation which the - function returns to. This way the entity invoking the method does not know anything about - - Assumes needs split. i.e. there is a remote entity invoked. - - Every node on the path to a node included should be included. (because these are the data dependencies) - - And also the nodes that the nodes listed above are data dependend on. - - Should also contain a liveness analyses to determine which variables should be passed on to the continuation. - """ - source: Statement = self.dataflow_graph.get_source_node() - first_half = set() # A set of nodes that are in the first half of the split function. 
- for n in G.nodes: - n: Statement - if n == source or not n.is_remote(): - continue - elif self.no_remote_dependencies_on_path(G, source, n): - self.add_nodes_path_to_first_half(G, source, n, first_half) - continuation = set(G.nodes) - first_half # The set of nodes in the continuation. - return first_half, continuation - - - def no_remote_dependencies_on_path(self, G: nx.DiGraph, source: Statement, target: Statement) -> bool: - for path in self.get_all_simple_paths(G, source, target): - for n in path: - if n not in [source, target] and n.is_remote(): - return False - return True - - def add_nodes_path_to_first_half(self, G: nx.DiGraph, source: Statement, statement: Statement, split: set[Statement]): - for path in self.get_all_simple_paths(G, source, statement): - for n in path: - split.add(n) - - def get_all_simple_paths(self, G: nx.DiGraph, source: Statement, target: Statement): - return nx.all_simple_paths(G, source=source, target=target) - - @classmethod - def generate(cls, dataflow_graph: StatementDataflowGraph, class_name: str, entities: list[str], instance_type_map: dict[str, str]): - c = cls(dataflow_graph, class_name, entities, instance_type_map) - c.generate_split_functions() - return c.split_functions diff --git a/src/cascade/frontend/generator/local_block.py b/src/cascade/frontend/generator/local_block.py new file mode 100644 index 0000000..0400ad8 --- /dev/null +++ b/src/cascade/frontend/generator/local_block.py @@ -0,0 +1,157 @@ +from textwrap import indent +from typing import Any, Callable, Optional, Union, TYPE_CHECKING + + +from cascade.frontend.cfg import Statement +# from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState +from cascade.frontend.generator.unparser import unparse +from cascade.dataflow.dataflow import CallRemote, CallLocal, DataFlow, DataflowRef, InvokeMethod + +from klara.core.cfg import RawBasicBlock +from klara.core import nodes + +if TYPE_CHECKING: + from cascade.dataflow.operator import MethodCall, StatelessMethodCall + + +def to_entity_call(statement: Statement, type_map: dict[str, str], dataflows: dict[DataflowRef, DataFlow]) -> CallRemote: + """Transform a remote statement to an entity call.""" + writes = statement.targets + assert statement.is_remote() + assert len(writes) <= 1 + if len(writes) == 0: + assign = None + else: + assign = list(writes)[0] + + # repr includes version + operator_var, dataflow_name = repr(statement.attribute.value), statement.attribute.attr + + if operator_var in type_map: + operator_name = type_map[operator_var] + key = repr(statement.attribute.value) + else: + # assume stateless operator + operator_name = operator_var + key = None + + dataflow = DataflowRef(operator_name, dataflow_name) + + args = statement.values.copy() + args.remove(operator_var) + df_args = dataflows[dataflow].args + + return CallRemote(dataflow, {a: b for a, b in zip(df_args, args, strict=True)}, assign_result_to=assign,keyby=key) + + +class LocalBlock: + def __init__(self, statements: list[Statement], method_base_name: str, block_num: int, class_name: str, globals: Optional[dict[str, Any]]=None): + assert len(statements) > 0 + # A block of statements should have no remote calls + assert all([not s.is_remote() for s in statements]) + + self.statements: list[Statement] = statements + self.method_base_name: str = method_base_name + self.block_num: int = block_num + self.class_name: str = class_name + + writes, reads = set(), set() + for s in statements: + if type(s.block) != nodes.FunctionDef: + writes.update(t for t in s.targets) 
+ reads.update(v for v in s.values) + + # If we assign a variable inside a function + # that means this variable can only have been assigned in this function, + # thanks to SSA. Thus we can remove it from reads, as it is local. + reads.difference_update(writes) + + # Additionally, writes with higher versions will override writes + # with lower versions. + # e.g. a_0 = 2 + # a_1 = 4 + # we want to remove a_0 from writes, as it will never be read by future + # blocks + + # writes.update + + self.reads: set[str] = reads + self.writes: set[str] = writes + self.globals = globals + + def compile(self) -> 'CompiledLocalBlock': + return CompiledLocalBlock(self) + + def compile_function(self) -> Callable: + local_scope = {} + exec(self.to_string(), self.globals, local_scope) + method_name = self.get_method_name() + return local_scope[method_name] + + def to_node(self) -> CallLocal: + return CallLocal(InvokeMethod(self.get_method_name())) + + def get_method_name(self): + return f"{self.method_base_name}_{self.block_num}" + + def to_string(self) -> str: + indent_prefix: str = ' ' * 4 # indent using 4 spaces. + body: str = indent(self.body_to_string(), indent_prefix) + method_signature: str = self.get_method_signature() + compiled_method_as_string: str = f'def {self.get_method_name()}({method_signature}):\n{body}' + return compiled_method_as_string + + def get_method_signature(self) -> str: + return f'variable_map, __state' + + def body_to_string(self) -> str: + body = [] + + # Read from the variable map + for v in sorted(self.reads - self.writes): + if v != "__state": + body.append(f'{v} = variable_map[\'{v}\']') + + # Write statements + for statement in self.statements: + block: RawBasicBlock = statement.block + if type(block) == nodes.FunctionDef: + continue + + # # TODO: do this in preprocessing + # ReplaceSelfWithState.replace(block) + + body.append(unparse(block)) + + if 'return' not in body[-1]: + # Write to the variable map + for v in sorted(self.writes - self.reads): + if not (v in [ 'self_0','self']): + body.append(f'variable_map[\'{v}\'] = {v}') + # body.append('return None') + return "\n".join(body) + + +class CompiledLocalBlock: + def __init__(self, block: LocalBlock): + self.method_base_name: str = block.method_base_name + self.block_num: int = block.block_num + self.class_name: str = block.class_name + + self.reads = block.reads + self.writes = block.writes + self.function_string = block.to_string() + self.function: Union['MethodCall', 'StatelessMethodCall'] = block.compile_function() + + def call_block(self, *args, **kwargs) -> Any: + return self.function(*args, **kwargs) + + + # def to_node(self) -> CallLocal: + # return CallLocal(InvokeMethod(self.get_method_name())) + + def get_method_name(self): + return f"{self.method_base_name}_{self.block_num}" + + # def get_method_signature(self) -> str: + # return f'variable_map, state' diff --git a/src/cascade/frontend/generator/remote_call.py b/src/cascade/frontend/generator/remote_call.py deleted file mode 100644 index 63c7601..0000000 --- a/src/cascade/frontend/generator/remote_call.py +++ /dev/null @@ -1,7 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class RemoteCall: - instance_name: str - attribute: str - target: str \ No newline at end of file diff --git a/src/cascade/frontend/generator/split_function.py b/src/cascade/frontend/generator/split_function.py deleted file mode 100644 index dcc30d0..0000000 --- a/src/cascade/frontend/generator/split_function.py +++ /dev/null @@ -1,79 +0,0 @@ -from textwrap import indent -from 
dataclasses import dataclass, field - - -from cascade.frontend.util import to_camel_case -from cascade.frontend.intermediate_representation import Statement -from cascade.frontend.ast_visitors.replace_name import ReplaceName -from cascade.frontend.generator.unparser import unparse -from cascade.frontend.generator.remote_call import RemoteCall - -from klara.core.cfg import RawBasicBlock -from klara.core import nodes - -@dataclass -class SplitFunction: - method_number: int - method_name: str - method_body: list[Statement] - targets: set[str] = None - values: set[str] = None - class_name: str = None - remote_calls: list[RemoteCall] = field(default_factory=list) # {'assign_result_to_var': 'method_to_call'} - - def set_class_name(self, name: str): - self.class_name = name - - def to_string(self) -> str: - indent_prefix: str = ' ' * 4 # indent usting 4 spaces. - body: str = indent(self.body_to_string(), indent_prefix) - method_signature: str = self.get_method_signature() - compiled_method_as_string: str = f'def {self.method_name}_compiled({method_signature}) -> Any:\n{body}' - return compiled_method_as_string - - def get_method_signature(self) -> str: - return f'variable_map: dict[str, Any], state: {self.class_name}, key_stack: list[str]' - - def body_to_string(self) -> str: - body = [] - for v in sorted(self.values - self.targets): - if not (v in [ 'self_0','self']): - body.append(f'{v} = variable_map[\'{v}\']') - - for statement in self.method_body: - if statement.remote_call: - assert statement.attribute - attribute: nodes.Attribute = statement.attribute - value: nodes.Name = attribute.value - instance_name: str = value.id - res = f'key_stack.append(variable_map[\'{instance_name}_key\'])' - body.append(res) - else: - block: RawBasicBlock = statement.block - if type(block) == nodes.FunctionDef: - continue - ReplaceName.replace(block, 'self', 'state') - - if type(block) == nodes.Return: - body.insert(0,'key_stack.pop()') - body.append(unparse(block)) - - if 'return' not in body[-1]: - body.append('return None') - return "\n".join(body) - - def extract_remote_method_calls(self): - for statement in self.method_body: - if statement.remote_call: - self.add_statement_to_remote_call_set(statement) - - def add_statement_to_remote_call_set(self, statement: Statement): - assert statement.attribute, "A remote call should have an attribute name to call" - attribute = statement.attribute - if len(statement.targets) > 1: - assert False, "A remote method invocation that returns multiple items is not supported yet..." - target, = statement.targets - remote_call: RemoteCall = RemoteCall(attribute.value.id, attribute.attr, target) - self.remote_calls.append(remote_call) - - diff --git a/src/cascade/frontend/generator/unparser.py b/src/cascade/frontend/generator/unparser.py index a4a8677..1e677e6 100644 --- a/src/cascade/frontend/generator/unparser.py +++ b/src/cascade/frontend/generator/unparser.py @@ -2,22 +2,61 @@ from klara.core import nodes -from cascade.frontend. intermediate_representation import Statement - def unparse(block: RawBasicBlock): match type(block): case nodes.Return: return f'return {unparse(block.value)}' case nodes.AugAssign: - return f'{unparse(block.target)} {block.op}= {unparse(block.value)}' + raise NotImplementedError() + # TODO: augassign does not work well with ssa + # e.g. + # a = 1 + # a += 2 + # will generate: + # a_0 = 1 + # a_1 += 2 + # The last line should be desugared into + # a_1 = a_0 + 2 (perhapse with a n Ast.Visitor?) 
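The TODO above is the crux: under SSA an augmented assignment such as `a += 2` is renamed to `a_1 += 2`, which only makes sense once it is desugared into `a_1 = a_0 + 2`. A throwaway sketch of that desugaring, written against the standard-library `ast` module purely for illustration (the real pass would operate on klara's SSA nodes and take the previous version from klara instead of parsing the `_<n>` suffix):

import ast

# Illustrative only, not part of the patch: rewrites SSA-renamed `a_1 += 2`
# into `a_1 = a_0 + 2`, assuming names of the form `<base>_<version>`.
class DesugarAugAssign(ast.NodeTransformer):
    def visit_AugAssign(self, node: ast.AugAssign) -> ast.Assign:
        assert isinstance(node.target, ast.Name), "attribute/subscript targets need extra care"
        base, _, version = node.target.id.rpartition("_")
        prev = ast.Name(id=f"{base}_{int(version) - 1}", ctx=ast.Load())
        return ast.Assign(
            targets=[node.target],
            value=ast.BinOp(left=prev, op=node.op, right=node.value),
        )

tree = ast.fix_missing_locations(DesugarAugAssign().visit(ast.parse("a_1 += 2")))
print(ast.unparse(tree))  # a_1 = a_0 + 2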
+ return f'{repr(block.target)} {block.op}= {unparse(block.value)}' case nodes.Assign: target, *rest = block.targets - return f'{repr(target)} = {unparse(block.value)}' + return f'{unparse(target)} = {unparse(block.value)}' case nodes.Attribute: return f'{block.value}.{block.attr}' + case nodes.AssignName: + return repr(block) case nodes.Name: return repr(block) case nodes.BinOp: return f'{unparse(block.left)} {block.op} {unparse(block.right)}' - case _: + case nodes.Subscript: + return str(block) + case nodes.Const: return str(block) + case nodes.NameConstant: + return str(block) + case nodes.Compare: + res = unparse(block.left) + for op, operand in zip(block.ops, block.comparators): + res += " {} {}".format(op, unparse(operand)) + return res + case nodes.Bool: + return repr(block) + case nodes.If: + print(block.test, block.body, block.orelse) + raise NotImplementedError(type(block), "Should have been removed in previous CFG pass") + case nodes.FunctionDef: + return str(block).replace('"', "'") + case nodes.Call: + return "{}{}".format(str(block.func), tuple(block.args)) + case nodes.UnaryOp: + return "{}{}".format(str(block.op), unparse(block.operand)) + case nodes.Expr: + return unparse(block.value) + case nodes.BoolOp: + res = unparse(block.values[0]) + for v in block.values[1:]: + res += " {} {}".format(block.op, unparse(v)) + return res + case _: + raise NotImplementedError(f"{type(block)}: {block}") diff --git a/src/cascade/frontend/intermediate_representation/__init__.py b/src/cascade/frontend/intermediate_representation/__init__.py deleted file mode 100644 index ddb00f3..0000000 --- a/src/cascade/frontend/intermediate_representation/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .statement import Statement -from .statement_level_dataflow_graph import StatementDataflowGraph \ No newline at end of file diff --git a/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py b/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py deleted file mode 100644 index e495d89..0000000 --- a/src/cascade/frontend/intermediate_representation/statement_level_dataflow_graph.py +++ /dev/null @@ -1,21 +0,0 @@ -from dataclasses import dataclass -import networkx as nx - - -@dataclass -class StatementDataflowGraph: - """ Statement level dataflow graph. Capturs statement level data dependencies in a nx.DiGraph. - The nodes of the graph are Statements - """ - graph: nx.DiGraph - instance_type_map: dict[str, str] = None # {"instance_name": "EntityType"} - method_name: str = None - - def set_name(self, name: str): - self.name = name - - def get_nodes(self): - return self.graph.nodes - - def get_source_node(self): - return next(iter(self.get_nodes())) diff --git a/src/cascade/frontend/util.py b/src/cascade/frontend/util.py index 22f10e3..0f3d29d 100644 --- a/src/cascade/frontend/util.py +++ b/src/cascade/frontend/util.py @@ -5,6 +5,8 @@ from klara.core.tree_rewriter import AstBuilder from klara.core.cfg import Cfg +from cascade.frontend.ast_visitors.simplify_returns import simplify_returns + color_map_map = {0: 'b', 1:'g', 2:'r', 3:'c', 4:'m', 5:'y', 6:'k', -1:'pink'} @@ -41,12 +43,6 @@ def plot_dataflow_graph(G: nx.DiGraph, grey_background: bool = True): if grey_background: fig.set_facecolor('darkgrey') -def setup_cfg(code: str) -> Cfg: - as_tree = AstBuilder().string_build(code) - cfg = Cfg(as_tree) - cfg.convert_to_ssa() - return cfg - def to_camel_case(name): return re.sub(r'(? 
tuple[Cfg, nodes.Module]:
+    as_tree = AstBuilder().string_build(code)
+    cfg = Cfg(as_tree)
+    cfg.convert_to_ssa()
+    if preprocess:
+        ReplaceSelfWithState.replace(as_tree)
+        simplify_returns(as_tree)
+    # TODO: do this in preprocessing
+    return cfg, as_tree
\ No newline at end of file
diff --git a/src/cascade/runtime/flink_runtime.py b/src/cascade/runtime/flink_runtime.py
index 5afd53f..66faba6 100644
--- a/src/cascade/runtime/flink_runtime.py
+++ b/src/cascade/runtime/flink_runtime.py
@@ -11,14 +11,15 @@
 from pyflink.datastream.functions import KeyedProcessFunction, RuntimeContext, ValueState, ValueStateDescriptor
 from pyflink.datastream.connectors.kafka import KafkaOffsetsInitializer, KafkaRecordSerializationSchema, KafkaSource, KafkaSink
 from pyflink.datastream import ProcessFunction, StreamExecutionEnvironment
+from pyflink.datastream.output_tag import OutputTag
 import pickle
-from cascade.dataflow.dataflow import CollectNode, CollectTarget, Event, EventResult, Filter, InitClass, InvokeMethod, Node, OpNode, SelectAllNode, StatelessOpNode
+from cascade.dataflow.dataflow import CallLocal, CollectNode, DataFlow, DataflowRef, Event, EventResult, InitClass, InvokeMethod, Node
 from cascade.dataflow.operator import StatefulOperator, StatelessOperator
 from confluent_kafka import Producer, Consumer
 import logging
-logger = logging.getLogger(__name__)
-logger.setLevel(1)
+logger = logging.getLogger("cascade")
+logger.setLevel("INFO")
 console_handler = logging.StreamHandler()
 formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 console_handler.setFormatter(formatter)
@@ -27,6 +28,12 @@
 # Required if SelectAll nodes are used
 SELECT_ALL_ENABLED = False
 
+# Add profiling information to metadata
+PROFILE = False
+
+# Enable latency metrics
+METRICS = False
+
 @dataclass
 class FlinkRegisterKeyNode(Node):
     """A node that will register a key with the SelectAll operator.
@@ -43,6 +50,68 @@ def propogate(self, event: Event, targets: list[Node], result: Any, **kwargs) ->
         """A key registration event does not propogate."""
         return []
 
+class FanOutOperator(ProcessFunction):
+    """Routes each incoming CallLocal event to the side output of its target operator, looked up by operator name."""
+    def __init__(self, stateful_ops: dict[str, OutputTag], stateless_ops: dict[str, OutputTag]) -> None:
+        self.stateful_ops = stateful_ops
+        self.stateless_ops = stateless_ops
+
+    def process_element(self, event: Event, ctx: ProcessFunction.Context):
+        event = profile_event(event, "FanOut")
+
+        logger.debug(f"FanOut Event entered: {event._id}")
+
+        if isinstance(event.target, CallLocal):
+            if event.dataflow.operator_name in self.stateful_ops:
+                tag = self.stateful_ops[event.dataflow.operator_name]
+            else:
+                tag = self.stateless_ops[event.dataflow.operator_name]
+
+        else:
+            logger.error(f"FanOut: Wrong target: {event}")
+            return
+
+        logger.debug(f"Fanout Event routed to: {tag.tag_id}")
+        yield tag, event
+
+class RouterOperator(ProcessFunction):
+    """Takes in an Event and Result as tuple. Calls Event.propogate on the event.
+
+    The main output contains Events to be reingested into the system.
+ There are two side outputs: + - one for Events with a CollectNode target + - one for EventResults + """ + def __init__(self, dataflows: dict['DataflowRef', 'DataFlow'], collect_tag: OutputTag, out_tag: OutputTag) -> None: + self.dataflows = dataflows + self.collect_tag = collect_tag + self.out_tag = out_tag + + def process_element(self, event_result: tuple[Event, Any], ctx: ProcessFunction.Context): + event, result = event_result + event = profile_event(event, "Router") + + logger.debug(f"RouterOperator Event entered: {event}") + + new_events = list(event.propogate(result, self.dataflows)) + + if len(new_events) == 1 and isinstance(new_events[0], EventResult): + logger.debug(f"RouterOperator: Returned {new_events[0]}") + else: + logger.debug(f"RouterOperator: Propogated {len(new_events)} new Events") + for i, event in enumerate(new_events): + logger.debug(f"{event} {i+1}/{len(new_events)}") + + for event in new_events: + if isinstance(event, Event): + if isinstance(event.target, CollectNode): + yield self.collect_tag, event + else: + yield event + else: + assert isinstance(event, EventResult) + yield self.out_tag, event + class FlinkOperator(KeyedProcessFunction): """Wraps an `cascade.dataflow.datflow.StatefulOperator` in a KeyedProcessFunction so that it can run in Flink. """ @@ -56,19 +125,18 @@ def open(self, runtime_context: RuntimeContext): self.state: ValueState = runtime_context.get_state(descriptor) def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): + event = profile_event(event, "STATEFUL OP INNER ENTRY") # should be handled by filters on this FlinkOperator - assert(isinstance(event.target, OpNode)) - logger.debug(f"FlinkOperator {self.operator.entity.__name__}[{ctx.get_current_key()}]: Processing: {event.target.method_type}") + assert(isinstance(event.target, CallLocal)) + logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Processing: {event.target.method}") - assert(event.target.entity == self.operator.entity) + assert(event.dataflow.operator_name == self.operator.name()) key = ctx.get_current_key() assert(key is not None) - if isinstance(event.target.method_type, InitClass): - # TODO: compile __init__ with only kwargs, and pass the variable_map itself - # otherwise, order of variable_map matters for variable assignment - result = self.operator.handle_init_class(*event.variable_map.values()) + if isinstance(event.target.method, InitClass): + result = self.operator.handle_init_class(**event.variable_map).__dict__ # Register the created key in FlinkSelectAllOperator if SELECT_ALL_ENABLED: @@ -78,19 +146,22 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): None, _id = event._id ) - logger.debug(f"FlinkOperator {self.operator.entity.__name__}[{ctx.get_current_key()}]: Registering key: {register_key_event}") + logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Registering key: {register_key_event}") yield register_key_event self.state.update(pickle.dumps(result)) - elif isinstance(event.target.method_type, InvokeMethod): + + elif isinstance(event.target.method, InvokeMethod): state = self.state.value() if state is None: - # try to create the state if we haven't been init'ed - state = self.operator.handle_init_class(*event.variable_map.values()) + logger.error(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: State does not exist for key {ctx.get_current_key()}") + # raise KeyError(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: State does not 
exist for key {ctx.get_current_key()}")
+            # try to create it anyway
+            state = self.operator.handle_init_class(**event.variable_map).__dict__
         else:
             state = pickle.loads(state)
-        result = self.operator.handle_invoke_method(event.target.method_type, variable_map=event.variable_map, state=state)
+        result = self.operator.handle_invoke_method(event.target.method, variable_map=event.variable_map, state=state)
 
         # TODO: check if state actually needs to be updated
         if state is not None:
@@ -103,16 +174,18 @@ def process_element(self, event: Event, ctx: KeyedProcessFunction.Context):
             # return
             # result = event.key_stack[-1]
 
-        if event.target.assign_result_to is not None:
-            event.variable_map[event.target.assign_result_to] = result
+        # if event.target.assign_result_to is not None:
+        #     event.variable_map[event.target.assign_result_to] = result
 
-        new_events = event.propogate(result)
-        if isinstance(new_events, EventResult):
-            logger.debug(f"FlinkOperator {self.operator.entity.__name__}[{ctx.get_current_key()}]: Returned {new_events}")
-            yield new_events
-        else:
-            logger.debug(f"FlinkOperator {self.operator.entity.__name__}[{ctx.get_current_key()}]: Propogated {len(new_events)} new Events")
-            yield from new_events
+        # new_events = list(event.propogate(result, self.operator.dataflows))
+
+        # if len(new_events) == 1 and isinstance(new_events[0], EventResult):
+        #     logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Returned {new_events[0]}")
+        # else:
+        #     logger.debug(f"FlinkOperator {self.operator.name()}[{ctx.get_current_key()}]: Propogated {len(new_events)} new Events")
+
+        # yield from new_events
+        yield (event, result)
 
 class FlinkStatelessOperator(ProcessFunction):
     """Wraps an `cascade.dataflow.datflow.StatefulOperator` in a KeyedProcessFunction so that it can run in Flink.
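For orientation: the FanOutOperator/RouterOperator pair introduced above replaces the old per-operator `filter` chains with PyFlink side outputs. One process function tags every event with the `OutputTag` of its target operator (Cascade routes on `event.dataflow.operator_name`), and each operator stream then consumes only its own side output. A stripped-down sketch of that pattern, with made-up operator names and records that are not taken from the repository:

from pyflink.common import Types
from pyflink.datastream import ProcessFunction, StreamExecutionEnvironment
from pyflink.datastream.output_tag import OutputTag

user_tag = OutputTag("User", Types.PICKLED_BYTE_ARRAY())
hotel_tag = OutputTag("Hotel", Types.PICKLED_BYTE_ARRAY())

class FanOut(ProcessFunction):
    def process_element(self, value, ctx):
        # Yielding (tag, value) sends the record to that side output
        # instead of the main output.
        yield (user_tag if value["op"] == "User" else hotel_tag), value

env = StreamExecutionEnvironment.get_execution_environment()
fanout = env.from_collection([{"op": "User", "x": 1}, {"op": "Hotel", "x": 2}]).process(FanOut())
fanout.get_side_output(user_tag).print()   # only the "User" records
fanout.get_side_output(hotel_tag).print()  # only the "Hotel" records
env.execute("side-output sketch")

The stateful side streams are then keyed by `e.key` and handed to their FlinkOperator, as wired up in `run()` further below.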
@@ -122,27 +195,28 @@ def __init__(self, operator: StatelessOperator) -> None: self.operator = operator - def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): + def process_element(self, event: Event, ctx: ProcessFunction.Context): + event = profile_event(event, "STATELESS OP INNER ENTRY") - # should be handled by filters on this FlinkOperator - assert(isinstance(event.target, StatelessOpNode)) + assert isinstance(event.target, CallLocal) - logger.debug(f"FlinkStatelessOperator {self.operator.dataflow.name}[{event._id}]: Processing: {event.target.method_type}") - if isinstance(event.target.method_type, InvokeMethod): - result = self.operator.handle_invoke_method(event.target.method_type, variable_map=event.variable_map) + logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Processing: {event.target.method}") + if isinstance(event.target.method, InvokeMethod): + result = self.operator.handle_invoke_method(event.target.method, variable_map=event.variable_map) else: - raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method_type}") + raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method}") - if event.target.assign_result_to is not None: - event.variable_map[event.target.assign_result_to] = result - new_events = event.propogate(result) - if isinstance(new_events, EventResult): - logger.debug(f"FlinkStatelessOperator {self.operator.dataflow.name}[{event._id}]: Returned {new_events}") - yield new_events - else: - logger.debug(f"FlinkStatelessOperator {self.operator.dataflow.name}[{event._id}]: Propogated {len(new_events)} new Events") - yield from new_events + # new_events = list(event.propogate(result, self.operator.dataflows)) + + # if len(new_events) == 1 and isinstance(new_events[0], EventResult): + # logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Returned {new_events[0]}") + # else: + # logger.debug(f"FlinkStatelessOperator {self.operator.name()}[{event._id}]: Propogated {len(new_events)} new Events") + + # yield from new_events + yield (event, result) + class FlinkSelectAllOperator(KeyedProcessFunction): """A process function that yields all keys of a certain class""" @@ -181,21 +255,6 @@ def process_element(self, event: Event, ctx: 'ProcessFunction.Context'): else: raise Exception(f"Unexpected target for SelectAllOperator: {event.target}") -class Result(ABC): - """A `Result` can be either `Arrived` or `NotArrived`. 
It is used in the - FlinkCollectOperator to determine whether all the events have completed - their computation.""" - pass - -@dataclass -class Arrived(Result): - val: Any - -@dataclass -class NotArrived(Result): - pass - - class FlinkCollectOperator(KeyedProcessFunction): """Flink implementation of a merge operator.""" def __init__(self): @@ -203,47 +262,39 @@ def __init__(self): def open(self, runtime_context: RuntimeContext): descriptor = ValueStateDescriptor("merge_state", Types.PICKLED_BYTE_ARRAY()) - self.collection = runtime_context.get_state(descriptor) + self.var_map = runtime_context.get_state(descriptor) def process_element(self, event: Event, ctx: KeyedProcessFunction.Context): - collection: list[Result] = self.collection.value() + event = profile_event(event, "COLLECT OP INNER ENTRY") + + var_map_num_items = self.var_map.value() logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Processing: {event}") + + assert isinstance(event.target, CollectNode) - # for now we assume there is only 1 merge per df - assert event.collect_target is not None - entry: CollectTarget = event.collect_target - target_node: CollectNode = entry.target_node + total_events = event.target.num_events # Add to the map - if collection == None: + if var_map_num_items == None: logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Creating map") - collection = [NotArrived()] * entry.total_items - logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Processed event {entry.result_idx} ({entry.total_items})") - - result = None - try: - result = event.variable_map[target_node.read_results_from] - except KeyError: - pass - - collection[entry.result_idx] = Arrived(result) - self.collection.update(collection) + combined_var_map = {} + num_items = 0 + else: + combined_var_map, num_items = var_map_num_items + + combined_var_map.update(event.variable_map) + num_items += 1 + logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Recieved {num_items}/{total_events} Events") + - # Yield events if the merge is done - if all([isinstance(r, Arrived) for r in collection]): + if num_items == total_events: logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Yielding collection") - - collection = [r.val for r in collection if r.val is not None] # type: ignore (r is of type Arrived) - event.variable_map[target_node.assign_result_to] = collection - new_events = event.propogate(collection) - - self.collection.clear() - if isinstance(new_events, EventResult): - logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Returned {new_events}") - yield new_events - else: - logger.debug(f"FlinkCollectOp [{ctx.get_current_key()}]: Propogated {len(new_events)} new Events") - yield from new_events + event.variable_map = combined_var_map + # yield from event.propogate(None) + yield (event, None) + self.var_map.clear() + else: + self.var_map.update((combined_var_map, num_items)) class ByteSerializer(SerializationSchema, DeserializationSchema): @@ -307,6 +358,15 @@ def timestamp_event(e: Event) -> Event: pass return e +def profile_event(e: Event, ts_name: str) -> Event: + if not PROFILE: + return e + t1 = time.time() + if "prof" not in e.metadata: + e.metadata["prof"] = [] + e.metadata["prof"].append((ts_name, t1)) + return e + def timestamp_result(e: EventResult) -> EventResult: t1 = time.time() e.metadata["out_t"] = t1 @@ -344,7 +404,14 @@ def __init__(self, input_topic="input-topic", output_topic="output-topic", ui_po Warning that this does not work well with run(collect=True)!""" - def init(self, 
kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, parallelism=None): + self.stateless_operators: list[FlinkStatelessOperator] = [] + self.stateful_operators: list[FlinkOperator] = [] + """List of stateful operator streams, which gets appended at `add_operator`.""" + + self.dataflows: dict['DataflowRef', 'DataFlow'] = {} + + + def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, parallelism=None, thread_mode=False): """Initialise & configure the Flink runtime. This function is required before any other calls, and requires a Kafka @@ -370,10 +437,26 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para config.set_string("rest.port", str(self.ui_port)) config.set_integer("python.fn-execution.bundle.time", bundle_time) config.set_integer("python.fn-execution.bundle.size", bundle_size) + + # Thread mode has significant performance impacts, see + # https://flink.apache.org/2022/05/06/exploring-the-thread-mode-in-pyflink/ + # In short: + # much faster single threaded python performance + # GIL becomes an issue if running higher parallelism on the same taskmanager + # can't use with minicluster (e.g. while testing) + if thread_mode: + config.set_string("python.execution-mode", "thread") + + # METRICS + if METRICS: + config.set_boolean("python.metric.enabled", True) + config.set_string("metrics.latency.interval", "500 ms") + config.set_boolean("state.latency-track.keyed-state-enabled", True) + config.set_boolean("taskmanager.network.detailed-metrics", True) # optimize for low latency - # config.set_integer("taskmanager.memory.managed.size", 0) - config.set_integer("execution.buffer-timeout", 5) + config.set_string("execution.batch-shuffle-mode", "ALL_EXCHANGES_PIPELINED") + config.set_string("execution.buffer-timeout", "0 ms") kafka_jar = os.path.join(os.path.abspath(os.path.dirname(__file__)), @@ -387,7 +470,9 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para self.env = StreamExecutionEnvironment.get_execution_environment(config) if parallelism: self.env.set_parallelism(parallelism) - logger.debug(f"FlinkRuntime: parellelism {self.env.get_parallelism()}") + parallelism = self.env.get_parallelism() + + logger.info(f"FlinkRuntime: parallelism {parallelism}") deserialization_schema = ByteSerializer() @@ -408,11 +493,16 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para .set_group_id("test_group_1") .set_starting_offsets(KafkaOffsetsInitializer.earliest()) .set_value_only_deserializer(deserialization_schema) + .set_property("fetch.min.bytes", "1") + .set_property("max.partition.fetch.bytes", "1048576") + .set_property("enable.auto.commit", "false") .build() ) self.kafka_internal_sink = ( KafkaSink.builder() .set_bootstrap_servers(kafka_broker) + .set_property("linger.ms", "0") + .set_property("acks", "1") .set_record_serializer( KafkaRecordSerializationSchema.builder() .set_topic(self.internal_topic) @@ -443,17 +533,18 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para "Kafka External Source" ) .map(lambda x: deserialize_and_timestamp(x)) + .set_parallelism(parallelism=max(parallelism//4, 1)) .name("DESERIALIZE external") # .filter(lambda e: isinstance(e, Event)) # Enforced by `send` type safety ).union( self.env.from_source( kafka_internal_source, WatermarkStrategy.no_watermarks(), - "Kafka External Source" + "Kafka Internal Source" ) .map(lambda x: deserialize_and_timestamp(x)) .name("DESERIALIZE internal") - ) + )#.map(lambda e: 
profile_event(e, "DESERIALIZE DONE")) # Events with a `SelectAllNode` will first be processed by the select # all operator, which will send out multiple other Events that can @@ -473,13 +564,11 @@ def init(self, kafka_broker="localhost:9092", bundle_time=1, bundle_size=5, para event_stream = select_all_stream.union(not_select_all_stream) - self.stateful_op_stream = event_stream - self.stateless_op_stream = event_stream - + # # event_stream = event_stream.disable_chaining() + # self.stateful_op_stream = event_stream + # self.stateless_op_stream = event_stream + self.event_stream = event_stream - self.stateless_op_streams = [] - self.stateful_op_streams = [] - """List of stateful operator streams, which gets appended at `add_operator`.""" logger.debug("FlinkRuntime initialized") @@ -487,25 +576,20 @@ def add_operator(self, op: StatefulOperator): """Add a `FlinkOperator` to the Flink datastream.""" flink_op = FlinkOperator(op) - op_stream = ( - self.stateful_op_stream.filter(lambda e: isinstance(e.target, OpNode) and e.target.entity == flink_op.operator.entity) - .key_by(lambda e: e.variable_map[e.target.read_key_from]) - .process(flink_op) - .name("STATEFUL OP: " + flink_op.operator.entity.__name__) - ) - self.stateful_op_streams.append(op_stream) + self.stateful_operators.append(flink_op) + self.dataflows.update(op.dataflows) def add_stateless_operator(self, op: StatelessOperator): """Add a `FlinkStatelessOperator` to the Flink datastream.""" flink_op = FlinkStatelessOperator(op) - op_stream = ( - self.stateless_op_stream - .filter(lambda e: isinstance(e.target, StatelessOpNode) and e.target.operator.dataflow.name == flink_op.operator.dataflow.name) - .process(flink_op) - .name("STATELESS DATAFLOW: " + flink_op.operator.dataflow.name) - ) - self.stateless_op_streams.append(op_stream) + self.stateless_operators.append(flink_op) + self.dataflows.update(op.dataflows) + + def add_dataflow(self, dataflow: DataFlow): + """When adding extra dataflows, e.g. when testing or for optimized versions""" + self.dataflows[dataflow.ref()] = dataflow + def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="kafka") -> Union[CloseableIterator, None]: """Start ingesting and processing messages from the Kafka source. @@ -514,35 +598,77 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka `cascade.dataflow.dataflow.EventResult`s.""" assert self.env is not None, "FlinkRuntime must first be initialised with `init()`." 
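As an aside on the PROFILE flag added above: `profile_event` appends `(stage_name, timestamp)` pairs to `event.metadata["prof"]` as an event moves through the pipeline, so a completed trace can be folded into per-hop latencies. A small helper to that effect (illustrative only; the stage names match the ones used in this file, the numbers are invented):

from itertools import pairwise  # Python 3.10+

def hop_latencies(prof: list[tuple[str, float]]) -> dict[str, float]:
    """Seconds elapsed between consecutive profiling points of one event."""
    return {f"{a} -> {b}": tb - ta for (a, ta), (b, tb) in pairwise(prof)}

trace = [("FanOut", 100.000), ("STATEFUL OP INNER ENTRY", 100.004), ("Router", 100.009)]
print(hop_latencies(trace))  # e.g. {'FanOut -> STATEFUL OP INNER ENTRY': 0.004, ...}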
- logger.debug("FlinkRuntime merging operator streams...") + logger.info("FlinkRuntime merging operator streams...") + + # create the fanout operator + stateful_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateful_operators} + stateless_tags = { op.operator.name() : OutputTag(op.operator.name()) for op in self.stateless_operators} + collect_tag = OutputTag("__COLLECT__") + result_tag = OutputTag("__EVENT_RESULT__") + logger.debug(f"Stateful tags: {stateful_tags.items()}") + logger.debug(f"Stateless tags: {stateless_tags.items()}") + fanout = self.event_stream.process(FanOutOperator(stateful_tags, stateless_tags)).name("FANOUT OPERATOR")#.disable_chaining() + + # create the streams + self.stateful_op_streams = [] + for flink_op in self.stateful_operators: + tag = stateful_tags[flink_op.operator.name()] + op_stream = ( + fanout + .get_side_output(tag) + .key_by(lambda e: e.key) + .process(flink_op) + .name("STATEFUL OP: " + flink_op.operator.name()) + ) + self.stateful_op_streams.append(op_stream) + + self.stateless_op_streams = [] + for flink_op in self.stateless_operators: + tag = stateless_tags[flink_op.operator.name()] + op_stream = ( + fanout + .get_side_output(tag) + .process(flink_op) + .name("STATELESS OP: " + flink_op.operator.name()) + ) + self.stateless_op_streams.append(op_stream) # Combine all the operator streams if len(self.stateful_op_streams) >= 1: s1 = self.stateful_op_streams[0] rest = self.stateful_op_streams[1:] - operator_streams = s1.union(*rest, *self.stateless_op_streams) + operator_streams = s1.union(*rest, *self.stateless_op_streams)#.map(lambda e: profile_event(e, "OP STREAM UNION")) elif len(self.stateless_op_streams) >= 1: s1 = self.stateless_op_streams[0] rest = self.stateless_op_streams[1:] - operator_streams = s1.union(*rest, *self.stateful_op_streams) + operator_streams = s1.union(*rest, *self.stateful_op_streams)#.map(lambda e: profile_event(e, "OP STREAM UNION")) else: raise RuntimeError("No operators found, were they added to the flink runtime with .add_*_operator()") - merge_op_stream = ( - operator_streams.filter(lambda e: isinstance(e, Event) and isinstance(e.target, CollectNode)) - .key_by(lambda e: e._id) # might not work in the future if we have multiple merges in one dataflow? + + op_routed = operator_streams.process(RouterOperator(self.dataflows, collect_tag, result_tag)).name("ROUTER (OP)") + + collect_stream = ( + op_routed + .get_side_output(collect_tag) + .key_by(lambda e: str(e._id) + "_" + str(e.target.id)) # might not work in the future if we have multiple merges in one dataflow? 
.process(FlinkCollectOperator()) .name("Collect") + .process(RouterOperator(self.dataflows, collect_tag, result_tag)) ) """Stream that ingests events with an `cascade.dataflow.dataflow.CollectNode` target""" + # descriptor = ValueStateDescriptor("dataflows", Types.PICKLED_BYTE_ARRAY()) + + # broadcast_dataflows = self.env.broadcast_variable("dataflows", list(self.dataflows.items())) # union with EventResults or Events that don't have a CollectNode target - ds = merge_op_stream.union(operator_streams.filter(lambda e: not (isinstance(e, Event) and isinstance(e.target, CollectNode)))) + # Output the stream results = ( - ds - .filter(lambda e: isinstance(e, EventResult)) + op_routed.get_side_output(result_tag).union(collect_stream.get_side_output(result_tag)) + # .filter(lambda e: isinstance(e, EventResult)) + # .map(lambda e: profile_event(e, "EXTERNAL SINK")) .map(lambda e: timestamp_result(e)) ) if output == "collect": @@ -554,15 +680,26 @@ def run(self, run_async=False, output: Literal["collect", "kafka", "stdout"]="ka else: raise ValueError(f"Invalid output: {output}") - ds_internal = ds.filter(lambda e: isinstance(e, Event)).map(lambda e: timestamp_event(e)).sink_to(self.kafka_internal_sink).name("INTERNAL KAFKA SINK") + ds_internal = ( + op_routed.union(collect_stream) + # .filter(lambda e: isinstance(e, Event)) + # .map(lambda e: profile_event(e, "INTERNAL SINK")) + .map(lambda e: timestamp_event(e)) + .sink_to(self.kafka_internal_sink) + .name("INTERNAL KAFKA SINK") + ) if run_async: - logger.debug("FlinkRuntime starting (async)") + logger.info("FlinkRuntime starting (async)") self.env.execute_async("Cascade: Flink Runtime") return ds_external # type: ignore (will be CloseableIterator provided the source is unbounded (i.e. Kafka)) else: - logger.debug("FlinkRuntime starting (sync)") + logger.info("FlinkRuntime starting (sync)") self.env.execute("Cascade: Flink Runtime") + + def close(self): + assert self.env is not None, "FlinkRuntime must first be initialised with `init()`." 
+ self.env.close() class FlinkClientSync: def __init__(self, input_topic="input-topic", output_topic="output-topic", kafka_url="localhost:9092", start_consumer_thread: bool = True): @@ -618,17 +755,23 @@ def consume_results(self): def flush(self): self.producer.flush() - def send(self, event: Union[Event, list[Event]], flush=False) -> int: + def send(self, event: Union[Event, list[Event]], flush=False, block=False) -> int: if isinstance(event, list): for e in event: id = self._send(e) else: id = self._send(event) - if flush: + if flush or block: self.producer.flush() - return id + if block: + while (r := self._futures[id]["ret"]) == None: + time.sleep(0.1) + + return r.result + else: + return id def _send(self, event: Event) -> int: diff --git a/src/cascade/runtime/python_runtime.py b/src/cascade/runtime/python_runtime.py index a955e9c..0f14bc0 100644 --- a/src/cascade/runtime/python_runtime.py +++ b/src/cascade/runtime/python_runtime.py @@ -1,43 +1,38 @@ -from logging import Filter import threading -from typing import Type +from typing import List, Union +import cascade from cascade.dataflow.operator import StatefulOperator, StatelessOperator -from cascade.dataflow.dataflow import CollectNode, Event, EventResult, InitClass, InvokeMethod, OpNode, SelectAllNode, StatelessOpNode +from cascade.dataflow.dataflow import CallRemote, CallLocal, CollectNode, Event, EventResult, InitClass, InvokeMethod from queue import Empty, Queue +import time + class PythonStatefulOperator(): def __init__(self, operator: StatefulOperator): self.operator = operator self.states = {} def process(self, event: Event): - assert(isinstance(event.target, OpNode)) - assert(event.target.entity == self.operator.entity) + assert(isinstance(event.target, CallLocal)) - key = event.variable_map[event.target.read_key_from] + key = event.key print(f"PythonStatefulOperator[{self.operator.entity.__name__}[{key}]]: {event}") - if isinstance(event.target.method_type, InitClass): - result = self.operator.handle_init_class(*event.variable_map.values()) + if isinstance(event.target.method, InitClass): + result = self.operator.handle_init_class(*event.variable_map.values()).__dict__ self.states[key] = result - elif isinstance(event.target.method_type, InvokeMethod): + elif isinstance(event.target.method, InvokeMethod): state = self.states[key] result = self.operator.handle_invoke_method( - event.target.method_type, + event.target.method, variable_map=event.variable_map, state=state, ) self.states[key] = state - - elif isinstance(event.target.method_type, Filter): - raise NotImplementedError() - - if event.target.assign_result_to is not None: - event.variable_map[event.target.assign_result_to] = result - - new_events = event.propogate(result) + + new_events = event.propogate(result, cascade.core.dataflows) if isinstance(new_events, EventResult): yield new_events else: @@ -48,27 +43,52 @@ def __init__(self, operator: StatelessOperator): self.operator = operator def process(self, event: Event): - assert(isinstance(event.target, StatelessOpNode)) + assert(isinstance(event.target, CallLocal)) - print(f"PythonStatelessOperator[{self.operator.dataflow.name}]: {event}") + print(f"PythonStatelessOperator[{self.operator.name()}]: {event}") - if isinstance(event.target.method_type, InvokeMethod): + if isinstance(event.target.method, InvokeMethod): result = self.operator.handle_invoke_method( - event.target.method_type, + event.target.method, variable_map=event.variable_map, ) else: - raise Exception(f"A StatelessOperator cannot compute event type: 
{event.target.method_type}") + raise Exception(f"A StatelessOperator cannot compute event type: {event.target.method}") - if event.target.assign_result_to is not None: - event.variable_map[event.target.assign_result_to] = result - - new_events = event.propogate(result) + new_events = event.propogate(result, cascade.core.dataflows) if isinstance(new_events, EventResult): yield new_events else: yield from new_events +class PythonCollectOperator(): + def __init__(self): + self.state = {} + + def process(self, event: Event): + key = event._id + if key not in self.state: + self.state[key] = [event] + else: + self.state[key].append(event) + + assert isinstance(event.target, CollectNode) + n = event.target.num_events + print(f"PythonCollectOperator: collected {len(self.state[key])}/{n} for event {event._id}") + + if len(self.state[key]) == n: + var_map = {} + for event in self.state[key]: + var_map.update(event.variable_map) + + event.variable_map = var_map + new_events = event.propogate(None, cascade.core.dataflows) + if isinstance(new_events, EventResult): + yield new_events + else: + yield from new_events + + class PythonRuntime(): """Simple non-distributed runtime meant for testing that runs Dataflows locally.""" @@ -76,24 +96,34 @@ def __init__(self): self.events = Queue() self.results = Queue() self.running = False - self.statefuloperators: dict[Type, PythonStatefulOperator] = {} + self.statefuloperators: dict[str, PythonStatefulOperator] = {} self.statelessoperators: dict[str, PythonStatelessOperator] = {} + self.collect = PythonCollectOperator() def init(self): pass def _consume_events(self): + try: + self._run() + except Exception as e: + self.running = False + raise e + + def _run(self): self.running = True def consume_event(event: Event): - if isinstance(event.target, OpNode): - yield from self.statefuloperators[event.target.entity].process(event) - elif isinstance(event.target, StatelessOpNode): - yield from self.statelessoperators[event.target.operator.dataflow.name].process(event) + if isinstance(event.target, CallLocal): + if event.dataflow.operator_name in self.statefuloperators: + yield from self.statefuloperators[event.dataflow.operator_name].process(event) + else: + yield from self.statelessoperators[event.dataflow.operator_name].process(event) - elif isinstance(event.target, SelectAllNode): - raise NotImplementedError() elif isinstance(event.target, CollectNode): - raise NotImplementedError() + yield from self.collect.process(event) + + else: + raise ValueError(f"Event target type can only be CallLocal or CollectNode, not {event.target}") events = [] @@ -108,18 +138,17 @@ def consume_event(event: Event): for ev in consume_event(event): if isinstance(ev, EventResult): - print(ev) self.results.put(ev) elif isinstance(ev, Event): events.append(ev) def add_operator(self, op: StatefulOperator): """Add a `StatefulOperator` to the datastream.""" - self.statefuloperators[op.entity] = PythonStatefulOperator(op) + self.statefuloperators[op.name()] = PythonStatefulOperator(op) def add_stateless_operator(self, op: StatelessOperator): """Add a `StatelessOperator` to the datastream.""" - self.statelessoperators[op.dataflow.name] = PythonStatelessOperator(op) + self.statelessoperators[op.name()] = PythonStatelessOperator(op) def send(self, event: Event, flush=None): self.events.put(event) @@ -137,13 +166,23 @@ def __init__(self, runtime: PythonRuntime): self._results_q = runtime.results self._events = runtime.events self.results = {} + self.runtime = runtime - def send(self, event: Event, 
block=True): - self._events.put(event) - - while block: - er: EventResult = self._results_q.get(block=True) - if event._id == er.event_id: + def send(self, event: Union[Event, List[Event]], block=True): + if isinstance(event, list): + for e in event: + self._events.put(e) + id = e._id + else: + self._events.put(event) + id = event._id + + while block and self.runtime.running: + try: + er: EventResult = self._results_q.get(block=False, timeout=0.1) + except Empty: + continue + if id == er.event_id: self.results[er.event_id] = er.result return er.result diff --git a/test_programs/__init__.py b/test_programs/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_programs/expected/__init__.py b/test_programs/expected/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_programs/expected/checkout_item.py b/test_programs/expected/checkout_item.py deleted file mode 100644 index 75a32fa..0000000 --- a/test_programs/expected/checkout_item.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import Any - -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode -from test_programs.target.checkout_item import User, Item - -def buy_item_0_compiled(variable_map: dict[str, Any], state: User) -> Any: - return None - - -def buy_item_1_compiled(variable_map: dict[str, Any], state: User) -> Any: - item_price_0 = variable_map['item_price_0'] - state.balance -= item_price_0 - return state.balance >= 0 - - -def get_price_0_compiled(variable_map: dict[str, Any], state: Item) -> Any: - return state.price - - -def user_buy_item_df(): - df = DataFlow("user.buy_item") - n0 = OpNode(User, InvokeMethod("buy_item_0"), read_key_from="user_key") - n1 = OpNode(Item, InvokeMethod("get_price"), assign_result_to="item_price", read_key_from="item_key") - n2 = OpNode(User, InvokeMethod("buy_item_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n1, n2)) - df.entry = n0 - return df - diff --git a/test_programs/expected/checkout_two_items.py b/test_programs/expected/checkout_two_items.py deleted file mode 100644 index 9849ad5..0000000 --- a/test_programs/expected/checkout_two_items.py +++ /dev/null @@ -1,86 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, OpNode, InvokeMethod, Edge -from cascade.dataflow.operator import StatefulOperator -from test_programs.target.checkout_two_items import User, Item - -def buy_two_items_0_compiled(variable_map: dict[str, Any], state: User) -> Any: - return None - -def buy_two_items_1_compiled(variable_map: dict[str, Any], state: User) -> Any: - item_price_1_0 = variable_map['item_price_1_0'] - item_price_2_0 = variable_map['item_price_2_0'] - total_price_0 = item_price_1_0 + item_price_2_0 - state.balance -= total_price_0 - return state.balance >= 0 - -def get_price_0_compiled(variable_map: dict[str, Any], state: Item) -> Any: - return state.price - - -user_op = StatefulOperator( - User, - { - "buy_two_items_0": buy_two_items_0_compiled, - "buy_two_items_1": buy_two_items_1_compiled - }, - None) - -item_op = StatefulOperator( - Item, {"get_price": get_price_0_compiled}, None -) - -def user_buy_two_items_df(): - df = DataFlow("user.buy_2_items") - n0 = OpNode(User, InvokeMethod("buy_2_items_0"), read_key_from="user_key") - n1 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price_1", - read_key_from="item1_key" - ) - n2 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price_2", - read_key_from="item1_key" 
- ) - n3 = OpNode(User, InvokeMethod("buy_2_items_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n0, n2)) - df.add_edge(Edge(n1, n2)) - df.add_edge(Edge(n2, n3)) - df.entry = n0 - return df - - -# For future optimizations (not used) -def user_buy_two_items_df_parallelized(): - df = DataFlow("user.buy_2_items") - n0 = OpNode(User, InvokeMethod("buy_2_items_0"), read_key_from="user_key") - n3 = CollectNode(assign_result_to="item_prices", read_results_from="item_price") - n1 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 0), - read_key_from="item1_key" - ) - n2 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 1), - read_key_from="item1_key" - ) - n4 = OpNode(User, InvokeMethod("buy_2_items_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n0, n2)) - df.add_edge(Edge(n1, n3)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - df.entry = n0 - return df - -user_op.dataflows = { - "buy_two_items": user_buy_two_items_df(), -} \ No newline at end of file diff --git a/test_programs/expected/deathstar_recommendation.py b/test_programs/expected/deathstar_recommendation.py deleted file mode 100644 index 8a8a727..0000000 --- a/test_programs/expected/deathstar_recommendation.py +++ /dev/null @@ -1,112 +0,0 @@ -from typing import Any, Literal -from cascade.dataflow.dataflow import CollectNode, DataFlow, Edge, InvokeMethod, OpNode, SelectAllNode, StatelessOpNode -from cascade.dataflow.operator import StatelessOperator - - -def get_recs_if_cond(variable_map: dict[str, Any]): - return variable_map["requirement"] == "distance" - -# list comprehension entry -def get_recs_if_body_0(variable_map: dict[str, Any]): - pass - - -# list comprehension body -def get_recs_if_body_1(variable_map: dict[str, Any]): - hotel_geo = variable_map["hotel_geo"] - lat, lon = variable_map["lat"], variable_map["lon"] - dist = hotel_geo.distance_km(lat, lon) - return (dist, variable_map["hotel_key"]) - -# after list comprehension -def get_recs_if_body_2(variable_map: dict[str, Any]): - distances = variable_map["distances"] - min_dist = min(distances, key=lambda x: x[0])[0] - variable_map["res"] = [hotel for dist, hotel in distances if dist == min_dist] - - -def get_recs_elif_cond(variable_map: dict[str, Any]): - return variable_map["requirement"] == "price" - - -# list comprehension entry -def get_recs_elif_body_0(variable_map: dict[str, Any]): - pass - - -# list comprehension body -def get_recs_elif_body_1(variable_map: dict[str, Any]): - return (variable_map["hotel_price"], variable_map["hotel_key"]) - -# after list comprehension -def get_recs_elif_body_2(variable_map: dict[str, Any]): - prices = variable_map["prices"] - min_price = min(prices, key=lambda x: x[0])[0] - variable_map["res"] = [hotel for price, hotel in prices if price == min_price] - - - -# a future optimization might instead duplicate this piece of code over the two -# branches, in order to reduce the number of splits by one -def get_recs_final(variable_map: dict[str, Any]): - return variable_map["res"] - - -recommend_op = StatelessOperator({ - "get_recs_if_cond": get_recs_if_cond, - "get_recs_if_body_0": get_recs_if_body_0, - "get_recs_if_body_1": get_recs_if_body_1, - "get_recs_if_body_2": get_recs_if_body_2, - "get_recs_elif_cond": get_recs_elif_cond, - "get_recs_elif_body_0": get_recs_elif_body_0, - "get_recs_elif_body_1": get_recs_elif_body_1, - 
"get_recs_elif_body_2": get_recs_elif_body_2, - "get_recs_final": get_recs_final, -}, None) - -def get_recommendations_df(): - df = DataFlow("get_recommendations") - n1 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_if_cond"), is_conditional=True) - n2 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_if_body_0")) - n3 = OpNode(Hotel, InvokeMethod("get_geo"), assign_result_to="hotel_geo", read_key_from="hotel_key") - n4 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_if_body_1"), assign_result_to="distance") - n5 = CollectNode("distances", "distance") - n6 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_if_body_2")) - ns1 = SelectAllNode(Hotel, n5, assign_key_to="hotel_key") - - n7 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_elif_cond"), is_conditional=True) - n8 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_elif_body_0")) - n9 = OpNode(Hotel, InvokeMethod("get_price"), assign_result_to="hotel_price", read_key_from="hotel_key") - n10 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_elif_body_1"), assign_result_to="price") - n11 = CollectNode("prices", "price") - n12 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_elif_body_2")) - ns2 = SelectAllNode(Hotel, n11, assign_key_to="hotel_key") - - - n13 = StatelessOpNode(recommend_op, InvokeMethod("get_recs_final")) - - df.add_edge(Edge(n1, ns1, if_conditional=True)) - df.add_edge(Edge(n1, n7, if_conditional=False)) - df.add_edge(Edge(n7, ns2, if_conditional=True)) - df.add_edge(Edge(n7, n13, if_conditional=False)) - - # if branch - df.add_edge(Edge(ns1, n2)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - df.add_edge(Edge(n4, n5)) - df.add_edge(Edge(n5, n6)) - df.add_edge(Edge(n6, n13)) - - # elif branch - df.add_edge(Edge(ns2, n8)) - df.add_edge(Edge(n8, n9)) - df.add_edge(Edge(n9, n10)) - df.add_edge(Edge(n10, n11)) - df.add_edge(Edge(n11, n12)) - df.add_edge(Edge(n12, n13)) - - df.entry = n1 - return df - -recommend_op.dataflow = get_recommendations_df() \ No newline at end of file diff --git a/test_programs/expected/deathstar_search.py b/test_programs/expected/deathstar_search.py deleted file mode 100644 index cd20593..0000000 --- a/test_programs/expected/deathstar_search.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import Any - -from cascade.dataflow.dataflow import CollectNode, DataFlow, Edge, InvokeMethod, OpNode, SelectAllNode, StatelessOpNode -from cascade.dataflow.operator import StatelessOperator - -# predicate 1 -def search_nearby_compiled_0(variable_map: dict[str, Any]): - pass - -# predicate 2 -def search_nearby_compiled_1(variable_map: dict[str, Any]): - hotel_geo: Geo = variable_map["hotel_geo"] - lat, lon = variable_map["lat"], variable_map["lon"] - dist = hotel_geo.distance_km(lat, lon) - variable_map["dist"] = dist - return dist < 10 - - -# body -def search_nearby_compiled_2(variable_map: dict[str, Any]): - return (variable_map["dist"], variable_map["hotel_key"]) - -# next line -def search_nearby_compiled_3(variable_map: dict[str, Any]): - distances = variable_map["distances"] - hotels = [hotel for dist, hotel in sorted(distances)[:5]] - return hotels - - -search_op = StatelessOperator({ - "search_nearby_compiled_0": search_nearby_compiled_0, - "search_nearby_compiled_1": search_nearby_compiled_1, - "search_nearby_compiled_2": search_nearby_compiled_2, - "search_nearby_compiled_3": search_nearby_compiled_3, -}, None) - -def search_nearby_df(): - df = DataFlow("search_nearby") - n1 = StatelessOpNode(search_op, InvokeMethod("search_nearby_compiled_0")) - n2 
= OpNode(Hotel, InvokeMethod("get_geo"), assign_result_to="hotel_geo", read_key_from="hotel_key") - n3 = StatelessOpNode(search_op, InvokeMethod("search_nearby_compiled_1"), is_conditional=True) - n4 = StatelessOpNode(search_op, InvokeMethod("search_nearby_compiled_2"), assign_result_to="search_body") - n5 = CollectNode("distances", "search_body") - n0 = SelectAllNode(Hotel, n5, assign_key_to="hotel_key") - - n6 = StatelessOpNode(search_op, InvokeMethod("search_nearby_compiled_3")) - - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n1, n2)) - df.add_edge(Edge(n2, n3)) - - # if true make the body - df.add_edge(Edge(n3, n4, if_conditional=True)) - df.add_edge(Edge(n4, n5)) - # if false skip past - df.add_edge(Edge(n3, n5, if_conditional=False)) - - df.add_edge(Edge(n5, n6)) - - df.entry = n0 - return df - -search_op.dataflow = search_nearby_df() \ No newline at end of file diff --git a/test_programs/expected/deathstar_user.py b/test_programs/expected/deathstar_user.py deleted file mode 100644 index 64985ea..0000000 --- a/test_programs/expected/deathstar_user.py +++ /dev/null @@ -1,57 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import DataFlow, Edge, InvokeMethod, OpNode -from cascade.dataflow.operator import StatefulOperator - -def order_compiled_entry_0(variable_map: dict[str, Any], state: User) -> Any: - pass - -def order_compiled_entry_1(variable_map: dict[str, Any], state: User) -> Any: - pass - -def order_compiled_if_cond(variable_map: dict[str, Any], state: User) -> Any: - return variable_map["hotel_reserve"] and variable_map["flight_reserve"] - -def order_compiled_if_body(variable_map: dict[str, Any], state: User) -> Any: - return True - -def order_compiled_else_body(variable_map: dict[str, Any], state: User) -> Any: - return False - -user_op = StatefulOperator( - User, - { - "order_compiled_entry_0": order_compiled_entry_0, - "order_compiled_entry_1": order_compiled_entry_1, - "order_compiled_if_cond": order_compiled_if_cond, - "order_compiled_if_body": order_compiled_if_body, - "order_compiled_else_body": order_compiled_else_body - }, - {} -) - -# For now, the dataflow will be serial instead of parallel (calling hotel, then -# flight). Future optimizations could try to automatically parallelize this. -# There could definetly be some slight changes to this dataflow depending on -# other optimizations aswell. (A naive system could have an empty first entry -# before the first entity call). 
-def user_order_df(): - df = DataFlow("user_order") - n0 = OpNode(User, InvokeMethod("order_compiled_entry_0"), read_key_from="user_key") - n1 = OpNode(Hotel, InvokeMethod("reserve"), assign_result_to="hotel_reserve", read_key_from="hotel_key") - n2 = OpNode(User, InvokeMethod("order_compiled_entry_1"), read_key_from="user_key") - n3 = OpNode(Flight, InvokeMethod("reserve"), assign_result_to="flight_reserve", read_key_from="flight_key") - n4 = OpNode(User, InvokeMethod("order_compiled_if_cond"), is_conditional=True, read_key_from="user_key") - n5 = OpNode(User, InvokeMethod("order_compiled_if_body"), read_key_from="user_key") - n6 = OpNode(User, InvokeMethod("order_compiled_else_body"), read_key_from="user_key") - - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n1, n2)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - df.add_edge(Edge(n4, n5, if_conditional=True)) - df.add_edge(Edge(n4, n6, if_conditional=False)) - - df.entry = n0 - return df - -user_op.dataflows["order"] = user_order_df() diff --git a/test_programs/target/__init__.py b/test_programs/target/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_programs/target/checkout_item.py b/test_programs/target/checkout_item.py deleted file mode 100644 index 4bbca40..0000000 --- a/test_programs/target/checkout_item.py +++ /dev/null @@ -1,21 +0,0 @@ -import cascade - -@cascade.cascade -class User: - def __init__(self, key: str, balance: int): - self.key: str = key - self.balance: int = balance - - def buy_item(self, item: 'Item') -> bool: - item_price = item.get_price() # SSA - self.balance -= item_price - return self.balance >= 0 - -@cascade.cascade -class Item: - def __init__(self, key: str, price: int): - self.key: str = key - self.price: int = price - - def get_price(self) -> int: - return self.price \ No newline at end of file diff --git a/test_programs/target/checkout_two_items.py b/test_programs/target/checkout_two_items.py deleted file mode 100644 index f6f6278..0000000 --- a/test_programs/target/checkout_two_items.py +++ /dev/null @@ -1,23 +0,0 @@ -import cascade - -@cascade.cascade -class User: - def __init__(self, key: str, balance: int): - self.key: str = key - self.balance: int = balance - - def buy_two_items(self, item_1: 'Item', item_2: 'Item') -> bool: - item_price_1 = item_1.get_price() - item_price_2 = item_2.get_price() - total_price = item_price_1 + item_price_2 - self.balance -= total_price - return self.balance >= 0 - -@cascade.cascade -class Item: - def __init__(self, key: str, price: int): - self.key: str = key - self.price: int = price - - def get_price(self) -> int: - return self.price \ No newline at end of file diff --git a/test_programs/target/deathstar_recommendation.py b/test_programs/target/deathstar_recommendation.py deleted file mode 100644 index 5d6f12d..0000000 --- a/test_programs/target/deathstar_recommendation.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Literal -import cascade - -# Stateless -@cascade.cascade -class Recommendation(): - @staticmethod - def get_recommendations(requirement: Literal["distance", "price"], lat: float, lon: float) -> list[Hotel]: - if requirement == "distance": - distances = [(hotel.geo.distance_km(lat, lon), hotel) - for hotel in Hotel.__all__()] - min_dist = min(distances, key=lambda x: x[0]) - res = [hotel for dist, hotel in distances if dist == min_dist] - elif requirement == "price": - prices = [(hotel.price, hotel) - for hotel in Hotel.__all__()] - min_price = min(prices, key=lambda x: x[0]) - res = [hotel for rate, hotel in 
prices if rate == min_price] - - return res \ No newline at end of file diff --git a/test_programs/target/deathstar_search.py b/test_programs/target/deathstar_search.py deleted file mode 100644 index 845e709..0000000 --- a/test_programs/target/deathstar_search.py +++ /dev/null @@ -1,14 +0,0 @@ -import cascade - -# Stateless -@cascade.cascade -class Search(): - # Get the 5 nearest hotels - @staticmethod - def nearby(lat: float, lon: float, in_date: int, out_date: int): - distances = [ - (dist, hotel) - for hotel in Hotel.__all__() - if (dist := hotel.geo.distance_km(lat, lon)) < 10] - hotels = [hotel for dist, hotel in sorted(distances)[:5]] - return hotels \ No newline at end of file diff --git a/test_programs/target/deathstar_user.py b/test_programs/target/deathstar_user.py deleted file mode 100644 index dd87723..0000000 --- a/test_programs/target/deathstar_user.py +++ /dev/null @@ -1,16 +0,0 @@ -import cascade - -@cascade.cascade -class User(): - def __init__(self, user_id: str, password: str): - self.id = user_id - self.password = password - - def check(self, password): - return self.password == password - - def order(self, flight: Flight, hotel: Hotel): - if hotel.reserve() and flight.reserve(): - return True - else: - return False \ No newline at end of file diff --git a/tests/frontend/ast_visitors/test_self_rename.py b/tests/frontend/ast_visitors/test_self_rename.py new file mode 100644 index 0000000..0cae724 --- /dev/null +++ b/tests/frontend/ast_visitors/test_self_rename.py @@ -0,0 +1,60 @@ +from cascade.frontend.ast_visitors.replace_name import ReplaceSelfWithState +from cascade.preprocessing import setup_cfg +from klara.core import nodes + +def test_replace_self_with_state(): + code = "self.balance = self.balance + 10" + cfg, tree = setup_cfg(code, preprocess=False) + ReplaceSelfWithState.replace(tree) + + assert isinstance(tree, nodes.Module) + node = tree.body[0] + assert isinstance(node, nodes.Assign) + assert isinstance(node.targets, list) + assert isinstance(node.value, nodes.BinOp) + assert str(node.targets[0]) == "__state['balance']" + assert str(node.value.left) == "__state['balance']" + +def test_replace_self_with_state_dict(): + code = "self.data['b'] = self.data['a'] + self.balance" + cfg, tree = setup_cfg(code, preprocess=False) + ReplaceSelfWithState.replace(tree) + + + assert isinstance(tree, nodes.Module) + node = tree.body[0] + assert isinstance(node, nodes.Assign) + assert isinstance(node.targets, list) + assert isinstance(node.value, nodes.BinOp) + assert str(node.targets[0]) == "__state['data']['b']" + assert str(node.value.left) == "__state['data']['a']" + assert str(node.value.right) == "__state['balance']" + +def test_replace_self_assign(): + code = "__ret_2 = self.price" + cfg, tree = setup_cfg(code, preprocess=False) + ReplaceSelfWithState.replace(tree) + + + assert isinstance(tree, nodes.Module) + node = tree.body[0] + assert isinstance(node, nodes.Assign) + assert isinstance(node.targets, list) + assert isinstance(node.value, nodes.Subscript), type(node.value) + assert str(node.targets[0]) == "__ret_2" + assert str(node.value) == "__state['price']" + print(str(node)) + +def test_replace_self_assign_after_return(): + code = "__ret_2 = self.price" + cfg, tree = setup_cfg(code, preprocess=False) + ReplaceSelfWithState.replace(tree) + + assert isinstance(tree, nodes.Module) + node = tree.body[0] + assert isinstance(node, nodes.Assign) + assert isinstance(node.targets, list) + assert isinstance(node.value, nodes.Subscript), type(node.value) + assert 
str(node.targets[0]) == "__ret_2" + assert str(node.value) == "__state['price']" + print(str(node)) \ No newline at end of file diff --git a/tests/frontend/ast_visitors/test_simplify_returns.py b/tests/frontend/ast_visitors/test_simplify_returns.py new file mode 100644 index 0000000..02199b7 --- /dev/null +++ b/tests/frontend/ast_visitors/test_simplify_returns.py @@ -0,0 +1,60 @@ +from cascade.frontend.ast_visitors.simplify_returns import SimplifyReturns, simplify_returns +from cascade.frontend.generator.unparser import unparse +from cascade.preprocessing import setup_cfg +from klara.core import nodes +from klara.core.tree_rewriter import AstBuilder +from klara.core.cfg import Cfg + +def setup_cfg_no_ssa(code: str) -> Cfg: + as_tree = AstBuilder().string_build(code) + cfg = Cfg(as_tree) + return cfg, as_tree + +def test_simplify_return_state(): + code = "return self.balance" + cfg, tree = setup_cfg_no_ssa(code) + for s in tree.get_statements(): + print(repr(s)) + sr = SimplifyReturns.replace(tree) + simplify_returns(tree) + + for s in tree.get_statements(): + print(repr(s)) + +def test_simplify_return_name(): + code = "return cat" + cfg, tree = setup_cfg_no_ssa(code) + for s in tree.get_statements(): + print(repr(s)) + sr = SimplifyReturns.replace(tree) + simplify_returns(tree) + + for s in tree.get_statements(): + print(repr(s)) + +def test_simplify_return_binop(): + code = """a = 1 +return 4+1""" + cfg, tree = setup_cfg_no_ssa(code) + + for s in tree.get_statements(): + print(repr(s)) + simplify_returns(tree) + + for s in tree.get_statements(): + print(repr(s)) + +def test_simplify_return_multiple(): + code = """a = 1 +if a == 1: + return 3 + 2 +else: + return a""" + cfg, tree = setup_cfg_no_ssa(code) + + for b in tree.get_statements(): + print(repr(b)) + simplify_returns(tree) + + for b in tree.get_statements(): + print(repr(b)) \ No newline at end of file diff --git a/tests/frontend/ast_visitors/test_variable_getter.py b/tests/frontend/ast_visitors/test_variable_getter.py index d38553d..ea168f4 100644 --- a/tests/frontend/ast_visitors/test_variable_getter.py +++ b/tests/frontend/ast_visitors/test_variable_getter.py @@ -1,12 +1,10 @@ -from cascade.frontend.util import setup_cfg +from cascade.preprocessing import setup_cfg from cascade.frontend.ast_visitors.variable_getter import VariableGetter -from klara.core.tree_rewriter import AstBuilder - def test_variable_getter(): code = "item_price = item.get_price()" - cfg = setup_cfg(code) + cfg, _ = setup_cfg(code) ssa_code = cfg.block_list[1].ssa_code node, = ssa_code.code_list variable_getter = VariableGetter.get_variable(node) @@ -14,4 +12,15 @@ def test_variable_getter(): values_as_string = [repr(v) for v in variable_getter.values] assert targets_as_string == ['item_price_0'] assert values_as_string == ['item'] - \ No newline at end of file + + +def test_variable_getter_attr(): + code = "self.balance = self.balance + 1" + cfg, _ = setup_cfg(code, preprocess=False) + ssa_code = cfg.block_list[1].ssa_code + node, = ssa_code.code_list + variable_getter = VariableGetter.get_variable(node) + targets_as_string = [repr(t) for t in variable_getter.targets] + values_as_string = [repr(v) for v in variable_getter.values] + assert targets_as_string == ['self'] + assert values_as_string == ['self'] \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_branches.py b/tests/frontend/dataflow_analysis/test_branches.py new file mode 100644 index 0000000..f2bc08e --- /dev/null +++ b/tests/frontend/dataflow_analysis/test_branches.py 
@@ -0,0 +1,167 @@ +from textwrap import dedent + +from cascade.dataflow.dataflow import DataFlow, DataflowRef, IfNode +from cascade.frontend.generator.dataflow_builder import DataflowBuilder +from cascade.preprocessing import setup_cfg +from klara.core import nodes + + +def test_easy_branching(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + cond = self.balance - item_price >= 0 + if cond: + self.balance = self.balance - item_price + else: + x = 10 + return self.balance""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + assert len(df.nodes) == 7 + ifnode = None + for node in df.nodes.values(): + if isinstance(node, IfNode): + assert ifnode is None + ifnode = node + + assert ifnode is not None + assert len(ifnode.outgoing_edges) == 2 + + +def test_complex_predicate(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + if self.balance >= item_price: + self.balance = self.balance - item_price + else: + x = 10 + return self.balance""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + assert len(df.nodes) == 6, "complex predicate should create a temp variable assignment" + + +def test_multiple_return(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + cond = self.balance - item_price >= 0 + if cond: + item_price = item.get_price() + self.balance = self.balance - item_price + return "item bought" + else: + item_price = item.get_price() + msg = str(item_price) + " is too expensive!" 
+ return msg""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + +def test_no_else(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + cond1 = self.balance - item_price >= 0 + if cond1: + item_price = item.get_price() + self.balance = self.balance - item_price + x = 0 + return item_price""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + assert len(df.nodes) == 6 + +def test_nested(): + program: str = dedent(""" + class User: + def buy_item(self, item: 'Item') -> int: + item_price = item.get_price() + cond1 = self.balance - item_price >= 0 + if cond1: + item_price = item.get_price() + if True: + x = 20 + self.balance = self.balance - item_price + return "item bought" + else: + if True: + x = 20 + else: + x = 30 + item_price = item.get_price() + msg = "item is too expensive!" 
+ return msg""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("User", "__init__"): DataFlow("__init__", "User", ["username", "balance"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + print(df.to_dot()) + assert len(df.nodes) == 12 \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py index eeebf60..68c5a9b 100644 --- a/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py +++ b/tests/frontend/dataflow_analysis/test_dataflow_graph_builder.py @@ -1,51 +1,81 @@ from textwrap import dedent -import networkx as nx - from klara.core.cfg import Cfg from klara.core import nodes -from cascade.frontend.dataflow_analysis.dataflow_graph_builder import DataflowGraphBuilder -from cascade.frontend.intermediate_representation import Statement, StatementDataflowGraph -from cascade.frontend.util import setup_cfg +from cascade.frontend.cfg.cfg_builder import ControlFlowGraphBuilder +from cascade.frontend.cfg import Statement, ControlFlowGraph +from cascade.preprocessing import setup_cfg -def get_statment(df: StatementDataflowGraph, v: nodes.Statement): - return next(s for s in df.graph.nodes if s.block == v) +def test_linear_program(): + program: str = dedent(""" + class Test: + + def get_total(item1: Stock, item2: Stock): + q1 = item1.get_quantity() + q2 = item2.get_quantity() + total = Adder.add(q1, q2) + return total""") + + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + # TODO: check that the produced ssa code made variables for + # - item1.get_quantity() + # - item2.get_quantity() + df: ControlFlowGraph = ControlFlowGraphBuilder.build([get_total] + get_total.body, globals=[]) + for n in df.graph.nodes: + print(n) + for u, v in df.graph.edges: + print(u.block_num, v.block_num) + # print(df.graph.edges) -def edge_exists_between(df: StatementDataflowGraph, v: nodes.Statement, n: nodes.Statement): - statement_v: Statement = get_statment(df, v) - statement_n: Statement = get_statment(df, n) - assert (statement_v, statement_n) in df.graph.edges +def test_ssa(): + program: str = dedent(""" + class Test: + + def get_total(item1: Stock, item2: Stock): + total = Adder.add(item1.get_quantity(), item2.get_quantity()) + return total""") + + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + # TODO: check that the produced ssa code made variables for + # - item1.get_quantity() + # - item2.get_quantity() + df: ControlFlowGraph = ControlFlowGraphBuilder.build([get_total] + get_total.body, globals=[]) + print(df.graph.nodes) + print(df.graph.edges) -def assert_expected_edges(df, expected_edges): - edges: list[nodes.Statement] = [(n.block, v.block) for n,v in df.graph.edges] - assert edges == expected_edges -# TODO: FOr instance in the example below there is a indirect dependency between update balence and -# returning the balence >= 0. 
(side effect dependency) -def test_simple_dataflow_graph(): +def test_if_else_branches(): program: str = dedent(""" - class User: - - def buy_item(self, item: 'Item') -> bool: - item_price = item.get_price() - self.balance -= item_price - return self.balance >= 0 - """) - cfg: Cfg = setup_cfg(program) + class Test: + + def test_branches(item1: Stock, item2: Stock): + q = item1.get_quantity() + cond = q < 10 + if cond: + a = item2.get_quantity() + else: + a = 0 + return a""") + + cfg, _ = setup_cfg(program) blocks = cfg.block_list - user_class: nodes.Block = blocks[2] - buy_item: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] - buy_item_body_0 = buy_item.body[0] - buy_item_body_1 = buy_item.body[1] - buy_item_body_2 = buy_item.body[2] - df: StatementDataflowGraph = DataflowGraphBuilder.build([buy_item] + buy_item.body) - expected_edges = [ - (buy_item, buy_item_body_0), - (buy_item, buy_item_body_1), - (buy_item, buy_item_body_2), - (buy_item_body_0, buy_item_body_1) - ] - assert_expected_edges(df, expected_edges) + print(blocks) + test_class: nodes.Block = blocks[2] + test: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + # TODO: check that the produced ssa code made variables for + # - item1.get_quantity() + # - item2.get_quantity() + df: ControlFlowGraph = ControlFlowGraphBuilder.build([test] + test.body, globals=[]) + # print(df.graph.nodes) + # print(df.graph.edges) \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_entities.py b/tests/frontend/dataflow_analysis/test_entities.py new file mode 100644 index 0000000..6a3ffca --- /dev/null +++ b/tests/frontend/dataflow_analysis/test_entities.py @@ -0,0 +1,196 @@ +from dataclasses import dataclass +from textwrap import dedent + + +from klara.core.cfg import Cfg +from klara.core import nodes + +from cascade.dataflow.dataflow import CallRemote, CallLocal, DataFlow, DataflowRef + +from cascade.frontend.generator.dataflow_builder import DataflowBuilder +from cascade.preprocessing import setup_cfg + +def test_call_entity(): + program: str = dedent(""" + class Test: + + def get_total(item1: Stock, item2: Stock): + a = item1.get_quantity() + b = item2.get_quantity() + return a+b""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + sf.build_cfg() + + dataflows = { + DataflowRef("Test", "get_total"): DataFlow("get_total", "Test", ["item1", "item2"]), + DataflowRef("Stock", "get_quantity"): DataFlow("get_quantity", "Stock", []) + } + + df = sf.build(dataflows, "Test") + + ## TODO: check blocks/df + assert len(df.nodes) == 4 + assert len(df.entry) == 1 + entry = df.entry[0] + assert isinstance(entry, CallRemote) + next = df.get_neighbors(entry) + assert len(next) == 1 + next = next[0] + assert isinstance(next, CallRemote) + next = df.get_neighbors(next) + assert len(next) == 1 + next = next[0] + assert isinstance(next, CallLocal) + + +def test_simple_block(): + program: str = dedent(""" + class Test: + + def add(x: int, y: int): + return x+y""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + + dataflows = { + DataflowRef("Test", "add"): DataFlow("get_total", "Test", ["x", "y"]), + } + + df = sf.build(dataflows, "Test") + + assert len(df.blocks) == 1 + block = list(df.blocks.values())[0] + 
print(block.function_string) + var_map = {"x_0": 3, "y_0":5 } + block.call_block(var_map, None) + assert sorted(list(var_map.values())) == [3, 5, 8] + + +def test_state(): + program = dedent(""" +class User: + def buy_item(self, item: 'Item') -> bool: + item_price = item.get_price() # SSA + self.balance = self.balance - item_price + return self.balance >= 0 +""") + + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + user_class = blocks[2] + buy_item: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(buy_item) + + dataflows = { + DataflowRef("User", "buy_item"): DataFlow("buy_item", "User", ["item"]), + DataflowRef("Item", "get_price"): DataFlow("get_price", "Item", []), + } + + df = sf.build(dataflows, "User") + + blocks = list(df.blocks.values()) + + assert len(blocks) == 1 + func = blocks[0].call_block + print(blocks[0].function_string) + + @dataclass + class User: + username: str + balance: int + + user = User("a", 20) + func({"item_price_0": 10}, user.__dict__) + assert user.balance == 10 + + func({"item_price_0": 13}, user.__dict__) + assert user.balance == -3 + +def test_dict_state(): + program = dedent(""" +class ComposeReview: + def upload_unique_id(self, review_id: int): + self.review_data["review_id"] = review_id +""") + + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + user_class = blocks[2] + upload_unique: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(upload_unique) + + dataflows = { + DataflowRef("ComposeReview", "upload_unique_id"): DataFlow("upload_unique_id", "ComposeReview", ["review_id"]), + DataflowRef("ComposeReview", "__init__"): DataFlow("__init__", "ComposeReview", ["req_id"]), + } + + df = sf.build(dataflows, "ComposeReview") + + + blocks = list(df.blocks.values()) + assert len(blocks) == 1 + + + @dataclass + class ComposeReview: + req_id: str + review_data: dict + + func = blocks[0].call_block + + print(blocks[0].function_string) + + compose_review = ComposeReview("req", {}) + func({"review_id_0": 123}, compose_review.__dict__) + assert compose_review.review_data["review_id"] == 123 + + +def test_import(): + program = dedent(""" +class Randomer: + @staticmethod + def rand(): + r = random.random() + return r +""") + + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + user_class = blocks[2] + upload_unique: nodes.FunctionDef = user_class.blocks[1].ssa_code.code_list[0] + + import random + sf = DataflowBuilder(upload_unique, {'random': random}) + sf.build_cfg() + for node in sf.cfg.get_nodes(): + print(node) + + dataflows = { + DataflowRef("Randomer", "rand"): DataFlow("rand", "Randomer", []), + } + + df = sf.build(dataflows, "Randomer") + + for block in df.blocks.values(): + print(block.function_string) + + rands = set() + for _ in range(10): + var_map = {} + df.blocks['rand_0'].call_block(variable_map=var_map, __state=None) + assert len(var_map) == 1 + r = var_map['r_0'] + rands.add(r) + + assert len(rands) == 10 \ No newline at end of file diff --git a/tests/frontend/dataflow_analysis/test_split_functions.py b/tests/frontend/dataflow_analysis/test_split_functions.py new file mode 100644 index 0000000..8f25dfb --- /dev/null +++ b/tests/frontend/dataflow_analysis/test_split_functions.py @@ -0,0 +1,155 @@ +from textwrap import dedent + +import networkx as nx + +from klara.core.cfg import Cfg +from klara.core import nodes + +from cascade.dataflow.dataflow import DataFlow, DataflowRef +from cascade.frontend.generator.dataflow_builder import DataflowBuilder, blocked_cfg, 
split_cfg +from cascade.frontend.cfg.control_flow_graph import ControlFlowGraph +from cascade.preprocessing import setup_cfg + +def test_entity_calls(): + program: str = dedent(""" + class Test: + + def get_total(item1: Stock, item2: Stock, y: int): + a = 10 + b = a + 3 + x = item1.get_quantity() + y = item2.get_quantity() + total = Adder.add(x, y) + total = total + a + b + total = total - 23 + return total""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + + sf = DataflowBuilder(get_total) + sf.build_cfg() + + dataflows = { + DataflowRef("Adder", "add"): DataFlow("add", "Adder", ["a", "b"]), + DataflowRef("Stock", "get_quantity"): DataFlow("get_quantity", "Item", []), + DataflowRef("Test", "get_total"): DataFlow("get_total", "Test", []) + } + + + + df = sf.build_df(dataflows, "Test") + print(df.to_dot()) + for block in df.blocks.values(): + print(block.function_string) + + # TODO: Check # entity calls, # of local calls + assert len(df.nodes) == 6 + assert len(df.blocks) == 2 + +def test_branching(): + program: str = dedent(""" + class Test: + def test_branching(self) -> int: + pre = 10 + if True: + then = 20 + and_then = 10 + else: + orelse = 30 + orelser = 30 + post = 40 + return 50""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + sf.build_cfg() + print(sf.cfg.to_dot()) + new = blocked_cfg(sf.cfg.graph, sf.cfg.get_single_source()) + + print_digraph(new) + + print_digraph(split_cfg(new)) + + assert len(new.nodes) == 5 + + dataflows = { + DataflowRef("Test", "test_branching"): DataFlow("test_branching", "Test", []) + } + + + df = sf.build_df(dataflows, "Test") + print(df.to_dot()) + for block in df.blocks.values(): + print(block.function_string) + assert len(df.nodes) == 6 + assert len(df.blocks) == 4 + +def print_digraph(graph: nx.DiGraph): + for node in graph.nodes: + for s in node: + print(s.block_num, end=" ") + print() + for u, v, c in graph.edges.data('type', default=None): + print(u[0].block_num, end=" ") + print("->", end= " ") + print(v[0].block_num, end=" ") + if c is not None: + print(f' [label="{c}"]', end=" ") + print() + +def test_branching_with_entity_calls(): + program: str = dedent(""" + class Test: + def test_branching(self) -> int: + pre = 10 + if True: + then = 10 + and_then = 10 + else: + orelse = 30 + y = 10 + orelser = Entity.call() + orelserer = 40 + x = 10 + post = 40 + return 50""") + cfg, _ = setup_cfg(program) + blocks = cfg.block_list + test_class: nodes.Block = blocks[2] + get_total: nodes.FunctionDef = test_class.blocks[1].ssa_code.code_list[0] + + sf = DataflowBuilder(get_total) + sf.build_cfg() + print(sf.cfg.to_dot()) + new = blocked_cfg(sf.cfg.graph, sf.cfg.get_single_source()) + + assert len(list(new.nodes)) == 5 + print(new.nodes) + new_split = split_cfg(new) + print(new_split.nodes) + assert len(list(new_split.nodes)) == 8 + + dataflows = { + DataflowRef("Test", "test_branching"): DataFlow("test_branching", "Test", []), + DataflowRef("Entity", "call"): DataFlow("call", "Entity", []) + } + + + # TODO: Check # entity calls, # of blocks, # of local calls + + df = sf.build_df(dataflows, "Test") + print(df.to_dot()) + for block in df.blocks.values(): + print(block.function_string) + + assert len(df.nodes) == 8 + assert len(df.blocks) == 5 + +def test_block_merging(): + 
raise NotImplementedError() \ No newline at end of file diff --git a/tests/frontend/test_frontend.py b/tests/frontend/test_frontend.py deleted file mode 100644 index b5f16f7..0000000 --- a/tests/frontend/test_frontend.py +++ /dev/null @@ -1,5 +0,0 @@ -def test_ifs(): - pass - -def test_whiles(): - pass \ No newline at end of file diff --git a/tests/integration/branching.py b/tests/integration/branching.py new file mode 100644 index 0000000..e4fa373 --- /dev/null +++ b/tests/integration/branching.py @@ -0,0 +1,32 @@ +import cascade + +@cascade.cascade +class Brancher: + @staticmethod + def branch(cond: bool) -> int: + x = 10 + if cond: + r = Remote.get() + return r + else: + return 42 + + @staticmethod + def branch_insta(cond: bool) -> int: + if cond: + r = Remote.get() + return r + else: + return 42 + + +@cascade.cascade +class Remote: + @staticmethod + def get() -> int: + return 33 + + + + + \ No newline at end of file diff --git a/tests/integration/common.py b/tests/integration/common.py new file mode 100644 index 0000000..4afd1e6 --- /dev/null +++ b/tests/integration/common.py @@ -0,0 +1,36 @@ +import cascade + +@cascade.cascade +class User: + def __init__(self, key: str, balance: int): + self.key: str = key + self.balance: int = balance + + def update_balance(self, amount: int) -> bool: + self.balance = self.balance + amount + return self.balance >= 0 + + def get_balance(self) -> int: + return self.balance + + def buy_item(self, item: 'Item') -> bool: + item_price = item.get_price() # SSA + self.balance = self.balance - item_price + return self.balance >= 0 + + def buy_2_items(self, item1: 'Item', item2: 'Item') -> bool: + item1_price = item1.get_price() # SSA + item2_price = item2.get_price() # SSA + total = item1_price + item2_price + self.balance = self.balance - total + return self.balance >= 0 + + +@cascade.cascade +class Item: + def __init__(self, key: str, price: int): + self.key: str = key + self.price: int = price + + def get_price(self) -> int: + return self.price \ No newline at end of file diff --git a/tests/integration/flink-runtime/common.py b/tests/integration/flink-runtime/common.py deleted file mode 100644 index a7d7af6..0000000 --- a/tests/integration/flink-runtime/common.py +++ /dev/null @@ -1,128 +0,0 @@ -from typing import Any -from cascade.dataflow.dataflow import CollectNode, CollectTarget, DataFlow, Edge, InvokeMethod, OpNode -from cascade.runtime.flink_runtime import StatefulOperator - -class User: - def __init__(self, key: str, balance: int): - self.key: str = key - self.balance: int = balance - - def update_balance(self, amount: int) -> bool: - self.balance += amount - return self.balance >= 0 - - def get_balance(self) -> int: - return self.balance - - def buy_item(self, item: 'Item') -> bool: - item_price = item.get_price() # SSA - self.balance -= item_price - return self.balance >= 0 - - def buy_2_items(self, item1: 'Item', item2: 'Item') -> bool: - item1_price = item1.get_price() # SSA - item2_price = item2.get_price() # SSA - self.balance -= item1_price + item2_price - return self.balance >= 0 - - def __repr__(self): - return f"User(key='{self.key}', balance={self.balance})" - -class Item: - def __init__(self, key: str, price: int): - self.key: str = key - self.price: int = price - - def get_price(self) -> int: - return self.price - - def __repr__(self): - return f"Item(key='{self.key}', price={self.price})" - -def update_balance_compiled(variable_map: dict[str, Any], state: User) -> Any: - state.balance += variable_map["amount"] - return state.balance >= 
0 - -def get_balance_compiled(variable_map: dict[str, Any], state: User) -> Any: - return state.balance - -def get_price_compiled(variable_map: dict[str, Any], state: Item) -> Any: - return state.price - -def buy_item_0_compiled(variable_map: dict[str, Any], state: User) -> Any: - return None - -def buy_item_1_compiled(variable_map: dict[str, Any], state: User) -> Any: - state.balance = state.balance - variable_map["item_price"] - return state.balance >= 0 - - -def buy_2_items_0_compiled(variable_map: dict[str, Any], state: User) -> Any: - return None - -def buy_2_items_1_compiled(variable_map: dict[str, Any], state: User) -> Any: - state.balance -= variable_map["item_prices"][0] + variable_map["item_prices"][1] - return state.balance >= 0 - -# An operator is defined by the underlying class and the functions that can be called -user_op = StatefulOperator( - User, - { - "update_balance": update_balance_compiled, - "get_balance": get_balance_compiled, - "buy_item_0": buy_item_0_compiled, - "buy_item_1": buy_item_1_compiled, - "buy_2_items_0": buy_2_items_0_compiled, - "buy_2_items_1": buy_2_items_1_compiled - }, - None) - -item_op = StatefulOperator( - Item, {"get_price": get_price_compiled}, None -) - - -def user_buy_item_df(): - df = DataFlow("user.buy_item") - n0 = OpNode(User, InvokeMethod("buy_item_0"), read_key_from="user_key") - n1 = OpNode(Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - read_key_from="item_key") - n2 = OpNode(User, InvokeMethod("buy_item_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n1, n2)) - df.entry = n0 - return df - -def user_buy_2_items_df(): - df = DataFlow("user.buy_2_items") - n0 = OpNode(User, InvokeMethod("buy_2_items_0"), read_key_from="user_key") - n3 = CollectNode(assign_result_to="item_prices", read_results_from="item_price") - n1 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 0), - read_key_from="item1_key" - ) - n2 = OpNode( - Item, - InvokeMethod("get_price"), - assign_result_to="item_price", - collect_target=CollectTarget(n3, 2, 1), - read_key_from="item2_key" - ) - n4 = OpNode(User, InvokeMethod("buy_2_items_1"), read_key_from="user_key") - df.add_edge(Edge(n0, n1)) - df.add_edge(Edge(n0, n2)) - df.add_edge(Edge(n1, n3)) - df.add_edge(Edge(n2, n3)) - df.add_edge(Edge(n3, n4)) - df.entry = n0 - return df - -user_op.dataflows = { - "buy_2_items": user_buy_2_items_df(), - "buy_item": user_buy_item_df() - } \ No newline at end of file diff --git a/tests/integration/flink-runtime/test_collect_operator.py b/tests/integration/flink-runtime/test_collect_operator.py deleted file mode 100644 index d14418f..0000000 --- a/tests/integration/flink-runtime/test_collect_operator.py +++ /dev/null @@ -1,71 +0,0 @@ -"""A test script for dataflows with merge operators""" - -from pyflink.datastream.data_stream import CloseableIterator -from common import Item, User, item_op, user_op -from cascade.dataflow.dataflow import Event, EventResult, InitClass, InvokeMethod, OpNode -from cascade.runtime.flink_runtime import FlinkOperator, FlinkRuntime -import pytest - -@pytest.mark.integration -def test_merge_operator(): - runtime = FlinkRuntime("test_collect_operator") - runtime.init() - runtime.add_operator(item_op) - runtime.add_operator(user_op) - - - # Create a User object - foo_user = User("foo", 100) - init_user_node = OpNode(User, InitClass(), read_key_from="key") - event = Event(init_user_node, {"key": "foo", "balance": 100}, None) - 
runtime.send(event) - - # Create an Item object - fork_item = Item("fork", 5) - init_item_node = OpNode(Item, InitClass(), read_key_from="key") - event = Event(init_item_node, {"key": "fork", "price": 5}, None) - runtime.send(event) - - # Create another Item - spoon_item = Item("spoon", 3) - event = Event(init_item_node, {"key": "spoon", "price": 3}, None) - runtime.send(event, flush=True) - - collected_iterator: CloseableIterator = runtime.run(run_async=True, output="collect") - records = [] - - def wait_for_event_id(id: int) -> EventResult: - for record in collected_iterator: - records.append(record) - print(f"Collected record: {record}") - if record.event_id == id: - return record - - # Make sure the user & items are initialised - wait_for_event_id(event._id) - - # Have the User object buy the item - foo_user.buy_2_items(fork_item, spoon_item) - df = user_op.dataflows["buy_2_items"] - - # User with key "foo" buys item with key "fork" - user_buys_cutlery = Event(df.entry, {"user_key": "foo", "item1_key": "fork", "item2_key": "spoon"}, df) - runtime.send(user_buys_cutlery, flush=True) - - - # Check that we were able to buy the fork - buy_fork_result = wait_for_event_id(user_buys_cutlery._id) - assert buy_fork_result.result == True - - # Send an event to check if the balance was updated - user_get_balance_node = OpNode(User, InvokeMethod("get_balance"), read_key_from="key") - user_get_balance = Event(user_get_balance_node, {"key": "foo"}, None) - runtime.send(user_get_balance, flush=True) - - # See that the user's balance has gone down - get_balance = wait_for_event_id(user_get_balance._id) - assert get_balance.result == 92 - - collected_iterator.close() - - print(records) \ No newline at end of file diff --git a/tests/integration/flink-runtime/test_select_all.py b/tests/integration/flink-runtime/test_select_all.py deleted file mode 100644 index 602858d..0000000 --- a/tests/integration/flink-runtime/test_select_all.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -The select all operator is used to fetch all keys for a single entity -""" -import math -import random -from dataclasses import dataclass -from typing import Any - -from pyflink.datastream.data_stream import CloseableIterator - -from cascade.dataflow.dataflow import CollectNode, DataFlow, Edge, Event, EventResult, InitClass, InvokeMethod, OpNode, SelectAllNode, StatelessOpNode -from cascade.dataflow.operator import StatefulOperator, StatelessOperator -from cascade.runtime.flink_runtime import FlinkOperator, FlinkRuntime, FlinkStatelessOperator -import time -import pytest - -@dataclass -class Geo: - x: int - y: int - -class Hotel: - def __init__(self, name: str, loc: Geo): - self.name = name - self.loc = loc - - def get_name(self) -> str: - return self.name - - def distance(self, loc: Geo) -> float: - return math.sqrt((self.loc.x - loc.x) ** 2 + (self.loc.y - loc.y) ** 2) - - def __repr__(self) -> str: - return f"Hotel({self.name}, {self.loc})" - - -def distance_compiled(variable_map: dict[str, Any], state: Hotel) -> Any: - loc = variable_map["loc"] - return math.sqrt((state.loc.x - loc.x) ** 2 + (state.loc.y - loc.y) ** 2) - -def get_name_compiled(variable_map: dict[str, Any], state: Hotel) -> Any: - return state.name - -hotel_op = StatefulOperator(Hotel, - {"distance": distance_compiled, - "get_name": get_name_compiled}, {}) - - - -def get_nearby(hotels: list[Hotel], loc: Geo, dist: float): - return [hotel.get_name() for hotel in hotels if hotel.distance(loc) < dist] - - -# We compile just the predicate, the select is implemented using a 
selectall node -def get_nearby_predicate_compiled_0(variable_map: dict[str, Any]): - pass - -def get_nearby_predicate_compiled_1(variable_map: dict[str, Any]) -> bool: - loc = variable_map["loc"] - dist = variable_map["dist"] - hotel_dist = variable_map["hotel_distance"] - return hotel_dist < dist - -def get_nearby_body_compiled_0(variable_map: dict[str, Any]): - pass - -def get_nearby_body_compiled_1(variable_map: dict[str, Any]) -> str: - return variable_map["hotel_name"] - -get_nearby_op = StatelessOperator({ - "get_nearby_predicate_compiled_0": get_nearby_predicate_compiled_0, - "get_nearby_predicate_compiled_1": get_nearby_predicate_compiled_1, - "get_nearby_body_compiled_0": get_nearby_body_compiled_0, - "get_nearby_body_compiled_1": get_nearby_body_compiled_1, -}, None) - -# dataflow for getting all hotels within region -df = DataFlow("get_nearby") -n7 = CollectNode("get_nearby_result", "get_nearby_body") -n0 = SelectAllNode(Hotel, n7, assign_key_to="hotel_key") -n1 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_predicate_compiled_0")) -n2 = OpNode(Hotel, InvokeMethod("distance"), assign_result_to="hotel_distance", read_key_from="hotel_key") -n3 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_predicate_compiled_1"), is_conditional=True) -n4 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_body_compiled_0")) -n5 = OpNode(Hotel, InvokeMethod("get_name"), assign_result_to="hotel_name", read_key_from="hotel_key") -n6 = StatelessOpNode(get_nearby_op, InvokeMethod("get_nearby_body_compiled_1"), assign_result_to="get_nearby_body") - -df.add_edge(Edge(n0, n1)) -df.add_edge(Edge(n1, n2)) -df.add_edge(Edge(n2, n3)) -df.add_edge(Edge(n3, n4, if_conditional=True)) -df.add_edge(Edge(n3, n7, if_conditional=False)) -df.add_edge(Edge(n4, n5)) -df.add_edge(Edge(n5, n6)) -df.add_edge(Edge(n6, n7)) -get_nearby_op.dataflow = df - -@pytest.mark.integration -def test_nearby_hotels(): - runtime = FlinkRuntime("test_nearby_hotels") - runtime.init() - runtime.add_operator(hotel_op) - runtime.add_stateless_operator(get_nearby_op) - - # Create Hotels - hotels = [] - init_hotel = OpNode(Hotel, InitClass(), read_key_from="name") - random.seed(42) - for i in range(20): - coord_x = random.randint(-10, 10) - coord_y = random.randint(-10, 10) - hotel = Hotel(f"h_{i}", Geo(coord_x, coord_y)) - event = Event(init_hotel, {"name": hotel.name, "loc": hotel.loc}, None) - runtime.send(event) - hotels.append(hotel) - - collected_iterator: CloseableIterator = runtime.run(run_async=True, output='collect') - records = [] - def wait_for_event_id(id: int) -> EventResult: - for record in collected_iterator: - records.append(record) - print(f"Collected record: {record}") - if record.event_id == id: - return record - - def wait_for_n_records(num: int) -> list[EventResult]: - i = 0 - n_records = [] - for record in collected_iterator: - i += 1 - records.append(record) - n_records.append(record) - print(f"Collected record: {record}") - if i == num: - return n_records - - print("creating hotels") - # Wait for hotels to be created - wait_for_n_records(20) - time.sleep(10) # wait for all hotels to be registered - - dist = 5 - loc = Geo(0, 0) - event = Event(n0, {"loc": loc, "dist": dist}, df) - runtime.send(event, flush=True) - - nearby = [] - for hotel in hotels: - if hotel.distance(loc) < dist: - nearby.append(hotel.name) - - event_result = wait_for_event_id(event._id) - results = [r for r in event_result.result if r != None] - print(nearby) - assert set(results) == set(nearby) \ No newline at end of file 
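# --- Illustrative aside (not part of the patch) ----------------------------
# A minimal sketch of the request/response pattern the new Flink integration
# tests added further below follow, assuming the helpers shown in
# tests/integration/flink/utils.py (create_topics, init_flink_runtime,
# wait_for_event_id) and the cascade registry (cascade.core.dataflows) behave
# as in this diff. The module name "tests.integration.common" and the
# User/__init__ dataflow come from the added tests; everything else here is
# an assumption, not the project's canonical API.
import cascade
from cascade.dataflow.dataflow import DataflowRef
from cascade.runtime.flink_runtime import FlinkClientSync
import tests.integration.flink.utils as utils

def run_single_dataflow_example():
    utils.create_topics()                                  # recreate Kafka topics
    runtime = utils.init_flink_runtime("tests.integration.common")
    client = FlinkClientSync()
    collector = runtime.run(run_async=True, output="collect")
    try:
        # Look up a registered dataflow, fire one event, block on its result.
        df = cascade.core.dataflows[DataflowRef("User", "__init__")]
        event = df.generate_event({"key": "foo", "balance": 100}, key="foo")
        client.send(event)
        result = utils.wait_for_event_id(event[0]._id, collector)
        return result.result
    finally:
        collector.close()
        client.close()
# ---------------------------------------------------------------------------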
diff --git a/tests/integration/flink-runtime/test_two_entities.py b/tests/integration/flink-runtime/test_two_entities.py deleted file mode 100644 index 3d89bd2..0000000 --- a/tests/integration/flink-runtime/test_two_entities.py +++ /dev/null @@ -1,75 +0,0 @@ -"""A test script for dataflows with multiple operators""" - -from pyflink.datastream.data_stream import CloseableIterator -from common import Item, User, item_op, user_op -from cascade.dataflow.dataflow import Event, EventResult, InitClass, InvokeMethod, OpNode -from cascade.runtime.flink_runtime import FlinkOperator, FlinkRuntime -import pytest - -@pytest.mark.integration -def test_two_entities(): - runtime = FlinkRuntime("test_two_entities") - runtime.init() - runtime.add_operator(item_op) - runtime.add_operator(user_op) - - # Create a User object - foo_user = User("foo", 100) - init_user_node = OpNode(User, InitClass(), read_key_from="key") - event = Event(init_user_node, {"key": "foo", "balance": 100}, None) - runtime.send(event) - - # Create an Item object - fork_item = Item("fork", 5) - init_item_node = OpNode(Item, InitClass(), read_key_from="key") - event = Event(init_item_node, {"key": "fork", "price": 5}, None) - runtime.send(event) - - # Create an expensive Item - house_item = Item("house", 1000) - event = Event(init_item_node, {"key": "house", "price": 1000}, None) - runtime.send(event) - - # Have the User object buy the item - foo_user.buy_item(fork_item) - df = user_op.dataflows["buy_item"] - - # User with key "foo" buys item with key "fork" - user_buys_fork = Event(df.entry, {"user_key": "foo", "item_key": "fork"}, df) - runtime.send(user_buys_fork, flush=True) - - collected_iterator: CloseableIterator = runtime.run(run_async=True, output="collect") - records = [] - - def wait_for_event_id(id: int) -> EventResult: - for record in collected_iterator: - records.append(record) - print(f"Collected record: {record}") - if record.event_id == id: - return record - - # Check that we were able to buy the fork - buy_fork_result = wait_for_event_id(user_buys_fork._id) - assert buy_fork_result.result == True - - # Send an event to check if the balance was updated - user_get_balance_node = OpNode(User, InvokeMethod("get_balance"), read_key_from="key") - user_get_balance = Event(user_get_balance_node, {"key": "foo"}, None) - runtime.send(user_get_balance, flush=True) - - # See that the user's balance has gone down - get_balance = wait_for_event_id(user_get_balance._id) - assert get_balance.result == 95 - - # User with key "foo" buys item with key "house" - foo_user.buy_item(house_item) - user_buys_house = Event(df.entry, {"user_key": "foo", "item_key": "house"}, df) - runtime.send(user_buys_house, flush=True) - - # Balance becomes negative when house is bought - buy_house_result = wait_for_event_id(user_buys_house._id) - assert buy_house_result.result == False - - collected_iterator.close() - - print(records) \ No newline at end of file diff --git a/src/cascade/frontend/dataflow_analysis/__init__.py b/tests/integration/flink/__init__.py similarity index 100% rename from src/cascade/frontend/dataflow_analysis/__init__.py rename to tests/integration/flink/__init__.py diff --git a/tests/integration/flink/test_branching.py b/tests/integration/flink/test_branching.py new file mode 100644 index 0000000..62f81f4 --- /dev/null +++ b/tests/integration/flink/test_branching.py @@ -0,0 +1,46 @@ +"""A test script for dataflows with merge operators""" + +from pyflink.datastream.data_stream import CloseableIterator +from cascade.dataflow.dataflow 
import DataflowRef +from cascade.dataflow.optimization.parallelization import parallelize_until_if + +from cascade.runtime.flink_runtime import FlinkClientSync +import tests.integration.flink.utils as utils +from tests.integration.flink.utils import wait_for_event_id +import pytest + +import cascade +import logging + +@pytest.mark.integration +def test_branching_pyflink(): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + runtime = utils.init_flink_runtime("tests.integration.branching") + client = FlinkClientSync() + collector = runtime.run(run_async=True, output="collect") + assert isinstance(collector, CloseableIterator) + + try: + _test_branching(client, collector) + finally: + collector.close() + client.close() + + +def _test_branching(client, collector): + branch = cascade.core.dataflows[DataflowRef("Brancher", "branch")] + print(branch.to_dot()) + + event = branch.generate_event({"cond_0": True}) + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == 33 + + event = branch.generate_event({"cond_0": False}) + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == 42 \ No newline at end of file diff --git a/tests/integration/flink/test_collect_operator.py b/tests/integration/flink/test_collect_operator.py new file mode 100644 index 0000000..df9b089 --- /dev/null +++ b/tests/integration/flink/test_collect_operator.py @@ -0,0 +1,87 @@ +"""A test script for dataflows with merge operators""" + +from pyflink.datastream.data_stream import CloseableIterator +from cascade.dataflow.dataflow import DataflowRef +from cascade.dataflow.optimization.parallelization import parallelize, parallelize_until_if + +from cascade.runtime.flink_runtime import FlinkClientSync +import tests.integration.flink.utils as utils +from tests.integration.flink.utils import wait_for_event_id +import pytest + +import cascade +import logging + +@pytest.mark.integration +def test_collect_operator(): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + runtime = utils.init_flink_runtime("tests.integration.common") + + client = FlinkClientSync() + + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + + df_parallel = parallelize(user_buy_2) + df_parallel.name = "buy_2_parallel" + cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] = df_parallel + print(df_parallel.to_dot()) + runtime.add_dataflow(df_parallel) + assert len(df_parallel.entry) == 2 + + collector = runtime.run(run_async=True, output="collect") + + assert isinstance(collector, CloseableIterator) + + try: + _test_collect_operator(client, collector) + finally: + collector.close() + client.close() + + +def _test_collect_operator(client, collector): + user_op = cascade.core.operators["User"] + item_op = cascade.core.operators["Item"] + + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + user_get_balance = cascade.core.dataflows[DataflowRef("User", "get_balance")] + df_parallel = cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] + + + event = user_init.generate_event({"key": "foo", "balance": 100}, key="foo") + client.send(event) + + result = wait_for_event_id(event[0]._id, collector) + + event = item_init.generate_event({"key": "fork", "price": 5}, key="fork") + client.send(event) + + 
event = item_init.generate_event({"key": "spoon", "price": 3}, key="spoon") + client.send(event) + + result = wait_for_event_id(event[0]._id, collector) + + + # Buy a fork and spoon + print("sending buy 2") + print(df_parallel.to_dot()) + event = df_parallel.generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") + print(event) + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == True + + + # Check the balance + event = user_get_balance.generate_event({}, key="foo") + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == (100 - 5 - 3) + + diff --git a/tests/integration/flink/test_operators.py b/tests/integration/flink/test_operators.py new file mode 100644 index 0000000..d88a73b --- /dev/null +++ b/tests/integration/flink/test_operators.py @@ -0,0 +1,100 @@ +"""A test script for dataflows with merge operators""" + +from pyflink.datastream.data_stream import CloseableIterator +from cascade.dataflow.dataflow import DataflowRef, Event +from cascade.runtime.flink_runtime import FlinkClientSync +import tests.integration.flink.utils as utils +from tests.integration.flink.utils import wait_for_event_id + +import pytest + +import cascade +import logging + +@pytest.mark.integration +def test_stateful_operator(): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + runtime = utils.init_flink_runtime("tests.integration.common") + client = FlinkClientSync() + collector = runtime.run(run_async=True, output="collect") + assert isinstance(collector, CloseableIterator) + + try: + _test_stateful_operator(client, collector) + finally: + collector.close() + client.close() + + +def _test_stateful_operator(client, collector): + + user_op = cascade.core.operators["User"] + item_op = cascade.core.operators["Item"] + + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + user_get_balance = cascade.core.dataflows[DataflowRef("User", "get_balance")] + + event = user_init.generate_event({"key": "foo", "balance": 100}, key="foo") + client.send(event) + + result = wait_for_event_id(event[0]._id, collector) + print(result.result) + + event = item_init.generate_event({"key": "fork", "price": 5}, key="fork") + client.send(event) + + event = item_init.generate_event({"key": "spoon", "price": 3}, key="spoon") + client.send(event) + + result = wait_for_event_id(event[0]._id, collector) + print(result.result) + + + print(user_buy_2.to_dot()) + + # Buy a fork and spoon + event = user_buy_2.generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == True + + + # Check the balance + event = user_get_balance.generate_event({}, key="foo") + client.send(event) + result = wait_for_event_id(event[0]._id, collector) + assert result.result == (100 - 5 - 3) + + +@pytest.mark.integration +def test_stateless_operator(): + logger = logging.getLogger("cascade") + logger.setLevel("DEBUG") + + utils.create_topics() + + runtime = utils.init_flink_runtime("tests.integration.stateless") + client = FlinkClientSync() + collector = runtime.run(run_async=True, output="collect") + assert isinstance(collector, CloseableIterator) + + try: + _test_stateless_operator(client, collector) + finally: + collector.close() + client.close() + + +def 
_test_stateless_operator(client, collector): + user_op = cascade.core.operators["SomeStatelessOp"] + event = cascade.core.dataflows[DataflowRef("SomeStatelessOp", "get")].generate_event({}) + client.send(event) + + result = wait_for_event_id(event[0]._id, collector) + assert result.result == 42 \ No newline at end of file diff --git a/tests/integration/flink/utils.py b/tests/integration/flink/utils.py new file mode 100644 index 0000000..5f0f5c4 --- /dev/null +++ b/tests/integration/flink/utils.py @@ -0,0 +1,77 @@ +import cascade +from cascade.dataflow.dataflow import EventResult +from cascade.dataflow.operator import StatefulOperator, StatelessOperator +from cascade.runtime.flink_runtime import FlinkClientSync, FlinkRuntime +from confluent_kafka.admin import AdminClient, NewTopic +from pyflink.datastream.data_stream import CloseableIterator + + +KAFKA_BROKER = "localhost:9092" + +IN_TOPIC = "input-topic" +OUT_TOPIC = "output-topic" +INTERNAL_TOPIC = "internal-topic" + +def wait_for_event_id(id: int, collector: CloseableIterator) -> EventResult: + for record in collector: + print(f"Collected record: {record}") + if record.event_id == id: + return record + + +def init_cascade_from_module(import_path: str): + cascade.core.clear() + exec(f'import {import_path}') + cascade.core.init() + +def init_flink_runtime(import_path: str, in_topic=None, out_topic=None, internal_topic=None, parallelism=4, **init_args) -> FlinkRuntime: + init_cascade_from_module(import_path) + + if in_topic is None: + in_topic = IN_TOPIC + if out_topic is None: + out_topic = OUT_TOPIC + if internal_topic is None: + internal_topic = INTERNAL_TOPIC + + runtime = FlinkRuntime(in_topic, out_topic, internal_topic=internal_topic) + + for op in cascade.core.operators.values(): + if isinstance(op, StatefulOperator): + runtime.add_operator(op) + elif isinstance(op, StatelessOperator): + runtime.add_stateless_operator(op) + + runtime.init(parallelism=parallelism, **init_args) + return runtime + +def create_topics(*required_topics): + if len(required_topics) == 0: + required_topics = (IN_TOPIC, OUT_TOPIC, INTERNAL_TOPIC) + + conf = { + "bootstrap.servers": KAFKA_BROKER + } + + admin_client = AdminClient(conf) + + # Define new topics (default: 1 partition, replication factor 1) + new_topics = [NewTopic(topic, num_partitions=32, replication_factor=1) for topic in required_topics] + + # Delete topics + futures = admin_client.delete_topics(list(required_topics)) + for topic, future in futures.items(): + try: + future.result() # Block until the operation is complete + print(f"Topic '{topic}' deleted successfully") + except Exception as e: + print(f"Failed to delete topic '{topic}': {e}") + + # Create topics + futures = admin_client.create_topics(new_topics) + for topic, future in futures.items(): + try: + future.result() # Block until the operation is complete + print(f"Topic '{topic}' recreated successfully") + except Exception as e: + print(f"Failed to create topic '{topic}': {e}") \ No newline at end of file diff --git a/tests/integration/pyruntime/checkout_item.py b/tests/integration/pyruntime/checkout_item.py new file mode 100644 index 0000000..becb3fc --- /dev/null +++ b/tests/integration/pyruntime/checkout_item.py @@ -0,0 +1,28 @@ +import cascade + +@cascade.cascade +class User: + def __init__(self, username: str, balance: int): + self.username = username + self.balance = balance + + def buy_item(self, item: 'Item') -> bool: + item_price = item.get_price() # SSA + self.balance = self.balance - item_price + return self.balance >= 0 + 
+ def __key__(self) -> str: + return self.username + +@cascade.cascade +class Item: + def __init__(self, item_name: str, price: int): + self.item_name = item_name + self.price = price + + def get_price(self) -> int: + return self.price + + def __key__(self) -> str: + return self.item_name + \ No newline at end of file diff --git a/tests/integration/pyruntime/operator_chaining.py b/tests/integration/pyruntime/operator_chaining.py new file mode 100644 index 0000000..4001007 --- /dev/null +++ b/tests/integration/pyruntime/operator_chaining.py @@ -0,0 +1,40 @@ +import cascade + +@cascade.cascade +class C: + def __init__(self, key: str): + self.key = key + + def get(self, y: int) -> int: + test = 42 + y + return test + + def __key__(self) -> str: + return self.key + +@cascade.cascade +class B: + def __init__(self, key: str): + self.key = key + + def call_c(self, c: C) -> int: + y = 0 + x = c.get(y) + return x + + def __key__(self) -> str: + return self.key + +@cascade.cascade +class A: + def __init__(self, key: str): + self.key = key + + def call_c_thru_b(self, b: B, c: C) -> int: + x = b.call_c(c) + return x*2 + + def __key__(self) -> str: + return self.key + + \ No newline at end of file diff --git a/tests/integration/pyruntime/test_programs.py b/tests/integration/pyruntime/test_programs.py new file mode 100644 index 0000000..d1fbd91 --- /dev/null +++ b/tests/integration/pyruntime/test_programs.py @@ -0,0 +1,168 @@ + +import cascade + +from cascade.dataflow.dataflow import DataflowRef +from cascade.dataflow.optimization.parallelization import parallelize +from tests.integration.pyruntime.utils import init_python_runtime + + +def test_checkout_item(): + file_name = "tests.integration.pyruntime.checkout_item" + + runtime, client = init_python_runtime(file_name) + item_op = cascade.core.operators["Item"] + user_op = cascade.core.operators["User"] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + user_buy_item = cascade.core.dataflows[DataflowRef("User", "buy_item")] + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + item_get_price = cascade.core.dataflows[DataflowRef("Item", "get_price")] + + + event = item_init.generate_event({"item_name": "fork", "price": 10}, key="fork") + result = client.send(event) + assert result["price"] == 10 + assert result["item_name"] == "fork" + + event = item_init.generate_event({"item_name": "spoon", "price": 20}, key="spoon") + result = client.send(event) + assert result["price"] == 20 + assert result["item_name"] == "spoon" + + print(list(item_get_price.blocks.values())[0].function_string) + + event = item_get_price.generate_event({}, key="spoon") + result = client.send(event) + assert result == 20 + + event = item_get_price.generate_event({}, key="fork") + result = client.send(event) + assert result == 10 + + event = user_init.generate_event({"username": "test", "balance": 15}, key="test") + user = client.send(event) + assert user["balance"] == 15 + assert user["username"] == "test" + + print(user_buy_item.to_dot()) + event = user_buy_item.generate_event({"item_0": "fork"}, key=user["username"] ) + result = client.send(event) + assert runtime.statefuloperators["User"].states["test"]["balance"] == 5 + assert result + + # event = user_buy_item.generate_event({"item_0": "spoon"}, key=user["username"] ) + # result = client.send(event) + # assert runtime.statefuloperators["User"].states["test"]["balance"] == -15 + # assert not result + +def test_operator_chaining(): + file_name = 
"tests.integration.pyruntime.operator_chaining" + + runtime, client = init_python_runtime(file_name) + a_op = cascade.core.operators["A"] + b_op = cascade.core.operators["B"] + c_op = cascade.core.operators["C"] + a_init = cascade.core.dataflows[DataflowRef("A", "__init__")] + b_init = cascade.core.dataflows[DataflowRef("B", "__init__")] + c_init = cascade.core.dataflows[DataflowRef("C", "__init__")] + c_get = cascade.core.dataflows[DataflowRef("C", "get")] + b_call_c = cascade.core.dataflows[DataflowRef("B", "call_c")] + a_call_c = cascade.core.dataflows[DataflowRef("A", "call_c_thru_b")] + + event = a_init.generate_event({"key": "aaa"}, key="aaa") + result = client.send(event) + assert result["key"] == "aaa" + + event = b_init.generate_event({"key": "bbb"}, key="bbb") + result = client.send(event) + assert result["key"] == "bbb" + + event = c_init.generate_event({"key": "ccc"}, key="ccc") + result = client.send(event) + assert result["key"] == "ccc" + + event = c_get.generate_event({"y_0": 0}, key="ccc") + result = client.send(event) + assert result == 42 + + print("Call C") + event = b_call_c.generate_event({ "c_0": "ccc"}, key="bbb") + print(event) + result = client.send(event) + assert result == 42 + + print("call C thru B") + event = a_call_c.generate_event({"b_0": "bbb", "c_0": "ccc"}, key="aaa") + result = client.send(event) + assert result == 84 + + +def test_branching_integration(): + file_name = "tests.integration.branching" + + runtime, client = init_python_runtime(file_name) + branch = cascade.core.dataflows[DataflowRef("Brancher", "branch")] + print(branch.to_dot()) + + event = branch.generate_event({"cond_0": False}) + result = client.send(event) + assert result == 42 + + event = branch.generate_event({"cond_0": True}) + result = client.send(event) + assert result == 33 + + branch = cascade.core.dataflows[DataflowRef("Brancher", "branch_insta")] + print(branch.to_dot()) + + event = branch.generate_event({"cond_0": True}) + result = client.send(event) + assert result == 33 + + event = branch.generate_event({"cond_0": False}) + result = client.send(event) + assert result == 42 + +def test_collect_with_return(): + file_name = "tests.integration.common" + + runtime, client = init_python_runtime(file_name) + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + + df_parallel = parallelize(user_buy_2) + df_parallel.name = "buy_2_parallel" + cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] = df_parallel + print(df_parallel.to_dot()) + assert len(df_parallel.entry) == 2 + + user_op = cascade.core.operators["User"] + item_op = cascade.core.operators["Item"] + + user_buy_2 = cascade.core.dataflows[DataflowRef("User", "buy_2_items")] + print(user_buy_2.to_dot()) + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + user_get_balance = cascade.core.dataflows[DataflowRef("User", "get_balance")] + df_parallel = cascade.core.dataflows[DataflowRef("User", "buy_2_parallel")] + + event = user_init.generate_event({"key": "foo", "balance": 100}, key="foo") + result = client.send(event) + + + event = item_init.generate_event({"key": "fork", "price": 5}, key="fork") + client.send(event) + + event = item_init.generate_event({"key": "spoon", "price": 3}, key="spoon") + result = client.send(event) + + + # Buy a fork and spoon + print("sending buy 2") + event = df_parallel.generate_event({"item1_0": "fork", "item2_0": "spoon"}, key="foo") + print(event) + result = client.send(event) 
+ assert result == True + + # Check the balance + event = user_get_balance.generate_event({}, key="foo") + result = client.send(event) + assert result == (100 - 5 - 3) \ No newline at end of file diff --git a/tests/programs/util.py b/tests/integration/pyruntime/utils.py similarity index 83% rename from tests/programs/util.py rename to tests/integration/pyruntime/utils.py index 0312d40..a4261f8 100644 --- a/tests/programs/util.py +++ b/tests/integration/pyruntime/utils.py @@ -3,8 +3,24 @@ import ast import difflib -import importlib +import cascade +from cascade.dataflow.operator import StatefulOperator, StatelessOperator +from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime +def init_python_runtime(import_module_name: str) -> tuple[PythonRuntime, PythonClientSync]: + cascade.core.clear() + exec(f'import {import_module_name}') + cascade.core.init() + + runtime = PythonRuntime() + for op in cascade.core.operators.values(): + if isinstance(op, StatefulOperator): + runtime.add_operator(op) + elif isinstance(op, StatelessOperator): + runtime.add_stateless_operator(op) + + runtime.run() + return runtime, PythonClientSync(runtime) # colors red = lambda text: f"\033[38;2;255;0;0m{text}\033[38;2;255;255;255m" diff --git a/tests/integration/stateless.py b/tests/integration/stateless.py new file mode 100644 index 0000000..2af9507 --- /dev/null +++ b/tests/integration/stateless.py @@ -0,0 +1,8 @@ +import cascade + + +@cascade.cascade +class SomeStatelessOp: + @staticmethod + def get() -> int: + return 42 \ No newline at end of file diff --git a/tests/integration/test_single_entity.py b/tests/integration/test_single_entity.py deleted file mode 100644 index d4e77d3..0000000 --- a/tests/integration/test_single_entity.py +++ /dev/null @@ -1,19 +0,0 @@ -# todo: annotate with @cascade.entity -class User: - def __init__(self, key: str, balance: int): - self.key = key - self.balance = balance - - def set_balance(self, balance: int): - self.balance = balance - - def get_balance(self) -> int: - return self.balance - - -def test_single_entity(): - user = User("user", 100) - assert user.get_balance() == 100 - - user.set_balance(10) - assert user.get_balance() == 10 \ No newline at end of file diff --git a/tests/optimizations/code_motion_entities.py b/tests/optimizations/code_motion_entities.py new file mode 100644 index 0000000..49ceb69 --- /dev/null +++ b/tests/optimizations/code_motion_entities.py @@ -0,0 +1,36 @@ +from cascade import cascade + +@cascade +class Item: + def __init__(self, item: str, quantity: int, price: int): + self.item = item + self.quantity = quantity + self.price = price + + def get_quantity(self): + return self.quantity + + def get_price(self): + return self.price + + + +@cascade +class User: + def __init__(self, balance: int): + self.balance = balance + + def checkout_item(self, item: Item): + stock = item.get_quantity() + in_stock = stock > 0 + price = item.get_price() + can_buy = price <= self.balance + condition = in_stock and can_buy + if condition: + self.balance = self.balance - price + return True + else: + return False + + def get_balance(self) -> int: + return self.balance \ No newline at end of file diff --git a/tests/optimizations/deathstar_entities.py b/tests/optimizations/deathstar_entities.py new file mode 100644 index 0000000..dc43e58 --- /dev/null +++ b/tests/optimizations/deathstar_entities.py @@ -0,0 +1,42 @@ +from cascade import cascade + +@cascade +class ComposeReview: + def __init__(self, req_id: str, **kwargs): # **kwargs is a temporary hack to 
allow for creation of a ComposeReview on the fly + self.req_id = req_id + self.review_data = {} + + def upload_unique_id(self, review_id: int): + self.review_data["review_id"] = review_id + + # could use the User class instead? + def upload_user_id(self, user_id: str): + self.review_data["userId"] = user_id + + def upload_movie_id(self, movie_id: str): + self.review_data["movieId"] = movie_id + + def upload_rating(self, rating: int): + self.review_data["rating"] = rating + + def upload_text(self, text: str): + self.review_data["text"] = text + + def get_data(self): + x = self.review_data + return x + +@cascade +class MovieId: + # key: 'title' + def __init__(self, title: str, movie_id: str): + self.title = title + self.movie_id = movie_id + + def upload_movie_prefetch(self, review: ComposeReview, rating: int): + cond = rating is not None + movie_id = self.movie_id + review.upload_rating(rating) + review.upload_movie_id(movie_id) + return cond + \ No newline at end of file diff --git a/tests/optimizations/entities.py b/tests/optimizations/entities.py new file mode 100644 index 0000000..d325f6e --- /dev/null +++ b/tests/optimizations/entities.py @@ -0,0 +1,25 @@ +from cascade import cascade + +@cascade +class Stock: + def __init__(self, item: str, quantity: int): + self.item = item + self.quantity = quantity + + def get_quantity(self): + return self.quantity + +@cascade +class Adder: + @staticmethod + def add(a, b): + return a + b + +@cascade +class Test: + @staticmethod + def get_total(item1: Stock, item2: Stock): + x = item1.get_quantity() + y = item2.get_quantity() + total_adder = Adder.add(x, y) + return total_adder \ No newline at end of file diff --git a/tests/optimizations/test_parallelize.py b/tests/optimizations/test_parallelize.py new file mode 100644 index 0000000..e5fd833 --- /dev/null +++ b/tests/optimizations/test_parallelize.py @@ -0,0 +1,214 @@ + +import os +import sys + + +# import cascade +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../src"))) + +from cascade.dataflow.dataflow import DataflowRef +from cascade.dataflow.optimization.parallelization import parallelize, parallelize_until_if +from cascade.runtime.python_runtime import PythonClientSync, PythonRuntime +import cascade + +def test_parallelize(): + cascade.core.clear() # clear cascade's registered classes. 
+ assert not cascade.core.registered_classes, "Registered classes should be empty before importing a Cascade \ Module" + # import the module + import_module_name: str = 'entities' + exec(f'import tests.optimizations.{import_module_name}') + + cascade.core.init() + + print(cascade.core.operators) + test_op = cascade.core.operators["Test"] + adder_op = cascade.core.operators["Adder"] + stock_op = cascade.core.operators["Stock"] + stock_init = cascade.core.dataflows[DataflowRef("Stock", "__init__")] + df = cascade.core.dataflows[DataflowRef("Test", "get_total")] + print(df) + print(df.nodes) + + print(df.to_dot()) + df_parallel, _ = parallelize_until_if(df) + df_parallel.name = "get_total_parallel" + cascade.core.dataflows[DataflowRef("Test", "get_total_parallel")] = df_parallel + + print(df_parallel.to_dot()) + + assert len(df_parallel.entry) == 2 + assert len(df.entry) == 1 + + runtime = PythonRuntime() + runtime.add_stateless_operator(test_op) + runtime.add_stateless_operator(adder_op) + runtime.add_operator(stock_op) + runtime.run() + + client = PythonClientSync(runtime) + + event = stock_init.generate_event({"item": "fork", "quantity": 10}, key="fork") + result = client.send(event) + + + event = stock_init.generate_event({"item": "spoon", "quantity": 20}, key="spoon") + result = client.send(event) + + event = df.generate_event({"item1_0": "fork", "item2_0": "spoon"}) + result = client.send(event) + assert result == 30 + + event = df_parallel.generate_event({"item1_0": "fork", "item2_0": "spoon"}) + result = client.send(event) + assert result == 30 + +def gen_parallel(df): + par, rest = parallelize_until_if(df) + + # join the two dataflows + par_exit = [node.id for node in par.nodes.values() if len(node.outgoing_edges) == 0] + for edge in rest.edges: + par.add_edge(edge) + assert len(rest.entry) == 1 + assert len(par_exit) == 1 + par.add_edge_refs(par_exit[0], rest.entry[0].id, None) + + + print(par.to_dot()) + par.name = df.name + "_parallel" + return par + +def test_code_motion(): + cascade.core.clear() # clear cascade's registered classes. 
+ assert not cascade.core.registered_classes, "Registered classes should be empty before importing a Cascade \ Module" + # import the module + import_module_name: str = 'code_motion_entities' + exec(f'import tests.optimizations.{import_module_name}') + + cascade.core.init() + + print(cascade.core.operators) + user_op = cascade.core.operators["User"] + item_op = cascade.core.operators["Item"] + item_init = cascade.core.dataflows[DataflowRef("Item", "__init__")] + user_init = cascade.core.dataflows[DataflowRef("User", "__init__")] + checkout = cascade.core.dataflows[DataflowRef("User", "checkout_item")] + balance = cascade.core.dataflows[DataflowRef("User", "get_balance")] + + checkout_parallel = gen_parallel(checkout) + print(checkout.to_dot()) + cascade.core.dataflows[DataflowRef("User", "checkout_item_parallel")] = checkout_parallel + + print(checkout_parallel.to_dot()) + + assert len(checkout_parallel.entry) == 2 + assert len(checkout.entry) == 1 + + runtime = PythonRuntime() + runtime.add_operator(item_op) + runtime.add_operator(user_op) + runtime.run() + + client = PythonClientSync(runtime) + + event = item_init.generate_event({"item": "fork", "quantity": 10, "price": 10}, key="fork") + result = client.send(event) + print(result) + + event = item_init.generate_event({"item": "spoon", "quantity": 0, "price": 10}, key="spoon") + result = client.send(event) + print(result) + + event = item_init.generate_event({"item": "knife", "quantity": 10, "price": 100}, key="knife") + result = client.send(event) + print(result) + + event = user_init.generate_event({"balance": 50}, key="user") + result = client.send(event) + + + # buy spoon fails (out of stock) + event = checkout.generate_event({"item_0": "spoon"}, key="user") + result = client.send(event) + assert not result + + event = checkout_parallel.generate_event({"item_0": "spoon"}, key="user") + result = client.send(event) + assert not result + + + # buy knife fails (too expensive) + event = checkout.generate_event({"item_0": "knife"}, key="user") + result = client.send(event) + assert not result + + event = checkout_parallel.generate_event({"item_0": "knife"}, key="user") + result = client.send(event) + assert not result + + + # buy fork works (in stock and affordable) + event = checkout.generate_event({"item_0": "fork"}, key="user") + result = client.send(event) + assert result + + event = checkout_parallel.generate_event({"item_0": "fork"}, key="user") + result = client.send(event) + assert result + + event = balance.generate_event({}, key="user") + result = client.send(event) + assert result == 30 + + + +def test_a(): + cascade.core.clear() # clear cascade's registered classes. 
+ assert not cascade.core.registered_classes, "Registered classes should be empty before importing a Cascade \ + Module" + # import the module + import_module_name: str = 'deathstar_entities' + exec(f'import tests.optimizations.{import_module_name}') + + cascade.core.init() + + prefetch = cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch")] + compose_init = cascade.core.dataflows[DataflowRef("ComposeReview", "__init__")] + movie_init = cascade.core.dataflows[DataflowRef("MovieId", "__init__")] + + print(prefetch.to_dot()) + prefetch_parallel = parallelize(prefetch) + print(prefetch_parallel.to_dot()) + cascade.core.dataflows[DataflowRef("MovieId", "upload_movie_prefetch_parallel")] = prefetch_parallel + + compose_op = cascade.core.operators["ComposeReview"] + movie_op = cascade.core.operators["MovieId"] + + + runtime = PythonRuntime() + runtime.add_operator(compose_op) + runtime.add_operator(movie_op) + runtime.run() + client = PythonClientSync(runtime) + + + + e = compose_init.generate_event({"req_id": "1"}, key="1") + r = client.send(e) + print(r) + + e = movie_init.generate_event({"title": "cars", "movie_id": 1}, key="cars") + r = client.send(e) + print(r) + + print("---") + e = prefetch.generate_event({"review_0": "1", "rating_0": 2}, key="cars") + r = client.send(e) + print(r) + + print("---") + e = prefetch_parallel.generate_event({"review_0": "1", "rating_0": 2}, key="cars") + r = client.send(e) + print(r) \ No newline at end of file diff --git a/tests/programs/README.md b/tests/programs/README.md deleted file mode 100644 index 7c63d04..0000000 --- a/tests/programs/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Test programs -`test_programs.py` scans files in ./target folder, compiles them and tests them to expected. \ No newline at end of file diff --git a/tests/programs/__init__.py b/tests/programs/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/programs/test_programs.py b/tests/programs/test_programs.py deleted file mode 100644 index 120597e..0000000 --- a/tests/programs/test_programs.py +++ /dev/null @@ -1,37 +0,0 @@ -import os - -import pytest -import cascade -import sys - - -from tests.programs.util import compare_targets_with_expected - - -target_program_relative_path: str = 'test_programs/target' -expected_program_relative_path: str = 'test_programs/expected' - - -def get_target_file_list(): - target_files: list[str] = os.listdir(target_program_relative_path) - return list(filter(lambda f: f.endswith('.py') and '__init__' not in f, target_files)) - -target_files: list[str] = get_target_file_list() - -@pytest.mark.parametrize("file_name", target_files) -def test_target_programs(file_name: str): - for key in list(sys.modules.keys()): - if key.startswith("test_programs"): - del sys.modules[key] - - cascade.core.clear() # clear cascadeds registerd classes. - assert not cascade.core.registered_classes, "Registered classes should be empty before importing a Cascade \ - Module" - # import the module - import_module_name: str = f'test_programs.target.{file_name.strip(".py")}' - exec(f'import {import_module_name}') - - cascade.core.init() - assert cascade.core.registered_classes, "The Cascade module classes should be registered at this point." - methods: str = cascade.core.get_compiled_methods() - compare_targets_with_expected(file_name, methods, expected_program_relative_path)
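For anyone adding a new pyruntime integration test against the helpers introduced above, a minimal illustrative sketch follows (not part of this diff): a hypothetical `Counter` entity and test mirroring the patterns in `checkout_item.py` and `test_programs.py`. The `Counter` class, its module path, and the `by_0` argument name are assumptions based on the SSA-style argument naming used in the existing tests.

```python
# Hypothetical entity module: tests/integration/pyruntime/counter.py
import cascade

@cascade.cascade
class Counter:
    def __init__(self, name: str, value: int):
        self.name = name
        self.value = value

    def increment(self, by: int) -> int:
        self.value = self.value + by
        return self.value

    def __key__(self) -> str:
        return self.name
```

```python
# Hypothetical test module, following the same structure as test_programs.py
import cascade
from cascade.dataflow.dataflow import DataflowRef
from tests.integration.pyruntime.utils import init_python_runtime


def test_counter():
    # Register the entity module and start an in-process Python runtime
    runtime, client = init_python_runtime("tests.integration.pyruntime.counter")

    counter_init = cascade.core.dataflows[DataflowRef("Counter", "__init__")]
    counter_increment = cascade.core.dataflows[DataflowRef("Counter", "increment")]

    # Create a Counter instance keyed by its name
    event = counter_init.generate_event({"name": "hits", "value": 0}, key="hits")
    result = client.send(event)
    assert result["value"] == 0

    # Invoke the increment dataflow on that key
    event = counter_increment.generate_event({"by_0": 3}, key="hits")
    result = client.send(event)
    assert result == 3
```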