Draft

54 commits
50afc48
perf_reader initial commit
Jul 17, 2023
3f71b2e
graph updated
Jul 24, 2023
4f5223e
add front-level reader API, add PFA reader example
slabasan Jul 19, 2023
7a6dcc2
graph updated
r-yin Jul 24, 2023
0d306b5
added pfw testfile with one iteration of laghos output
r-yin Jul 25, 2023
5e93a00
pfw file processor added
r-yin Aug 8, 2023
bb9c9c9
pfw to literal converter
r-yin Aug 22, 2023
587a17a
sample json
r-yin Aug 22, 2023
105c4c9
pfw folder processor
r-yin Aug 29, 2023
99cf379
finalized pfw to caliper reader
r-yin Sep 9, 2023
37d1b05
cleaning up code
slabasan Apr 11, 2024
65f6aa3
update example for pfa reader
slabasan Apr 11, 2024
d0cd0dc
add perfflowaspect unit test
slabasan Apr 11, 2024
078d83d
add AMS MPI pfa unit test (concat pfw files per rank), flake, black
slabasan Apr 15, 2024
dea17c0
restore kwargs in from_hdf
slabasan Apr 15, 2024
1b13b2e
update comment
slabasan Apr 15, 2024
8f73af7
Add perfflowaspect object reader and update related tests and scripts
Jul 9, 2024
6c94f8f
updated the test file. add extra fields for the time unit and matedat…
Jul 9, 2024
73414e2
perf_reader initial commit
Jul 17, 2023
41a1281
graph updated
Jul 24, 2023
4659c1f
add front-level reader API, add PFA reader example
slabasan Jul 19, 2023
d8d8faf
graph updated
r-yin Jul 24, 2023
ff65cd0
added pfw testfile with one iteration of laghos output
r-yin Jul 25, 2023
48cf77b
pfw file processor added
r-yin Aug 8, 2023
700eba0
pfw to literal converter
r-yin Aug 22, 2023
57a471c
sample json
r-yin Aug 22, 2023
cf6fdb8
pfw folder processor
r-yin Aug 29, 2023
9e67d6f
finalized pfw to caliper reader
r-yin Sep 9, 2023
68065f6
cleaning up code
slabasan Apr 11, 2024
4ed1e7e
update example for pfa reader
slabasan Apr 11, 2024
40ae99f
add perfflowaspect unit test
slabasan Apr 11, 2024
0e2449c
add AMS MPI pfa unit test (concat pfw files per rank), flake, black
slabasan Apr 15, 2024
b521c0a
restore kwargs in from_hdf
slabasan Apr 15, 2024
04a387e
update comment
slabasan Apr 15, 2024
cf54080
read cpu/mem usage
spencer-gre Jul 11, 2024
c8c4aa2
updates to pfa reader, prepare tests
spencer-gre Jul 16, 2024
449b620
forgot this test file
spencer-gre Jul 16, 2024
8b5cf61
tests for cpu-mem
spencer-gre Jul 17, 2024
6afb989
fix tests
spencer-gre Jul 17, 2024
b07fbf3
Read All PFW Files
spencer-gre Jul 17, 2024
b1461b5
Cleanup/Prep for Object Reader
spencer-gre Jul 25, 2024
faf48b2
Removing PFW JSON Files
spencer-gre Jul 25, 2024
6a90b58
added us to s converter. fixed object reader tests
loudsun1997 Jul 29, 2024
84cf2f7
Merge branch 'pfa-reader-pr' into pfa-hatchet-pr-conflict-resolved
loudsun1997 Jul 29, 2024
f5723ba
Cleanup after merge
spencer-gre Jul 29, 2024
636086b
added tests. validated things I wroked on are properly merged and wor…
loudsun1997 Jul 30, 2024
b11f3dd
Merge branch 'pfa-hatchet-pr-conflict-resolved' of https://github.com…
loudsun1997 Jul 30, 2024
8f09fd0
Merge pull request #1 from spencer-gre/pfa-hatchet-pr-conflict-resolved
loudsun1997 Jul 30, 2024
a1e5771
flake8
spencer-gre Jul 30, 2024
46f535d
Merge remote-tracking branch 'upstream/develop' into pfa-hatchet-pr
spencer-gre Nov 16, 2025
164a215
code format
spencer-gre Nov 16, 2025
27fe279
pushed
spencer-gre Nov 24, 2025
1e55e77
nearly working single pfa reader
spencer-gre Dec 4, 2025
e99b000
wrapping up pfa reader
spencer-gre Dec 4, 2025
31 changes: 31 additions & 0 deletions docs/examples/read/perfflowaspect.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python
#
# Copyright 2017-2023 Lawrence Livermore National Security, LLC and other
# Hatchet Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: MIT

import hatchet as ht


if __name__ == "__main__":
    # pfa_file = "../../../hatchet/tests/data/perfflowaspect-smoketests/array_compact.pfw"
    # pfa_file = "../../../hatchet/tests/data/perfflowaspect-smoketests/array_verbose.pfw"
    # pfa_file = "../../../hatchet/tests/data/perfflowaspect-smoketests/object_compact_adiak.pfw"
    pfa_file = "../../../hatchet/tests/data/perfflowaspect-smoketests/object_verbose_adiak.pfw"

    gf = ht.GraphFrame.from_perfflowaspect(pfa_file)

    # Print the DataFrame component of the GraphFrame.
    print(gf.dataframe)
    print(gf.metadata)

    print(len(gf.graph.roots))

    for i, node in enumerate(gf.graph.traverse()):
        print(node._hatchet_nid, node, list(node.parents), list(node.children))

    # Print the graph component of the GraphFrame.
    # Use "dur" as the metric column to be displayed.
    print(gf.tree(metric_column=["dur"]))
    # print(gf.tree(metric_column="ts"))
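For reference, the four smoke-test inputs above differ along two axes: array format (a bare JSON list of events) versus object format (a dict with "traceEvents" plus metadata), and verbose (paired "B"/"E" events) versus compact ("X" events with a "dur" field). A minimal sketch of the two container shapes, with hypothetical event values:

```python
# Array format (array_*.pfw): a bare list of Chrome-trace-style events.
array_trace = [
    {"name": "foo", "ts": 100, "dur": 150, "pid": 1, "tid": 1, "ph": "X"},
]

# Object format (object_*.pfw): events wrapped with metadata. The reader
# exposes "otherData" as gf.metadata and records "displayTimeUnit".
object_trace = {
    "traceEvents": array_trace,
    "displayTimeUnit": "us",
    "otherData": {"adiak.user": "someone"},  # hypothetical Adiak field
}
```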
6 changes: 6 additions & 0 deletions hatchet/graphframe.py
@@ -395,6 +395,12 @@ def from_hdf(filename, **kwargs):
             )
             raise ve
 
+    @staticmethod
+    def from_perfflowaspect(filename, scan_memory=False, scan_cpu=False):
+        from .readers.perfflowaspect_reader import PerfFlowAspectReader
+
+        return PerfFlowAspectReader(filename, scan_memory=scan_memory, scan_cpu=scan_cpu).read()
+
     @deprecated(
         "Reading from/writing to HDF5 is deprecated and will be removed in a later version."
     )
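With this entry point in place, loading a PerfFlowAspect trace mirrors the other readers. A minimal usage sketch, assuming a hypothetical local trace file `trace.pfw` that contains counter ("C") events:

```python
import hatchet as ht

# scan_memory/scan_cpu only succeed if the trace contains counter ("C")
# events carrying memory_usage/cpu_usage args; otherwise the reader raises.
gf = ht.GraphFrame.from_perfflowaspect(
    "trace.pfw", scan_memory=True, scan_cpu=True
)

# Per-node timing and usage columns land in the DataFrame.
print(gf.dataframe[["name", "dur", "usage_memory", "usage_cpu"]])
```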
25 changes: 12 additions & 13 deletions hatchet/readers/hpctoolkit_reader_latest.py
@@ -49,7 +49,6 @@ def read_string(data: bytes, offset: int) -> str:
 
 
 class HPCToolkitReaderLatest:
-
     def __init__(
         self,
         dir_path: str,
@@ -287,15 +286,15 @@ def _parse_context(
 
         elif lexicalType == 3:
             (pModule, offset) = safe_unpack("<QQ", meta_db, flex_offset)
-            frame["name"] = (
-                f"{self._parse_load_module(meta_db, pModule)['module_path']}:{offset}"
-            )
+            frame[
+                "name"
+            ] = f"{self._parse_load_module(meta_db, pModule)['module_path']}:{offset}"
 
         else:
             (pFile, line) = safe_unpack("<QL", meta_db, flex_offset)
-            frame["name"] = (
-                f"{self._parse_source_file(meta_db, pFile)['file_path']}:{line}"
-            )
+            frame[
+                "name"
+            ] = f"{self._parse_source_file(meta_db, pFile)['file_path']}:{line}"
 
         node = self._store_cct_node(ctxId, frame, parent, parent._depth + 1)
 
@@ -356,13 +355,13 @@ def _read_summary_profile(
             ] = value
 
             if self._metric_descriptions[metricId].endswith("(i)"):
-                self._inclusive_metrics[metricId] = (
-                    self._metric_descriptions[metricId]
-                )
+                self._inclusive_metrics[
+                    metricId
+                ] = self._metric_descriptions[metricId]
             else:
-                self._exclusive_metrics[metricId] = (
-                    self._metric_descriptions[metricId]
-                )
+                self._exclusive_metrics[
+                    metricId
+                ] = self._metric_descriptions[metricId]
 
     def _read_cct(
         self,
191 changes: 191 additions & 0 deletions hatchet/readers/perfflowaspect_reader.py
@@ -0,0 +1,191 @@
import json
import pandas as pd

import hatchet.graphframe
from hatchet.node import Node
from hatchet.graph import Graph
from hatchet.frame import Frame


class PerfFlowAspectReader:
    """Create a GraphFrame from PerfFlowAspect trace files.

    Return:
        (GraphFrame): graphframe containing the trace data
    """

    def __init__(self, filename, scan_memory=False, scan_cpu=False):
        """
        filename (str): Path to a PerfFlowAspect trace file.
        scan_memory (bool): Whether to include memory usage statistics.
        scan_cpu (bool): Whether to include CPU usage statistics.
        """
        self.scan_memory = scan_memory
        self.scan_cpu = scan_cpu
        with open(filename, "r") as file:
            raw = file.read()

        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            fixed = self._repair_array_json(raw)
            try:
                data = json.loads(fixed)
            except json.JSONDecodeError as e:
                raise ValueError(
                    f"Trace file could not be parsed or repaired: {e}"
                ) from e

        if (
            isinstance(data, dict)
            and "traceEvents" in data
            and isinstance(data["traceEvents"], list)
        ):
            self.displayTimeUnit = data.get("displayTimeUnit")
            self.metadata = data.get("otherData", {})
            self.spec_dict = data["traceEvents"]
        elif isinstance(data, list):
            self.displayTimeUnit = None
            self.metadata = {}
            self.spec_dict = data
        else:
            raise ValueError("Trace must be either object or array format")

        # Fold verbose output (paired "B"/"E" events) into compact output
        # (single "X" events carrying an explicit duration).
        if self.spec_dict and self.spec_dict[0].get("ph") == "B":
            stack = []
            final = []
            for event in self.spec_dict:
                ph = event.get("ph")

                if ph == "B":
                    stack.append(event.copy())
                elif ph == "E":
                    if not stack:
                        continue
                    start = stack.pop()
                    merged = start
                    merged["dur"] = event["ts"] - start["ts"]
                    merged["ph"] = "X"
                    final.append(merged)
            self.spec_dict = final

    def _repair_array_json(self, text):
        """Best-effort repair of a truncated array-format trace."""
        text = text.rstrip()
        text = text.replace(",\n]", "\n]")
        if not text.endswith("]"):
            text += "]"
        if not text.lstrip().startswith("["):
            text = "[" + text
        return text

    def sort(self):
        # Sort spec_dict by the end time (ts + dur) of each function.
        self.spec_dict = sorted(
            self.spec_dict, key=lambda item: item["ts"] + item["dur"]
        )

    def read(self):
        roots = []
        node_mapping = {}  # Map node names to Node objects
        node_dicts = []
        usage_pairings = {}  # usage_pairings[ts] = (memory, cpu)

        # Error if an attempt is made to retrieve statistics,
        # but no statistics exist in the file.
        if all("C" not in item["ph"] for item in self.spec_dict) and (
            self.scan_cpu or self.scan_memory
        ):
            raise ValueError("No statistics in the provided file!")

        for item in self.spec_dict:
            # The following values always appear in a PerfFlowAspect log.
            name = item["name"]
            ts = item["ts"] * 1e-6  # convert to seconds
            ph = item["ph"]

            # These items may or may not appear.
            dur = None
            memory = 0
            cpu = 0

            # If this is a counter event, collect the statistics and match
            # them with the timestamp.
            if ph == "C":
                valid_statistic = False
                if self.scan_memory:
                    if item["args"]["memory_usage"] != 0:
                        memory = item["args"]["memory_usage"]
                        valid_statistic = True
                if self.scan_cpu:
                    if item["args"]["cpu_usage"] != 0.0:
                        cpu = item["args"]["cpu_usage"]
                        valid_statistic = True
                if valid_statistic:
                    usage_pairings[ts] = (memory, cpu)
                continue

            dur = item["dur"] * 1e-6

            # A Frame always consists of these values.
            frame_values = {"name": name, "type": "function", "ts": ts, "dur": dur}

            # Optionally, if logging statistics, insert memory and cpu usage
            # into the Frame (defaulting to 0 if no counter event shares
            # this timestamp).
            if self.scan_memory:
                memory = usage_pairings.get(ts, (0, 0))[0]
                frame_values["usage_memory"] = memory
            if self.scan_cpu:
                cpu = usage_pairings.get(ts, (0, 0))[1]
                frame_values["usage_cpu"] = cpu

            # Create a Frame and Node for the function.
            # Frame stores information about the node.
            # Node represents a node in the hierarchical graph structure.
            frame = Frame(frame_values)
            node = Node(frame, parent=None, hnid=-1)

            # Because events are ordered by end time, any existing roots
            # whose spans this node encloses are its children. Pop them
            # from the back rather than mutating roots while iterating
            # over it.
            while (
                roots
                and (ts < roots[-1].frame["ts"])
                and (ts + dur > roots[-1].frame["ts"] + roots[-1].frame["dur"])
            ):
                child = roots.pop()
                node.add_child(child)
                child.add_parent(node)
            roots.append(node)

            node_dict = {
                "node": node,
                "name": name,
                "ts": ts,
                "dur": dur,
                "pid": item["pid"],
                "tid": item["tid"],
                "ph": item["ph"],
            }
            if self.scan_memory:
                node_dict["usage_memory"] = memory
            if self.scan_cpu:
                node_dict["usage_cpu"] = cpu

            node_dicts.append(node_dict)

            # Store the Node object with its name for future reference.
            node_mapping[name] = node

        # Create the Graph object from the root nodes.
        graph = Graph(roots)
        graph.enumerate_traverse()

        dataframe = pd.DataFrame(data=node_dicts)
        dataframe.set_index(["node"], inplace=True)
        dataframe.sort_index(inplace=True)

        exc_metrics = []
        inc_metrics = []
        for col in dataframe.columns:
            if "(inc)" in col:
                inc_metrics.append(col)
            else:
                exc_metrics.append(col)

        return hatchet.graphframe.GraphFrame(
            graph, dataframe, exc_metrics, inc_metrics, metadata=self.metadata
        )
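To exercise the repair, folding, and tree-construction paths end to end, here is a small sketch that writes a truncated verbose trace (missing its closing bracket, as an interrupted run would leave behind) and reads it back; the event values are hypothetical:

```python
import tempfile

from hatchet.readers.perfflowaspect_reader import PerfFlowAspectReader

# A truncated array-format trace with no closing "]". The repair step
# should close it; the "B"/"E" pairs should fold into "X" events
# (foo: dur = 250 - 100 = 150 us; bar: dur = 200 - 120 = 80 us), and
# bar's span nests inside foo's, so foo becomes the sole root.
raw = (
    "[\n"
    '{"name": "foo", "ts": 100, "pid": 1, "tid": 1, "ph": "B"},\n'
    '{"name": "bar", "ts": 120, "pid": 1, "tid": 1, "ph": "B"},\n'
    '{"name": "bar", "ts": 200, "pid": 1, "tid": 1, "ph": "E"},\n'
    '{"name": "foo", "ts": 250, "pid": 1, "tid": 1, "ph": "E"}\n'
)

with tempfile.NamedTemporaryFile("w", suffix=".pfw", delete=False) as f:
    f.write(raw)
    path = f.name

gf = PerfFlowAspectReader(path).read()
print(len(gf.graph.roots))  # 1: foo, with bar as its child
print(gf.dataframe[["name", "ts", "dur"]])
```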
6 changes: 3 additions & 3 deletions hatchet/readers/timemory_reader.py
@@ -206,9 +206,9 @@ def match_labels_and_values(_metric_stats, _metric_label, _metric_type):
             # match with metric labels if _metric_stat item is a list.
             elif isinstance(_item, list):
                 for i in range(len(_item)):
-                    _ret["{}.{}{}".format(_key, _metric_label[i], _metric_type)] = (
-                        _item[i]
-                    )
+                    _ret[
+                        "{}.{}{}".format(_key, _metric_label[i], _metric_type)
+                    ] = _item[i]
             # check if _metric_stat item is not a dict or list
             else:
                 _ret["{}.{}{}".format(_key, _metric_label, _metric_type)] = _item