Initial commit of PrismIO (#3)

HenryYihengXu · bhatele · web-flow · commit afbcc55dfd43 · 2021-02-02T17:00:40.000-05:00
* initialized basic project structure. Added basic reader functions and groupby functions of io_frame

Co-authored-by: Abhinav Bhatele <bhatele@cs.umd.edu>
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,10 @@
+# -*- conf -*-
+# flake8 settings for prismio 
+#
+# These are the flake8 settings recommended by Black
+# https://github.com/psf/black/blob/master/docs/the_black_code_style.md
+
+[flake8]
+max-line-length = 80
+select = C,E,F,W,B,B950
+extend-ignore = E203, E501
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,16 @@
+branches:
+  only:
+    - develop
+    - /^releases\/.*$/
+
+jobs:
+  fast_finish: true
+  include:
+    - stage: "style check"
+      python: '3.7'
+      os: linux
+      language: python
+      # Nothing else in this configuration installs the style tools, so
+      # install them before the script step runs them. flake8-bugbear
+      # provides the B and B950 checks selected in .flake8.
+      install:
+        - pip install --upgrade black flake8 flake8-bugbear
+      script:
+        - black --diff --check .
+        - flake8
+        
diff --git a/LICENSE b/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2021 Parallel Software and Systems Group, University of Maryland
+Copyright 2020-2021 Parallel Software and Systems Group, University of Maryland
 
 Permission is hereby granted, free of charge, to any person obtaining a
 copy of this software and associated documentation files (the "Software"),
diff --git a/README.md b/README.md
@@ -0,0 +1 @@
+PrismIO is a Python-based tool that takes in performance data produced by various profiling tools and structures it into Pandas dataframes. It is intended to provide a flexible and convenient API for users to analyze multiple runs.
diff --git a/prismio/io_frame.py b/prismio/io_frame.py
@@ -0,0 +1,84 @@
+# Copyright 2020-2021 Parallel Software and Systems Group, University of
+# Maryland. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: MIT
+
+"""
+The prismio.io_frame module provides the IOFrame class, which holds
+tracing/profiling data generated by Recorder or Darshan in a structured
+form and offers a flexible api for analyzing it.
+"""
+
+
+import sys
+import os
+import numpy as np
+import pandas as pd
+
+class IOFrame:
+    """
+    Main class of the prism application. It holds I/O performance data
+    generated by I/O tools as a pandas.DataFrame (one row per traced
+    function call when built by the Recorder reader) and provides api
+    functions for the user to analyze it.
+    """
+
+    def __init__(self, dataframe):
+        """
+        Args:
+            dataframe (DataFrame): the dataframe this IOFrame should have.
+
+        Return:
+            None.
+
+        """
+        self.dataframe = dataframe
+
+    @staticmethod
+    def from_recorder(log_dir):
+        """
+        Read trace files from recorder and create the corresponding
+        IOFrame object.
+
+        Args:
+            log_dir (str): path to the Recorder trace files directory.
+
+        Return:
+            An IOFrame object corresponding to this trace files directory.
+
+        """
+        # Imported here because recorder_reader imports this module at its
+        # top level; a module-level import would be circular.
+        from prismio.readers.recorder_reader import RecorderReader
+
+        return RecorderReader(log_dir).read()
+
+    def filter(self, my_lambda):
+        """
+        Create a new IOFrame with only the rows accepted by my_lambda.
+
+        Args:
+            my_lambda (function): predicate called with each row (a Series)
+                that returns True for the rows to keep. For example,
+                lambda row: row['name'] == 'write'.
+
+        Return:
+            A new IOFrame whose dataframe holds the kept rows, renumbered
+            from 0.
+
+        """
+        keep_row = self.dataframe.apply(my_lambda, axis=1)
+        # drop=True discards the old index outright. reset_index() followed
+        # by drop('index') would delete a data column named 'index' instead.
+        return IOFrame(self.dataframe[keep_row].reset_index(drop=True))
+
+    def groupby_aggregate(self, groupby_columns, agg_function):
+        """
+        Group the dataframe of this IOFrame and aggregate each group.
+
+        Args:
+            groupby_columns (list of strings): the column names to group by.
+            agg_function (function or string): the function used for
+                aggregation. For example, 'sum', 'count', etc.
+
+        Return:
+            A dataframe (not an IOFrame) indexed by the groupby columns.
+
+        """
+        return self.dataframe.groupby(groupby_columns).agg(agg_function)
+    
diff --git a/prismio/multi_io_frame.py b/prismio/multi_io_frame.py
@@ -0,0 +1,39 @@
+# Copyright 2020-2021 Parallel Software and Systems Group, University of
+# Maryland. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: MIT
+
+"""
+The prismio.multi_io_frame module aims to provide functionalities of analysing
+and comparing multiple runs. 
+
+"""
+
+
+import sys
+import glob
+import numpy as np
+import pandas as pd
+from prismio.io_frame import IOFrame
+
+class MultiIOFrame:
+    """
+    Holds one IOFrame per Recorder tracing directory so that multiple runs
+    can be analyzed and compared. The frames are kept in self.ioframes, a
+    dict keyed by directory path.
+    """
+
+    def __init__(self, directories):
+        """
+        Args:
+            directories (list or str): a list of tracing directories or a
+                root directory that contains tracing directories.
+
+        Return:
+            None.
+
+        Raises:
+            TypeError: if directories is neither a list nor a str.
+
+        """
+        if not isinstance(directories, (list, str)):
+            # Returning -1 from __init__ only made Python raise an unrelated
+            # "__init__() should return None" TypeError; raise it directly.
+            raise TypeError(
+                "please pass in a root directory or a list of tracing directories"
+            )
+
+        if isinstance(directories, str):
+            # Every entry directly under the root is treated as one run.
+            # Sorted so the frames are always loaded in the same order.
+            directories = sorted(glob.glob(directories + "/*"))
+
+        self.ioframes = {
+            directory: IOFrame.from_recorder(directory)
+            for directory in directories
+        }
diff --git a/prismio/readers/recorder_reader.py b/prismio/readers/recorder_reader.py
@@ -0,0 +1,121 @@
+# Copyright 2020-2021 Parallel Software and Systems Group, University of
+# Maryland. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: MIT
+
+"""
+The prismio.readers.recorder_reader module provides functions for processing
+tracing data from Recorder, for example, sorting records and finding the file
+each record operates on. After processing, it organizes the data into a
+dataframe and creates the IOFrame for the Recorder tracing files.
+
+"""
+
+
+import sys
+import os
+from csv import writer
+import numpy as np
+import pandas as pd
+from prismio.io_frame import IOFrame
+import recorder_viz
+
+class RecorderReader:
+    """
+    The reader class for recorder data. It can read in recorder trace files,
+    preprocess the data, and create a corresponding IOFrame.
+    """
+
+    # Column names of the dataframe built by read(), in order.
+    _COLUMNS = (
+        'rank', 'fid', 'name', 'tstart', 'tend',
+        'time', 'arg_count', 'args', 'return_value', 'file',
+    )
+
+    # A function whose name contains one of these is looked up by its first
+    # argument. 'write' and 'read' also match the pwrite/pread and
+    # writev/readv variants.
+    _FD_FIRST_ARG = ('seek', 'close', 'sync', 'write', 'read', 'fprintf')
+
+    # File name reported when a descriptor cannot be resolved.
+    _UNKNOWN_FILE = '__unknown__'
+
+    def __init__(self, log_dir):
+        """
+        Use the Recorder creader_wrapper to read in tracing data.
+
+        Args:
+            log_dir (string): path to the trace files directory of Recorder
+                the user wants to analyze.
+
+        Return:
+            None.
+
+        """
+        self.reader = recorder_viz.RecorderReader(log_dir)
+
+    def read(self):
+        """
+        Sort the records of each rank by start time, find the file each
+        record operates on, and build the dataframe with one row per record.
+        Then create an IOFrame with this dataframe.
+
+        Args:
+            None.
+
+        Return:
+            An IOFrame created by trace files of recorder specified by the
+            log_dir of this RecorderReader.
+
+        """
+        records_as_dict = {column: [] for column in self._COLUMNS}
+
+        for rank in range(self.reader.GM.total_ranks):
+            rank_records = self.reader.records[rank]
+            total_records = self.reader.LMs[rank].total_records
+            sorted_records = sorted(
+                (rank_records[index] for index in range(total_records)),
+                key=lambda record: record.tstart,
+            )
+
+            # Each rank keeps its own descriptor table. Building the tables
+            # with [dict] * n would make every rank share one dict, so a
+            # descriptor opened on one rank would show up on all the others.
+            fd_to_filename = {0: "stdin", 1: "stdout", 2: "stderr"}
+
+            for record in sorted_records:
+                function_args = record.args_to_strs()
+                func_name = self.reader.funcs[record.func_id]
+                filename = self._find_filename(
+                    func_name, function_args, record.res, fd_to_filename
+                )
+
+                records_as_dict['rank'].append(rank)
+                records_as_dict['fid'].append(record.func_id)
+                records_as_dict['name'].append(func_name)
+                records_as_dict['tstart'].append(record.tstart)
+                records_as_dict['tend'].append(record.tend)
+                records_as_dict['time'].append(record.tend - record.tstart)
+                records_as_dict['arg_count'].append(record.arg_count)
+                records_as_dict['args'].append(function_args)
+                records_as_dict['return_value'].append(record.res)
+                records_as_dict['file'].append(filename)
+
+        return IOFrame(pd.DataFrame.from_dict(records_as_dict))
+
+    @classmethod
+    def _find_filename(cls, func_name, function_args, result, fd_to_filename):
+        """
+        Return the file one record operates on, and remember the descriptor
+        when the record opens one.
+
+        Args:
+            func_name (string): name of the traced function.
+            function_args (list): arguments of the call.
+            result: return value of the call, used as the new descriptor by
+                the open-like functions.
+            fd_to_filename (dict): descriptor table of the current rank. It
+                is updated in place.
+
+        Return:
+            The file name, '__unknown__' if the descriptor is not in the
+            table, or None if the function name matches no known pattern.
+
+        """
+        # fdopen must be tested before the plain 'open' substring below.
+        if 'fdopen' in func_name:
+            old_fd = cls._parse_fd(function_args, 0)
+            if old_fd not in fd_to_filename:
+                return cls._UNKNOWN_FILE
+            # The new descriptor refers to the same file as the old one.
+            filename = fd_to_filename[old_fd]
+            fd_to_filename[result] = filename
+            return filename
+
+        # 'open' also matches fopen.
+        if 'open' in func_name:
+            filename = function_args[0]
+            fd_to_filename[result] = filename
+            return filename
+
+        if 'fwrite' in func_name or 'fread' in func_name:
+            # These are looked up by their fourth argument.
+            position = 3
+        elif any(keyword in func_name for keyword in cls._FD_FIRST_ARG):
+            position = 0
+        else:
+            return None
+
+        fd = cls._parse_fd(function_args, position)
+        return fd_to_filename.get(fd, cls._UNKNOWN_FILE)
+
+    @staticmethod
+    def _parse_fd(function_args, position):
+        """
+        Return the argument at position as an int, or None when it is
+        missing or not a number.
+
+        The substring matching on function names may also select calls
+        whose argument is not a numeric descriptor; those are reported as
+        unknown instead of aborting the whole read.
+        """
+        try:
+            return int(function_args[position])
+        except (IndexError, ValueError):
+            return None

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-Copyright 2021 Parallel Software and Systems Group, University of Maryland`
	`1`	`+Copyright 2020-2021 Parallel Software and Systems Group, University of Maryland`
`2`	`2`
`3`	`3`	`Permission is hereby granted, free of charge, to any person obtaining a`
`4`	`4`	`copy of this software and associated documentation files (the "Software"),`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+PrismIO is a Python-based tool that takes in performance data produced by various profiling tools and structures it into Pandas dataframes. It is intended to provide a flexible and convenient API for users to analyze multiple runs.`