From f568d6ee77b00d83e1119d9c875b9cf210da72e0 Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Tue, 20 Apr 2021 14:08:37 -0400 Subject: [PATCH 1/9] Adds Pickle reader/writer --- hatchet/graphframe.py | 11 +++++++++++ hatchet/readers/pickle_reader.py | 16 ++++++++++++++++ hatchet/writers/pickle_writer.py | 17 +++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 hatchet/readers/pickle_reader.py create mode 100644 hatchet/writers/pickle_writer.py diff --git a/hatchet/graphframe.py b/hatchet/graphframe.py index 009fb83d..4a1faef8 100644 --- a/hatchet/graphframe.py +++ b/hatchet/graphframe.py @@ -271,6 +271,17 @@ def to_hdf(self, filename, key="hatchet_graphframe", **kwargs): HDF5Writer(filename).write(self, key=key, **kwargs) + @staticmethod + def from_pickle(filename, **kwargs): + from .readers.pickle_reader import PickleReader + + return PickleReader(filename).read(**kwargs) + + def to_pickle(filename, **kwargs): + from .writers.pickle_writer import PickleWriter + + PickleWriter(filename).write(**kwargs) + def copy(self): """Return a shallow copy of the graphframe. diff --git a/hatchet/readers/pickle_reader.py b/hatchet/readers/pickle_reader.py new file mode 100644 index 00000000..627807b8 --- /dev/null +++ b/hatchet/readers/pickle_reader.py @@ -0,0 +1,16 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Hatchet Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: MIT + +import pandas as pd +from .pandas_reader import PandasReader + + +class PickleReader(PandasReader): + def __init__(self, filename): + # TODO Remove Arguments when Python 2.7 support is dropped + super(HDF5Reader, self).__init__(filename) + + def _read_from_file_type(self, **kwargs): + return pd.read_pickle(self.fname, **kwargs) diff --git a/hatchet/writers/pickle_writer.py b/hatchet/writers/pickle_writer.py new file mode 100644 index 00000000..abf1e152 --- /dev/null +++ b/hatchet/writers/pickle_writer.py @@ -0,0 +1,17 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Hatchet Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +import warnings + +from .pandas_writer import PandasWriter + + +class PickleWriter(PandasWriter): + def __init__(self, filename): + # TODO Remove Arguments when Python 2.7 support is dropped + super(HDF5Writer, self).__init__(filename) + + def _write_to_file_type(self, df, **kwargs): + df.to_pickle(self.fname, **kwargs) From 06eba71cbe81b861d8b6ba196ddb37f8b0d1488f Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Tue, 20 Apr 2021 15:14:51 -0400 Subject: [PATCH 2/9] Creates Readers and Writers for CSV and Excel for checkpointing --- hatchet/graphframe.py | 143 ++++++++++++++++++++++++++++++- hatchet/readers/csv_reader.py | 16 ++++ hatchet/readers/excel_reader.py | 16 ++++ hatchet/readers/pickle_reader.py | 2 +- hatchet/tests/graphframe.py | 117 +++++++++++++++++++++++++ hatchet/writers/csv_writer.py | 15 ++++ hatchet/writers/excel_writer.py | 15 ++++ hatchet/writers/pickle_writer.py | 4 +- 8 files changed, 321 insertions(+), 7 deletions(-) create mode 100644 hatchet/readers/csv_reader.py create mode 100644 hatchet/readers/excel_reader.py create mode 100644 hatchet/writers/csv_writer.py create mode 100644 hatchet/writers/excel_writer.py diff --git a/hatchet/graphframe.py b/hatchet/graphframe.py index 4a1faef8..fdd59e4e 100644 
--- a/hatchet/graphframe.py +++ b/hatchet/graphframe.py @@ -40,6 +40,24 @@ def parallel_apply(filter_function, dataframe, queue): queue.put(filtered_df) +# TODO Move into global configuration when Connor's PR is merged +_format_extensions = { + ".hdf5": "hdf", + ".hdf": "hdf", + ".h5": "hdf", + ".csv": "csv", + ".xls": "excel", + ".xlsx": "excel", + ".xlsm": "excel", + ".xlsb": "excel", + ".odf": "excel", + ".ods": "excel", + ".odt": "excel", + ".pkl": "pickle", + ".pickle": "pickle", +} + + class GraphFrame: """An input dataset is read into an object of this type, which includes a graph and a dataframe. @@ -259,7 +277,90 @@ def from_lists(*lists): return gf @staticmethod - def from_hdf(filename, **kwargs): + def load(filename, fileformat=None, **kwargs): + format_priority = ["hdf", "pickle", "csv", "excel"] + fformat = fileformat + if fformat is None: + # TODO + # for ext in self._format_extensions.keys(): + for ext in _format_extensions.keys(): + if filename.endswith(ext): + # TODO + # fformat = self._format_extensions[ext] + fformat = _format_extensions[ext] + break + if fformat is not None and fformat in format_priority: + format_priority.remove(fformat) + try: + # TODO + # gf = self._load_func_dict[fformat](filename, **kwargs) + gf = _load_func_dict[fformat](filename, **kwargs) + print("Successfully saved to {}".format(fformat)) + return gf + except ImportError: + print( + "Could not load from {} format. Trying alternatives.".format( + fformat + ) + ) + for form in format_priority: + print("Trying {}".format(form)) + try: + # TODO + # gf = self._load_func_dict[form](filename, **kwargs) + gf = _load_func_dict[form](filename, **kwargs) + print("Sucessfully loaded from {}".format(form)) + return gf + except ImportError: + print("Could not load from {} format.".format(form)) + raise IOError( + "Could not parse {} with the available formats. 
Make sure you have the necessary dependencies installed.".format( + filename + ) + ) + + def save(self, filename, fileformat=None, **kwargs): + format_priority = ["hdf", "pickle", "csv", "excel"] + fformat = fileformat + if fformat is None: + # TODO + # for ext in self._format_extensions.keys(): + for ext in _format_extensions.keys(): + if filename.endswith(ext): + # TODO + # fformat = self._format_extensions[ext] + fformat = _format_extensions[ext] + break + if fformat is not None and fformat in format_priority: + format_priority.remove(fformat) + try: + # TODO + # self._save_func_dict[fformat](self, filename, **kwargs) + _save_func_dict[fformat](self, filename, **kwargs) + print("Successfully saved to {}".format(fformat)) + return + except ImportError: + print( + "Could not save to {} format. Trying alternatives.".format(fformat) + ) + for form in format_priority: + print("Trying {}".format(form)) + try: + # TODO + # self._save_func_dict[form](self, filename, **kwargs) + _save_func_dict[form](self, filename, **kwargs) + print("Successfully saved to {}".format(form)) + return + except ImportError: + print("Could not save to {} format.".format(form)) + raise IOError( + "Could not save {} with the available formats. 
Make sure you have the necessary dependencies installed.".format( + filename + ) + ) + + @staticmethod + def from_hdf(filename, key=None): # import this lazily to avoid circular dependencies from .readers.hdf5_reader import HDF5Reader @@ -277,10 +378,32 @@ def from_pickle(filename, **kwargs): return PickleReader(filename).read(**kwargs) - def to_pickle(filename, **kwargs): + def to_pickle(self, filename, **kwargs): from .writers.pickle_writer import PickleWriter - PickleWriter(filename).write(**kwargs) + PickleWriter(filename).write(self, **kwargs) + + @staticmethod + def from_csv(filename, **kwargs): + from .readers.csv_reader import CSVReader + + return CSVReader(filename).read(**kwargs) + + def to_csv(self, filename, **kwargs): + from .writers.csv_writer import CSVWriter + + CSVWriter(filename).write(self, **kwargs) + + @staticmethod + def from_excel(filename, **kwargs): + from .readers.excel_reader import ExcelReader + + return ExcelReader(filename).read(**kwargs) + + def to_excel(self, filename, **kwargs): + from .writers.excel_writer import ExcelWriter + + ExcelWriter(filename).write(self, **kwargs) def copy(self): """Return a shallow copy of the graphframe. @@ -1268,6 +1391,20 @@ def __imul__(self, other): return self._operator(other_copy, self.dataframe.mul) +_load_func_dict = { + "hdf": GraphFrame.from_hdf, + "csv": GraphFrame.from_csv, + "excel": GraphFrame.from_excel, + "pickle": GraphFrame.from_pickle, +} +_save_func_dict = { + "hdf": GraphFrame.to_hdf, + "csv": GraphFrame.to_csv, + "excel": GraphFrame.to_excel, + "pickle": GraphFrame.to_pickle, +} + + class InvalidFilter(Exception): """Raised when an invalid argument is passed to the filter function.""" diff --git a/hatchet/readers/csv_reader.py b/hatchet/readers/csv_reader.py new file mode 100644 index 00000000..756ece90 --- /dev/null +++ b/hatchet/readers/csv_reader.py @@ -0,0 +1,16 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Hatchet Project Developers. 
See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +import pandas as pd +from .pandas_reader import PandasReader + + +class CSVReader(PandasReader): + def __init__(self, filename): + # TODO Remove Arguments when Python 2.7 support is dropped + super(CSVReader, self).__init__(filename) + + def _read_from_file_type(self, **kwargs): + return pd.read_csv(self.fname, **kwargs) diff --git a/hatchet/readers/excel_reader.py b/hatchet/readers/excel_reader.py new file mode 100644 index 00000000..1a4b82bd --- /dev/null +++ b/hatchet/readers/excel_reader.py @@ -0,0 +1,16 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Hatchet Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +import pandas as pd +from .pandas_reader import PandasReader + + +class ExcelReader(PandasReader): + def __init__(self, filename): + # TODO Remove Arguments when Python 2.7 support is dropped + super(ExcelReader, self).__init__(filename) + + def _read_from_file_type(self, **kwargs): + return pd.read_excel(self.fname, **kwargs) diff --git a/hatchet/readers/pickle_reader.py b/hatchet/readers/pickle_reader.py index 627807b8..f1c26e4f 100644 --- a/hatchet/readers/pickle_reader.py +++ b/hatchet/readers/pickle_reader.py @@ -10,7 +10,7 @@ class PickleReader(PandasReader): def __init__(self, filename): # TODO Remove Arguments when Python 2.7 support is dropped - super(HDF5Reader, self).__init__(filename) + super(PickleReader, self).__init__(filename) def _read_from_file_type(self, **kwargs): return pd.read_pickle(self.fname, **kwargs) diff --git a/hatchet/tests/graphframe.py b/hatchet/tests/graphframe.py index 8a892446..9986826d 100644 --- a/hatchet/tests/graphframe.py +++ b/hatchet/tests/graphframe.py @@ -1171,3 +1171,120 @@ def test_hdf_load_store(mock_graph_literal): if os.path.exists("test_gframe.hdf"): os.remove("test_gframe.hdf") + + +def test_pickle_load_store(mock_graph_literal): + if 
os.path.exists("test_gframe.pkl"): + os.remove("test_gframe.pkl") + gf_orig = GraphFrame.from_literal(mock_graph_literal) + gf_orig.to_pickle("test_gframe.pkl") + gf_loaded = GraphFrame.from_pickle("test_gframe.pkl") + + assert gf_orig.dataframe.equals(gf_loaded.dataframe) + assert gf_orig.graph == gf_loaded.graph + + if os.path.exists("test_gframe.pkl"): + os.remove("test_gframe.pkl") + + +def test_csv_load_store(mock_graph_literal): + if os.path.exists("test_gframe.csv"): + os.remove("test_gframe.csv") + gf_orig = GraphFrame.from_literal(mock_graph_literal) + gf_orig.to_csv("test_gframe.csv") + gf_loaded = GraphFrame.from_csv("test_gframe.csv") + + assert gf_orig.dataframe.equals(gf_loaded.dataframe) + assert gf_orig.graph == gf_loaded.graph + + if os.path.exists("test_gframe.csv"): + os.remove("test_gframe.csv") + + +def test_excel_load_store(mock_graph_literal): + if os.path.exists("test_gframe.xlsx"): + os.remove("test_gframe.xlsx") + gf_orig = GraphFrame.from_literal(mock_graph_literal) + gf_orig.to_excel("test_gframe.xlsx") + gf_loaded = GraphFrame.from_excel("test_gframe.xlsx") + + assert gf_orig.dataframe.equals(gf_loaded.dataframe) + assert gf_orig.graph == gf_loaded.graph + + if os.path.exists("test_gframe.xlsx"): + os.remove("test_gframe.xlsx") + + +def test_save_func_w_extension(mock_graph_literal): + fname = "test_gframe.hdf" + if os.path.exists(fname): + os.remove(fname) + gf_orig = GraphFrame.from_literal(mock_graph_literal) + gf_orig.save(fname) + gf_loaded = GraphFrame.from_hdf(fname) + + assert gf_orig.dataframe.equals(gf_loaded.dataframe) + assert gf_orig.graph == gf_loaded.graph + + if os.path.exists(fname): + os.remove(fname) + + +def test_save_func_w_manual_format(mock_graph_literal): + fname = "test_gframe" + if os.path.exists(fname): + os.remove(fname) + gf_orig = GraphFrame.from_literal(mock_graph_literal) + gf_orig.save(fname, fileformat="hdf") + gf_loaded = GraphFrame.from_hdf(fname) + + assert 
gf_orig.dataframe.equals(gf_loaded.dataframe) + assert gf_orig.graph == gf_loaded.graph + + if os.path.exists(fname): + os.remove(fname) + + +def test_load_func_w_extension(mock_graph_literal): + fname = "test_gframe.pkl" + if os.path.exists(fname): + os.remove(fname) + gf_orig = GraphFrame.from_literal(mock_graph_literal) + gf_orig.to_pickle(fname) + gf_loaded = GraphFrame.load(fname) + + assert gf_orig.dataframe.equals(gf_loaded.dataframe) + assert gf_orig.graph == gf_loaded.graph + + if os.path.exists(fname): + os.remove(fname) + + +def test_load_func_w_manual_format(mock_graph_literal): + fname = "test_gframe" + if os.path.exists(fname): + os.remove(fname) + gf_orig = GraphFrame.from_literal(mock_graph_literal) + gf_orig.to_hdf(fname) + gf_loaded = GraphFrame.load(fname, fileformat="hdf") + + assert gf_orig.dataframe.equals(gf_loaded.dataframe) + assert gf_orig.graph == gf_loaded.graph + + if os.path.exists(fname): + os.remove(fname) + + +def test_save_load_func_w_guessing_format(mock_graph_literal): + fname = "test_gframe" + if os.path.exists(fname): + os.remove(fname) + gf_orig = GraphFrame.from_literal(mock_graph_literal) + gf_orig.save(fname) + gf_loaded = GraphFrame.load(fname) + + assert gf_orig.dataframe.equals(gf_loaded.dataframe) + assert gf_orig.graph == gf_loaded.graph + + if os.path.exists(fname): + os.remove(fname) diff --git a/hatchet/writers/csv_writer.py b/hatchet/writers/csv_writer.py new file mode 100644 index 00000000..97ba73a5 --- /dev/null +++ b/hatchet/writers/csv_writer.py @@ -0,0 +1,15 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Hatchet Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: MIT + +from .pandas_writer import PandasWriter + + +class CSVWriter(PandasWriter): + def __init__(self, filename): + # TODO Remove Arguments when Python 2.7 support is dropped + super(CSVWriter, self).__init__(filename) + + def _write_to_file_type(self, df, **kwargs): + df.to_csv(self.fname, **kwargs) diff --git a/hatchet/writers/excel_writer.py b/hatchet/writers/excel_writer.py new file mode 100644 index 00000000..dc17e74b --- /dev/null +++ b/hatchet/writers/excel_writer.py @@ -0,0 +1,15 @@ +# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Hatchet Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +from .pandas_writer import PandasWriter + + +class ExcelWriter(PandasWriter): + def __init__(self, filename): + # TODO Remove Arguments when Python 2.7 support is dropped + super(ExcelWriter, self).__init__(filename) + + def _write_to_file_type(self, df, **kwargs): + df.to_excel(self.fname, **kwargs) diff --git a/hatchet/writers/pickle_writer.py b/hatchet/writers/pickle_writer.py index abf1e152..1e82980d 100644 --- a/hatchet/writers/pickle_writer.py +++ b/hatchet/writers/pickle_writer.py @@ -3,15 +3,13 @@ # # SPDX-License-Identifier: MIT -import warnings - from .pandas_writer import PandasWriter class PickleWriter(PandasWriter): def __init__(self, filename): # TODO Remove Arguments when Python 2.7 support is dropped - super(HDF5Writer, self).__init__(filename) + super(PickleWriter, self).__init__(filename) def _write_to_file_type(self, df, **kwargs): df.to_pickle(self.fname, **kwargs) From 4fabb1a7cd533e6ec285f1b44fe1cabce0493c70 Mon Sep 17 00:00:00 2001 From: ilumsden Date: Sat, 22 May 2021 07:58:13 -0400 Subject: [PATCH 3/9] Changes keyword argument parsing for to_hdf and from_hdf to use **kwargs --- hatchet/graphframe.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/hatchet/graphframe.py b/hatchet/graphframe.py index 
fdd59e4e..ee1f0f41 100644 --- a/hatchet/graphframe.py +++ b/hatchet/graphframe.py @@ -360,17 +360,24 @@ def save(self, filename, fileformat=None, **kwargs): ) @staticmethod - def from_hdf(filename, key=None): + def from_hdf(filename, key=None, **kwargs): # import this lazily to avoid circular dependencies from .readers.hdf5_reader import HDF5Reader - return HDF5Reader(filename).read(**kwargs) + return HDF5Reader(filename).read(key=key, **kwargs) - def to_hdf(self, filename, key="hatchet_graphframe", **kwargs): + def to_hdf( + self, + filename, + key="hatchet_graphframe", + **kwargs + ): # import this lazily to avoid circular dependencies from .writers.hdf5_writer import HDF5Writer - HDF5Writer(filename).write(self, key=key, **kwargs) + HDF5Writer(filename).write( + self, key=key, **kwargs + ) @staticmethod def from_pickle(filename, **kwargs): From 4def1f73f91e03073621c3230648f1fb74f3eb39 Mon Sep 17 00:00:00 2001 From: ilumsden Date: Sat, 22 May 2021 07:58:46 -0400 Subject: [PATCH 4/9] Changes keyword argument parsing for to_hdf and from_hdf to use **kwargs Adds all possible optional dependencies for checkpointing to CI Changes keyword argument parsing for to_hdf and from_hdf to use **kwargs Adds all possible optional dependencies for checkpointing to CI --- .github/workflows/unit-tests.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 555dbfee..9b3584c5 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -33,7 +33,9 @@ jobs: python -m pip install --upgrade pip flake8 pytest pip install -r requirements.txt # Optional Dependency for HDF Checkpointing - pip install tables + python -m pip install tables + # All Possible Optional Dependencies for Excel Checkpointing + python -m pip install XlsxWriter openpyxl pyxlsb xlrd xlwt python setup.py install python setup.py build_ext --inplace python -m pip list From 
ae905dca28020c6b206c9af237661b170d94aec1 Mon Sep 17 00:00:00 2001 From: ilumsden Date: Fri, 28 May 2021 13:22:50 -0400 Subject: [PATCH 5/9] Removes manual parsing of kwargs in HDF5Reader and HDF5Writer --- hatchet/graphframe.py | 11 ++--------- hatchet/writers/hdf5_writer.py | 6 +----- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/hatchet/graphframe.py b/hatchet/graphframe.py index ee1f0f41..893db511 100644 --- a/hatchet/graphframe.py +++ b/hatchet/graphframe.py @@ -366,18 +366,11 @@ def from_hdf(filename, key=None, **kwargs): return HDF5Reader(filename).read(key=key, **kwargs) - def to_hdf( - self, - filename, - key="hatchet_graphframe", - **kwargs - ): + def to_hdf(self, filename, key="hatchet_graphframe", **kwargs): # import this lazily to avoid circular dependencies from .writers.hdf5_writer import HDF5Writer - HDF5Writer(filename).write( - self, key=key, **kwargs - ) + HDF5Writer(filename).write(self, key=key, **kwargs) @staticmethod def from_pickle(filename, **kwargs): diff --git a/hatchet/writers/hdf5_writer.py b/hatchet/writers/hdf5_writer.py index 24250116..014df583 100644 --- a/hatchet/writers/hdf5_writer.py +++ b/hatchet/writers/hdf5_writer.py @@ -17,10 +17,6 @@ def __init__(self, filename): super().__init__(filename) def _write_dataframe_to_file(self, df, **kwargs): - if "key" not in kwargs: - raise KeyError("Writing to HDF5 requires a user-supplied key") - key = kwargs["key"] - del kwargs["key"] with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=Warning) - df.to_hdf(self.filename, key, **kwargs) + df.to_hdf(self.filename, **kwargs) From cac4985afb6fddab9aa350b8bea6e5b2b9f144a3 Mon Sep 17 00:00:00 2001 From: ilumsden Date: Fri, 21 May 2021 12:34:51 -0400 Subject: [PATCH 6/9] Gets the CSV and Excel Readers and Writers working by manually pickling nodes and converting string representations of lists back to lists --- hatchet/readers/csv_reader.py | 50 ++++++++++++++++++++++++++++++++- hatchet/readers/excel_reader.py 
| 48 ++++++++++++++++++++++++++++++- hatchet/tests/graphframe.py | 10 +++++++ hatchet/writers/csv_writer.py | 12 ++++++++ hatchet/writers/excel_writer.py | 10 +++++++ 5 files changed, 128 insertions(+), 2 deletions(-) diff --git a/hatchet/readers/csv_reader.py b/hatchet/readers/csv_reader.py index 756ece90..7cff0a2b 100644 --- a/hatchet/readers/csv_reader.py +++ b/hatchet/readers/csv_reader.py @@ -2,10 +2,43 @@ # Hatchet Project Developers. See the top-level LICENSE file for details. # # SPDX-License-Identifier: MIT +from __future__ import unicode_literals +from ast import literal_eval import pandas as pd from .pandas_reader import PandasReader +import pickle + + +def _unpickle_series_elems(pd_series): + # unpickled_elems = [pickle.loads(e.encode("utf-8")) for e in pd_series] + unpickled_elems = [] + for e in pd_series: + e_bytes = literal_eval(e) + unpickled_elems.append(pickle.loads(e_bytes)) + return pd.Series(unpickled_elems) + + +def _corrrect_children_and_parent_col_types(df): + new_children_col = [] + for c in df["children"]: + if not isinstance(c, list): + new_val = literal_eval(c) + new_children_col.append(new_val) + else: + new_children_col.append(c) + df["children"] = pd.Series(new_children_col) + new_parent_col = [] + for p in df["parents"]: + if not isinstance(p, list): + new_val = literal_eval(p) + new_parent_col.append(new_val) + else: + new_parent_col.append(p) + df["parents"] = pd.Series(new_parent_col) + return df + class CSVReader(PandasReader): def __init__(self, filename): @@ -13,4 +46,19 @@ def __init__(self, filename): super(CSVReader, self).__init__(filename) def _read_from_file_type(self, **kwargs): - return pd.read_csv(self.fname, **kwargs) + index_col = None + if "index_col" in kwargs: + index_col = kwargs["index_col"] + del kwargs["index_col"] + csv_df = pd.read_csv(self.fname, index_col=0, **kwargs) + csv_df["node"] = _unpickle_series_elems(csv_df["node"]) + csv_df = _corrrect_children_and_parent_col_types(csv_df) + if index_col is not 
None: + return csv_df.reset_index(drop=True).set_index(index_col) + multindex_cols = ["node", "rank", "thread"] + while len(multindex_cols) > 0: + if set(multindex_cols).issubset(csv_df.columns): + return csv_df.reset_index(drop=True).set_index(multindex_cols) + multindex_cols.pop() + # TODO Replace with a custom error + raise RuntimeError("Could not generate a valid Index or MultiIndex") diff --git a/hatchet/readers/excel_reader.py b/hatchet/readers/excel_reader.py index 1a4b82bd..7eb5cb64 100644 --- a/hatchet/readers/excel_reader.py +++ b/hatchet/readers/excel_reader.py @@ -3,9 +3,40 @@ # # SPDX-License-Identifier: MIT +from ast import literal_eval import pandas as pd from .pandas_reader import PandasReader +import pickle + + +def _unpickle_series_elems(pd_series): + unpickled_elems = [] + for e in pd_series: + e_bytes = literal_eval(e) + unpickled_elems.append(pickle.loads(e_bytes)) + return pd.Series(unpickled_elems) + + +def _corrrect_children_and_parent_col_types(df): + new_children_col = [] + for c in df["children"]: + if not isinstance(c, list): + new_val = literal_eval(c) + new_children_col.append(new_val) + else: + new_children_col.append(c) + df["children"] = pd.Series(new_children_col) + new_parent_col = [] + for p in df["parents"]: + if not isinstance(p, list): + new_val = literal_eval(p) + new_parent_col.append(new_val) + else: + new_parent_col.append(p) + df["parents"] = pd.Series(new_parent_col) + return df + class ExcelReader(PandasReader): def __init__(self, filename): @@ -13,4 +44,19 @@ def __init__(self, filename): super(ExcelReader, self).__init__(filename) def _read_from_file_type(self, **kwargs): - return pd.read_excel(self.fname, **kwargs) + index_col = None + if "index_col" in kwargs: + index_col = kwargs["index_col"] + del kwargs["index_col"] + csv_df = pd.read_excel(self.fname, index_col=0, **kwargs) + csv_df["node"] = _unpickle_series_elems(csv_df["node"]) + csv_df = _corrrect_children_and_parent_col_types(csv_df) + if index_col is not 
None: + return csv_df.reset_index(drop=True).set_index(index_col) + multindex_cols = ["node", "rank", "thread"] + while len(multindex_cols) > 0: + if set(multindex_cols).issubset(csv_df.columns): + return csv_df.reset_index(drop=True).set_index(multindex_cols) + multindex_cols.pop() + # TODO Replace with a custom error + raise RuntimeError("Could not generate a valid Index or MultiIndex") diff --git a/hatchet/tests/graphframe.py b/hatchet/tests/graphframe.py index 9986826d..b7a80883 100644 --- a/hatchet/tests/graphframe.py +++ b/hatchet/tests/graphframe.py @@ -1208,6 +1208,16 @@ def test_excel_load_store(mock_graph_literal): gf_orig.to_excel("test_gframe.xlsx") gf_loaded = GraphFrame.from_excel("test_gframe.xlsx") + # Excel will convert integers represented as floats back into integers. + # To ensure "equals" evaluates correctly, I manually cast the "time" and "time (inc)" + # columns back to float + gf_loaded.dataframe["time"] = gf_loaded.dataframe["time"].astype( + gf_orig.dataframe.dtypes["time"] + ) + gf_loaded.dataframe["time (inc)"] = gf_loaded.dataframe["time (inc)"].astype( + gf_orig.dataframe.dtypes["time (inc)"] + ) + assert gf_orig.dataframe.equals(gf_loaded.dataframe) assert gf_orig.graph == gf_loaded.graph diff --git a/hatchet/writers/csv_writer.py b/hatchet/writers/csv_writer.py index 97ba73a5..d86256b9 100644 --- a/hatchet/writers/csv_writer.py +++ b/hatchet/writers/csv_writer.py @@ -5,6 +5,14 @@ from .pandas_writer import PandasWriter +import pandas as pd +import pickle + + +def pickle_series_elems(pd_series): + pickled_elems = [pickle.dumps(e) for e in pd_series] + return pd.Series(pickled_elems) + class CSVWriter(PandasWriter): def __init__(self, filename): @@ -12,4 +20,8 @@ def __init__(self, filename): super(CSVWriter, self).__init__(filename) def _write_to_file_type(self, df, **kwargs): + df.reset_index(inplace=True) + df["node"] = pickle_series_elems(df["node"]) + df["children"] = df["children"].apply(str, convert_dtype=True) + df["parents"] = 
df["parents"].apply(str, convert_dtype=True) df.to_csv(self.fname, **kwargs) diff --git a/hatchet/writers/excel_writer.py b/hatchet/writers/excel_writer.py index dc17e74b..d9d764c1 100644 --- a/hatchet/writers/excel_writer.py +++ b/hatchet/writers/excel_writer.py @@ -5,6 +5,14 @@ from .pandas_writer import PandasWriter +import pandas as pd +import pickle + + +def pickle_series_elems(pd_series): + pickled_elems = [pickle.dumps(e) for e in pd_series] + return pd.Series(pickled_elems) + class ExcelWriter(PandasWriter): def __init__(self, filename): @@ -12,4 +20,6 @@ def __init__(self, filename): super(ExcelWriter, self).__init__(filename) def _write_to_file_type(self, df, **kwargs): + df.reset_index(inplace=True) + df["node"] = pickle_series_elems(df["node"]) df.to_excel(self.fname, **kwargs) From 16b4b1e2d0917ad656434eb6bfce21eba9125462 Mon Sep 17 00:00:00 2001 From: ilumsden Date: Fri, 21 May 2021 13:50:57 -0400 Subject: [PATCH 7/9] Adds the necessary dependencies for Excel reading and writing to CI --- .github/workflows/unit-tests.yaml | 3 ++- hatchet/tests/graphframe.py | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 9b3584c5..f6076a26 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -35,7 +35,8 @@ jobs: # Optional Dependency for HDF Checkpointing python -m pip install tables # All Possible Optional Dependencies for Excel Checkpointing - python -m pip install XlsxWriter openpyxl pyxlsb xlrd xlwt + # python -m pip install XlsxWriter openpyxl pyxlsb xlrd xlwt + python -m pip install xlrd xlwt python setup.py install python setup.py build_ext --inplace python -m pip list diff --git a/hatchet/tests/graphframe.py b/hatchet/tests/graphframe.py index b7a80883..ee9adf89 100644 --- a/hatchet/tests/graphframe.py +++ b/hatchet/tests/graphframe.py @@ -1202,11 +1202,11 @@ def test_csv_load_store(mock_graph_literal): def 
test_excel_load_store(mock_graph_literal): - if os.path.exists("test_gframe.xlsx"): - os.remove("test_gframe.xlsx") + if os.path.exists("test_gframe.xls"): + os.remove("test_gframe.xls") gf_orig = GraphFrame.from_literal(mock_graph_literal) - gf_orig.to_excel("test_gframe.xlsx") - gf_loaded = GraphFrame.from_excel("test_gframe.xlsx") + gf_orig.to_excel("test_gframe.xls") + gf_loaded = GraphFrame.from_excel("test_gframe.xls") # Excel will convert integers represented as floats back into integers. # To ensure "equals" evaluates correctly, I manually cast the "time" and "time (inc)" @@ -1221,8 +1221,8 @@ def test_excel_load_store(mock_graph_literal): assert gf_orig.dataframe.equals(gf_loaded.dataframe) assert gf_orig.graph == gf_loaded.graph - if os.path.exists("test_gframe.xlsx"): - os.remove("test_gframe.xlsx") + if os.path.exists("test_gframe.xls"): + os.remove("test_gframe.xls") def test_save_func_w_extension(mock_graph_literal): From 9d02b20bc0b1078daf5bde2022d9e22571ac0205 Mon Sep 17 00:00:00 2001 From: ilumsden Date: Sat, 22 May 2021 07:11:49 -0400 Subject: [PATCH 8/9] Adds a Python version guard to the use of ast.literal_eval for pickled nodes because of the lack of bytes type in Python 2 --- hatchet/readers/csv_reader.py | 7 ++++++- hatchet/readers/excel_reader.py | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/hatchet/readers/csv_reader.py b/hatchet/readers/csv_reader.py index 7cff0a2b..8aaa4deb 100644 --- a/hatchet/readers/csv_reader.py +++ b/hatchet/readers/csv_reader.py @@ -9,13 +9,18 @@ from .pandas_reader import PandasReader import pickle +import sys def _unpickle_series_elems(pd_series): # unpickled_elems = [pickle.loads(e.encode("utf-8")) for e in pd_series] unpickled_elems = [] for e in pd_series: - e_bytes = literal_eval(e) + e_bytes = e + print(sys.version_info) + print(sys.version_info >= (3,)) + if sys.version_info >= (3,): + e_bytes = literal_eval(e) unpickled_elems.append(pickle.loads(e_bytes)) return 
pd.Series(unpickled_elems) diff --git a/hatchet/readers/excel_reader.py b/hatchet/readers/excel_reader.py index 7eb5cb64..970eaa65 100644 --- a/hatchet/readers/excel_reader.py +++ b/hatchet/readers/excel_reader.py @@ -8,12 +8,15 @@ from .pandas_reader import PandasReader import pickle +import sys def _unpickle_series_elems(pd_series): unpickled_elems = [] for e in pd_series: - e_bytes = literal_eval(e) + e_bytes = e + if sys.version_info >= (3,): + e_bytes = literal_eval(e) unpickled_elems.append(pickle.loads(e_bytes)) return pd.Series(unpickled_elems) From c1963a538087eb429d1a9fd002ed9bcbdc63414b Mon Sep 17 00:00:00 2001 From: ilumsden Date: Fri, 28 May 2021 13:19:16 -0400 Subject: [PATCH 9/9] Updates CSV, Excel, and Pickle readers and writers to account for changes merged from develop --- hatchet/readers/csv_reader.py | 20 +++++++++++--------- hatchet/readers/excel_reader.py | 16 +++++++++------- hatchet/readers/hdf5_reader.py | 1 - hatchet/readers/pickle_reader.py | 18 +++++++++++------- hatchet/writers/csv_writer.py | 18 +++++++++++------- hatchet/writers/excel_writer.py | 18 +++++++++++------- hatchet/writers/pickle_writer.py | 18 +++++++++++------- 7 files changed, 64 insertions(+), 45 deletions(-) diff --git a/hatchet/readers/csv_reader.py b/hatchet/readers/csv_reader.py index 8aaa4deb..52d3b2a2 100644 --- a/hatchet/readers/csv_reader.py +++ b/hatchet/readers/csv_reader.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Copyright 2017-2021 Lawrence Livermore National Security, LLC and other # Hatchet Project Developers. See the top-level LICENSE file for details. 
# # SPDX-License-Identifier: MIT @@ -6,7 +6,7 @@ from ast import literal_eval import pandas as pd -from .pandas_reader import PandasReader +from .dataframe_reader import DataframeReader import pickle import sys @@ -25,7 +25,7 @@ def _unpickle_series_elems(pd_series): return pd.Series(unpickled_elems) -def _corrrect_children_and_parent_col_types(df): +def _correct_children_and_parent_col_types(df): new_children_col = [] for c in df["children"]: if not isinstance(c, list): @@ -45,19 +45,21 @@ def _corrrect_children_and_parent_col_types(df): return df -class CSVReader(PandasReader): +class CSVReader(DataframeReader): def __init__(self, filename): - # TODO Remove Arguments when Python 2.7 support is dropped - super(CSVReader, self).__init__(filename) + if sys.version_info[0] == 2: + super(CSVReader, self).__init__(filename) + else: + super().__init__(filename) - def _read_from_file_type(self, **kwargs): + def _read_dataframe_from_file(self, **kwargs): index_col = None if "index_col" in kwargs: index_col = kwargs["index_col"] del kwargs["index_col"] - csv_df = pd.read_csv(self.fname, index_col=0, **kwargs) + csv_df = pd.read_csv(self.filename, index_col=0, **kwargs) csv_df["node"] = _unpickle_series_elems(csv_df["node"]) - csv_df = _corrrect_children_and_parent_col_types(csv_df) + csv_df = _correct_children_and_parent_col_types(csv_df) if index_col is not None: return csv_df.reset_index(drop=True).set_index(index_col) multindex_cols = ["node", "rank", "thread"] diff --git a/hatchet/readers/excel_reader.py b/hatchet/readers/excel_reader.py index 970eaa65..13c79779 100644 --- a/hatchet/readers/excel_reader.py +++ b/hatchet/readers/excel_reader.py @@ -1,11 +1,11 @@ -# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Copyright 2017-2021 Lawrence Livermore National Security, LLC and other # Hatchet Project Developers. See the top-level LICENSE file for details. 
# # SPDX-License-Identifier: MIT from ast import literal_eval import pandas as pd -from .pandas_reader import PandasReader +from .dataframe_reader import DataframeReader import pickle import sys @@ -41,17 +41,19 @@ def _corrrect_children_and_parent_col_types(df): return df -class ExcelReader(PandasReader): +class ExcelReader(DataframeReader): def __init__(self, filename): - # TODO Remove Arguments when Python 2.7 support is dropped - super(ExcelReader, self).__init__(filename) + if sys.version_info[0] == 2: + super(ExcelReader, self).__init__(filename) + else: + super().__init__(filename) - def _read_from_file_type(self, **kwargs): + def _read_dataframe_from_file(self, **kwargs): index_col = None if "index_col" in kwargs: index_col = kwargs["index_col"] del kwargs["index_col"] - csv_df = pd.read_excel(self.fname, index_col=0, **kwargs) + csv_df = pd.read_excel(self.filename, index_col=0, **kwargs) csv_df["node"] = _unpickle_series_elems(csv_df["node"]) csv_df = _corrrect_children_and_parent_col_types(csv_df) if index_col is not None: diff --git a/hatchet/readers/hdf5_reader.py b/hatchet/readers/hdf5_reader.py index c25f2322..c3e2df75 100644 --- a/hatchet/readers/hdf5_reader.py +++ b/hatchet/readers/hdf5_reader.py @@ -11,7 +11,6 @@ class HDF5Reader(DataframeReader): def __init__(self, filename): - # TODO Remove Arguments when Python 2.7 support is dropped if sys.version_info[0] == 2: super(HDF5Reader, self).__init__(filename) else: diff --git a/hatchet/readers/pickle_reader.py b/hatchet/readers/pickle_reader.py index f1c26e4f..ca516c5b 100644 --- a/hatchet/readers/pickle_reader.py +++ b/hatchet/readers/pickle_reader.py @@ -1,16 +1,20 @@ -# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Copyright 2017-2021 Lawrence Livermore National Security, LLC and other # Hatchet Project Developers. See the top-level LICENSE file for details. 
# # SPDX-License-Identifier: MIT import pandas as pd -from .pandas_reader import PandasReader +from .dataframe_reader import DataframeReader +import sys -class PickleReader(PandasReader): + +class PickleReader(DataframeReader): def __init__(self, filename): - # TODO Remove Arguments when Python 2.7 support is dropped - super(PickleReader, self).__init__(filename) + if sys.version_info[0] == 2: + super(PickleReader, self).__init__(filename) + else: + super().__init__(filename) - def _read_from_file_type(self, **kwargs): - return pd.read_pickle(self.fname, **kwargs) + def _read_dataframe_from_file(self, **kwargs): + return pd.read_pickle(self.filename, **kwargs) diff --git a/hatchet/writers/csv_writer.py b/hatchet/writers/csv_writer.py index d86256b9..3e216b28 100644 --- a/hatchet/writers/csv_writer.py +++ b/hatchet/writers/csv_writer.py @@ -1,27 +1,31 @@ -# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Copyright 2017-2021 Lawrence Livermore National Security, LLC and other # Hatchet Project Developers. See the top-level LICENSE file for details. 
# # SPDX-License-Identifier: MIT -from .pandas_writer import PandasWriter +from .dataframe_writer import DataframeWriter import pandas as pd import pickle +import sys + def pickle_series_elems(pd_series): pickled_elems = [pickle.dumps(e) for e in pd_series] return pd.Series(pickled_elems) -class CSVWriter(PandasWriter): +class CSVWriter(DataframeWriter): def __init__(self, filename): - # TODO Remove Arguments when Python 2.7 support is dropped - super(CSVWriter, self).__init__(filename) + if sys.version_info[0] == 2: + super(CSVWriter, self).__init__(filename) + else: + super().__init__(filename) - def _write_to_file_type(self, df, **kwargs): + def _write_dataframe_to_file(self, df, **kwargs): df.reset_index(inplace=True) df["node"] = pickle_series_elems(df["node"]) df["children"] = df["children"].apply(str, convert_dtype=True) df["parents"] = df["parents"].apply(str, convert_dtype=True) - df.to_csv(self.fname, **kwargs) + df.to_csv(self.filename, **kwargs) diff --git a/hatchet/writers/excel_writer.py b/hatchet/writers/excel_writer.py index d9d764c1..4e22a4d0 100644 --- a/hatchet/writers/excel_writer.py +++ b/hatchet/writers/excel_writer.py @@ -1,25 +1,29 @@ -# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Copyright 2017-2021 Lawrence Livermore National Security, LLC and other # Hatchet Project Developers. See the top-level LICENSE file for details. 
# # SPDX-License-Identifier: MIT -from .pandas_writer import PandasWriter +from .dataframe_writer import DataframeWriter import pandas as pd import pickle +import sys + def pickle_series_elems(pd_series): pickled_elems = [pickle.dumps(e) for e in pd_series] return pd.Series(pickled_elems) -class ExcelWriter(PandasWriter): +class ExcelWriter(DataframeWriter): def __init__(self, filename): - # TODO Remove Arguments when Python 2.7 support is dropped - super(ExcelWriter, self).__init__(filename) + if sys.version_info[0] == 2: + super(ExcelWriter, self).__init__(filename) + else: + super().__init__(filename) - def _write_to_file_type(self, df, **kwargs): + def _write_dataframe_to_file(self, df, **kwargs): df.reset_index(inplace=True) df["node"] = pickle_series_elems(df["node"]) - df.to_excel(self.fname, **kwargs) + df.to_excel(self.filename, **kwargs) diff --git a/hatchet/writers/pickle_writer.py b/hatchet/writers/pickle_writer.py index 1e82980d..85d8765c 100644 --- a/hatchet/writers/pickle_writer.py +++ b/hatchet/writers/pickle_writer.py @@ -1,15 +1,19 @@ -# Copyright 2017-2020 Lawrence Livermore National Security, LLC and other +# Copyright 2017-2021 Lawrence Livermore National Security, LLC and other # Hatchet Project Developers. See the top-level LICENSE file for details. # # SPDX-License-Identifier: MIT -from .pandas_writer import PandasWriter +from .dataframe_writer import DataframeWriter +import sys -class PickleWriter(PandasWriter): + +class PickleWriter(DataframeWriter): def __init__(self, filename): - # TODO Remove Arguments when Python 2.7 support is dropped - super(PickleWriter, self).__init__(filename) + if sys.version_info[0] == 2: + super(PickleWriter, self).__init__(filename) + else: + super().__init__(filename) - def _write_to_file_type(self, df, **kwargs): - df.to_pickle(self.fname, **kwargs) + def _write_dataframe_to_file(self, df, **kwargs): + df.to_pickle(self.filename, **kwargs)