Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions thicket/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,12 +369,15 @@ def _handle_statsframe():
return combined_th

@staticmethod
def _index(thickets, from_statsframes=False, disable_tqdm=False):
def _index(
thickets, from_statsframes=False, fill_perfdata=True, disable_tqdm=False
):
"""Unify a list of thickets into a single thicket

Arguments:
thickets (list): list of Thicket objects
from_statsframes (bool): Whether this method was invoked from from_statsframes
fill_perfdata (bool): whether to fill missing performance data with NaNs
disable_tqdm (bool): whether to disable tqdm progress bar

Returns:
Expand Down Expand Up @@ -456,7 +459,8 @@ def _fill_perfdata(df, numerical_fill_value=np.nan):
validate_dataframe(unify_df)

# Insert missing rows in dataframe
unify_df = _fill_perfdata(unify_df)
if fill_perfdata:
unify_df = _fill_perfdata(unify_df)

# Sort PerfData
unify_df.sort_index(inplace=True)
Expand Down
46 changes: 40 additions & 6 deletions thicket/thicket.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,11 @@ def thicketize_graphframe(gf, prf):

@staticmethod
def from_caliper(
filename_or_stream, query=None, intersection=False, disable_tqdm=False
filename_or_stream,
query=None,
intersection=False,
fill_perfdata=True,
disable_tqdm=False,
):
"""Read in a Caliper .cali or .json file.

Expand All @@ -212,48 +216,62 @@ def from_caliper(
`.cali` or JSON-split format, or an open file object to read one
query (str): cali-query in CalQL format
intersection (bool): whether to perform intersection or union (default)
fill_perfdata (bool): whether to fill missing performance data with NaNs
disable_tqdm (bool): whether to display tqdm progress bar
"""
return Thicket.reader_dispatch(
GraphFrame.from_caliper,
intersection,
fill_perfdata,
disable_tqdm,
filename_or_stream,
query,
)

@staticmethod
def from_hpctoolkit(dirname, intersection=False, disable_tqdm=False):
def from_hpctoolkit(
dirname, intersection=False, fill_perfdata=True, disable_tqdm=False
):
"""Create a GraphFrame using hatchet's HPCToolkit reader and use its attributes
to make a new thicket.

Arguments:
dirname (str): parent directory of an HPCToolkit experiment.xml file
intersection (bool): whether to perform intersection or union (default)
fill_perfdata (bool): whether to fill missing performance data with NaNs
disable_tqdm (bool): whether to display tqdm progress bar

Returns:
(thicket): new thicket containing HPCToolkit profile data
"""
return Thicket.reader_dispatch(
GraphFrame.from_hpctoolkit, intersection, disable_tqdm, dirname
GraphFrame.from_hpctoolkit,
intersection,
fill_perfdata,
disable_tqdm,
dirname,
)

@staticmethod
def from_caliperreader(
filename_or_caliperreader, intersection=False, disable_tqdm=False
filename_or_caliperreader,
intersection=False,
fill_perfdata=True,
disable_tqdm=False,
):
"""Helper function to read one caliper file.

Arguments:
filename_or_caliperreader (str or CaliperReader): name of a Caliper output
file in `.cali` format, or a CaliperReader object
intersection (bool): whether to perform intersection or union (default)
fill_perfdata (bool): whether to fill missing performance data with NaNs
disable_tqdm (bool): whether to display tqdm progress bar
"""
return Thicket.reader_dispatch(
GraphFrame.from_caliperreader,
intersection,
fill_perfdata,
disable_tqdm,
filename_or_caliperreader,
)
Expand Down Expand Up @@ -295,7 +313,9 @@ def from_literal(graph_dict):
return tk

@staticmethod
def reader_dispatch(func, intersection, disable_tqdm, *args, **kwargs):
def reader_dispatch(
func, intersection, fill_perfdata, disable_tqdm, *args, **kwargs
):
"""Create a thicket from a list, directory of files, or a single file.

Arguments:
Expand Down Expand Up @@ -353,6 +373,7 @@ def reader_dispatch(func, intersection, disable_tqdm, *args, **kwargs):
thickets=ens_list,
axis="index",
calltree=calltree,
fill_perfdata=fill_perfdata,
disable_tqdm=disable_tqdm,
)

Expand All @@ -372,6 +393,7 @@ def concat_thickets(
calltree (str): calltree to use -> "union" or "intersection"

Keyword Arguments:
fill_perfdata (bool): (if axis="index") Whether to fill missing performance data with NaNs
headers (list): (if axis="columns") List of headers to use for the new columnar multi-index
metadata_key (str): (if axis="columns") Name of the column from the metadata tables to replace the 'profile'
index. If no argument is provided, it is assumed that there is no profile-wise
Expand All @@ -381,10 +403,16 @@ def concat_thickets(
(thicket): concatenated thicket
"""

def _index(thickets, from_statsframes=False, disable_tqdm=disable_tqdm):
def _index(
thickets,
from_statsframes=False,
fill_perfdata=True,
disable_tqdm=disable_tqdm,
):
thicket_parts = Ensemble._index(
thickets=thickets,
from_statsframes=from_statsframes,
fill_perfdata=fill_perfdata,
disable_tqdm=disable_tqdm,
)

Expand Down Expand Up @@ -1112,6 +1140,12 @@ def filter_metadata(self, select_function):
else:
raise InvalidFilter("The argument passed to filter must be a callable.")

# If fill_perfdata is False, may need to squash
if len(new_thicket.graph) != len(
new_thicket.dataframe.index.get_level_values("node").unique()
):
new_thicket = new_thicket.squash()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what this is trying to do. From what I can tell, this is trying to see if the profile list is a subset of the unique profile IDs in the performance dataframe index. If that's true, then the dataframe can store profile IDs that aren't in the Thicket.profile field.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is detecting if fill_perfdata is off. To make it more clear I thought of a couple options:

  1. I could make it a helper function with a docstring to make it more clear.
  2. Have a boolean thicket attribute for fill_perfdata that can be checked here.

The purpose of this is that there is a case when the index is not full, and a filter is performed on the metadata that nodes disappear from the performance data, if the metadata for those profiles was filtered out. The code above will update the graph using squash if this happens.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, if I'm understanding right, what you're trying to do is 1) check if any nodes were removed from the performance data as a side effect of filtering profiles from the metadata and 2) fix the graph with squash if any nodes were removed?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If what I wrote above is correct, then this code will not do what you're wanting.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code (assuming the set(Series) thing works) only checks if Thicket.profiles is a subset of the profile IDs in the performance dataframe index. That tells you nothing about the nodes.

Copy link
Collaborator Author

@michaelmckinsey1 michaelmckinsey1 Jun 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It tells you if there are profiles in the performance dataframe that do not exist in tk.profile. For example if I simulate removing a profile from the dataframe, it will be false which will trigger a squash:

image

The actual issue I noticed with this is that they will both always be in sync due to _sync_profile_components. So what I should check here instead is if len(tk.graph) != len(tk.dataframe.index.get_level_values("node").unique()) which is more explicit anyway. These sort of things will get caught when I add unit tests.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your code is not checking if there are profiles in the dataframe that don't exist in tk.profile. It's checking if there are profiles in tk.profile that don't exist in the dataframe. It's a subtle difference, but it would allow e.g., the dataframe to contain extra profiles that don't exist in tk.profiles.

I actually just encountered this same type of thing in Befikir's query_stats PR (#157), and I realized there's a better way to do this check:

sorted(tk.profile) == sorted(unique_perf_indices.tolist())

This will check that both the dataframe and tk.profile agree in terms of contents and length. Node objects are comparable via their _hatchet_nid fields. As a result, the Node objects within different parts of the same Thicket object can be compared in this way. This comparison will not work across Thicket objects though because there's no guarantee that equivalent Node objects will have the same _hatchet_nid values.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's checking if there are profiles in tk.profile that don't exist in the dataframe.

But that is what I was trying to accomplish with that code. That would indicate that there are missing profiles in the performance data, which is what I was saying. However, it doesn't address

it would allow e.g., the dataframe to contain extra profiles that don't exist in tk.profiles.

which your correct would be an issue, if not for _sync_profile_components being called earlier. Regardless, what I was actually trying to do is check nodes so I'm using a different check now.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reference, if we are trying to update the state of the profiles in a Thicket we should be using _sync_profile_components. And if we are trying to validate properties about the state of a Thicket we should use validate_profile or validate_dataframe in utils.py

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're 100% positive that you've ensured the graph and dataframe contain the same node objects, then this is fine.


return new_thicket

def filter(self, filter_func):
Expand Down