Skip to content

Implementing multistart version of theta_est using multiple sampling methods #3575

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
Draft
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 209 additions & 0 deletions pyomo/contrib/parmest/parmest.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ def SSE(model):
return expr


'''Adding pseudocode for draft implementation of the estimator class,
incorporating multistart.
'''
class Estimator(object):
"""
Parameter estimation class
Expand Down Expand Up @@ -273,8 +276,18 @@ def __init__(
tee=False,
diagnostic_mode=False,
solver_options=None,
# Add the extra arguments needed for running the multistart implement
# _validate_multistart_args:
# if n_restarts > 1 and theta_samplig_method is not None:
n_restarts=20,
multistart_sampling_method="random",
):

'''first theta would be provided by the user in the initialization of
the Estimator class through the unknown parameter variables. Additional
would need to be generated using the sampling method provided by the user.
'''

# check that we have a (non-empty) list of experiments
assert isinstance(experiment_list, list)
self.exp_list = experiment_list
Expand All @@ -300,6 +313,10 @@ def __init__(
self.diagnostic_mode = diagnostic_mode
self.solver_options = solver_options

# add the extra multistart arguments to the Estimator class
self.n_restarts = n_restarts
self.multistart_sampling_method = multistart_sampling_method

# TODO: delete this when the deprecated interface is removed
self.pest_deprecated = None

Expand Down Expand Up @@ -447,6 +464,88 @@ def TotalCost_rule(model):
parmest_model = utils.convert_params_to_vars(model, theta_names, fix_vars=False)

return parmest_model

# Make new private method, _generate_initial_theta:
# This method will be used to generate the initial theta values for multistart
# optimization. It will take the theta names and the initial theta values
# and return a dictionary of theta names and their corresponding values.
def _generate_initial_theta(self, parmest_model, seed=None):
if self.n_restarts == 1:
# If only one restart, return an empty list
return print("No multistart optimization needed. Please use normal theta_est()")

# Get the theta names and initial theta values
theta_names = self._return_theta_names()
initial_theta = [parmest_model.find_component(name)() for name in theta_names]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it better to use the suffix for this? The suffix value shouldn't change, but the theta value may if the model has been solved for some reason. I don't know if this is a potential issue but I think that grabbing these values from the suffixes would be more dummy-proof.


# Get the lower and upper bounds for the theta values
lower_bound = np.array([parmest_model.find_component(name).lb for name in theta_names])
upper_bound = np.array([parmest_model.find_component(name).ub for name in theta_names])
# Check if the lower and upper bounds are defined
if np.any(np.isnan(lower_bound)) or np.any(np.isnan(upper_bound)):
raise ValueError(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably already know this, but you will need to check all the errors are raised when expected.

"The lower and upper bounds for the theta values must be defined."
)

# Check the length of theta_names and initial_theta, and make sure bounds are defined
if len(theta_names) != len(initial_theta):
raise ValueError(
"The length of theta_names and initial_theta must be the same."
)

if self.method == "random":
np.random.seed(seed)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to skip setting the random seed if seed=None (default)?

Copy link
Author

@sscini sscini May 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default is none for all the functions I use that set seed, so if it receives seed = None, it would work as expected. Would skipping it still be best practice?

# Generate random theta values
theta_vals_multistart = np.random.uniform(lower_bound, upper_bound, size=len(theta_names))

# Generate theta values using Latin hypercube sampling or Sobol sampling
return theta_vals_multistart

elif self.method == "latin_hypercube":
# Generate theta values using Latin hypercube sampling
sampler = scipy.stats.qmc.LatinHypercube(d=len(theta_names), seed=seed)
samples = sampler.random(n=self.n_restarts+1)[1:] # Skip the first sample
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you skipping the first sample? Please explain in the comments.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will add a comment in code to explain as well. The first sample generated using qmc.sobol is always the origin (zero vector). I thought logic applied to all qmc methods, but no only sobol. So to get nonzero points, you need to skip first sample

theta_vals_multistart = np.array([lower_bound + (upper_bound - lower_bound) * theta for theta in samples])


elif self.method == "sobol":
sampler = scipy.stats.qmc.Sobol(d=len(theta_names), seed=seed)
samples = sampler.random(n=self.n_restarts+1)[1:]
theta_vals_multistart = np.array([lower_bound + (upper_bound - lower_bound) * theta for theta in samples])

# elif self.method == "prior":
# # Still working on this
# theta_vals_multistart = np.array([lower_bound + (upper_bound - lower_bound) * theta for theta in initial_theta])

else:
raise ValueError(
"Invalid sampling method. Choose 'random', 'latin_hypercube', 'sobol'." # or 'prior'."
)

# Make an output dataframe with the theta names and their corresponding values for each restart,
# and nan for the output info values
df_multistart = pd.DataFrame(
theta_vals_multistart, columns=theta_names
)
df_multistart["initial objective"] = np.nan
df_multistart["final objective"] = np.nan
df_multistart["solver termination"] = np.nan
df_multistart["solve_time"] = np.nan

# Add the initial theta values to the first row of the dataframe
for i in self.n_restarts:
df_multistart.iloc[i, :] = theta_vals_multistart[i, :]
df_multistart.iloc[0, :] = initial_theta
# # Add the initial objective value to the first row of the dataframe
# df_multistart.iloc[0, -1] = self._Q_at_theta(initial_theta, initialize_parmest_model=True)[0]
# # Add the final objective value to the first row of the dataframe
# df_multistart.iloc[0, -2] = self._Q_at_theta(initial_theta, initialize_parmest_model=True)[0]
# # Add the solver termination value to the first row of the dataframe
# df_multistart.iloc[0, -3] = self._Q_at_theta(initial_theta, initialize_parmest_model=True)[2]
# # Add the solve time to the first row of the dataframe
# df_multistart.iloc[0, -4] = self._Q_at_theta(initial_theta, initialize_parmest_model=True)[3]

return theta_vals_multistart, df_multistart

def _instance_creation_callback(self, experiment_number=None, cb_data=None):
model = self._create_parmest_model(experiment_number)
Expand Down Expand Up @@ -921,6 +1020,116 @@ def theta_est(
cov_n=cov_n,
)

def theta_est_multistart(
self,
buffer=10,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to explain the buffer in the doc string.

save_results=False,
theta_vals=None,
solver="ef_ipopt",
return_values=[],
):
"""
Parameter estimation using multistart optimization

Parameters
----------
n_restarts: int, optional
Number of restarts for multistart. Default is 1.
theta_sampling_method: string, optional
Method used to sample theta values. Options are "random", "latin_hypercube", or "sobol".
Default is "random".
solver: string, optional
Currently only "ef_ipopt" is supported. Default is "ef_ipopt".
return_values: list, optional
List of Variable names, used to return values from the model for data reconciliation


Returns
-------
objectiveval: float
The objective function value
thetavals: pd.Series
Estimated values for theta
variable values: pd.DataFrame
Variable values for each variable name in return_values (only for solver='ef_ipopt')

"""

# check if we are using deprecated parmest
if self.pest_deprecated is not None:
return print(
"Multistart is not supported in the deprecated parmest interface"
)

assert isinstance(self.n_restarts, int)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Replace all of these with more descriptive error messages. Remember that we need tests for each error message.

assert isinstance(self.multistart_sampling_method, str)
assert isinstance(solver, str)
assert isinstance(return_values, list)

if self.n_restarts > 1 and self.multistart_sampling_method is not None:
# Generate theta values using the sampling method
theta_vals, results_df = self._generate_initial_theta(
self.estimator_theta_names, self.initial_theta, self.n_restarts, self.multistart_sampling_method
)

# make empty list to store results
for i in range(self.n_restarts):
# for number of restarts, call the self._Q_opt method
# with the theta values generated using the _generalize_initial_theta method

# Call the _Q_opt method with the generated theta values
objectiveval, thetavals[i], variable_values = self._Q_opt(
ThetaVals=theta_vals,
solver=solver,
return_values=return_values,
)

# Check if the solver terminated successfully
if variable_values.solver.termination_condition != pyo.TerminationCondition.optimal:
# If not, set the objective value to NaN
solver_termination = variable_values.solver.termination_condition
solve_time = variable_values.solver.time
thetavals = np.nan

else:

# If the solver terminated successfully, set the objective value
init_objectiveval = objectiveval
final_objectiveval = variable_values.solver.objective()
solver_termination = variable_values.solver.termination_condition
solve_time = variable_values.solver.time

# Check if the objective value is better than the best objective value
if final_objectiveval < best_objectiveval:
best_objectiveval = objectiveval
best_theta = thetavals

# Store the results in a list or DataFrame
# depending on the number of restarts
results_df.iloc[i, :-4] = theta_vals
results_df.iloc[i, -4] = init_objectiveval
results_df.iloc[i, -3] = objectiveval
results_df.iloc[i, -2] = variable_values.solver.termination_condition
results_df.iloc[i, -1] = variable_values.solver.time

# Add buffer to save the dataframe dynamically, if save_results is True
if save_results and (i + 1) % buffer == 0:
mode = 'w' if i + 1 == buffer else 'a'
header = i + 1 == buffer
results_df.to_csv(
f"multistart_results.csv", mode=mode, header=header, index=False
)
print(f"Intermediate results saved after {i + 1} iterations.")

# Final save after all iterations
if save_results:
results_df.to_csv("multistart_results.csv", mode='a', header=False, index=False)
print("Final results saved.")

return results_df, best_theta, best_objectiveval



def theta_est_bootstrap(
self,
bootstrap_samples,
Expand Down