Skip to content

Commit b771ff9

Browse files
authored
Merge pull request #316 from ibi-group/otp-runner-deployment
otp-runner deployment
2 parents ed944bc + 9a2ef56 commit b771ff9

File tree

6 files changed

+310
-231
lines changed

6 files changed

+310
-231
lines changed

src/main/java/com/conveyal/datatools/manager/jobs/DeployJob.java

+140-151
Large diffs are not rendered by default.

src/main/java/com/conveyal/datatools/manager/jobs/MonitorServerStatusJob.java

+60-69
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import com.conveyal.datatools.manager.auth.Auth0UserProfile;
2323
import com.conveyal.datatools.manager.models.Deployment;
2424
import com.conveyal.datatools.manager.models.OtpServer;
25+
import com.conveyal.datatools.manager.utils.json.JsonUtil;
2526
import com.fasterxml.jackson.annotation.JsonProperty;
2627
import org.apache.http.HttpEntity;
2728
import org.apache.http.client.methods.CloseableHttpResponse;
@@ -35,8 +36,7 @@
3536
import java.io.IOException;
3637
import java.util.Collections;
3738

38-
import static com.conveyal.datatools.manager.jobs.DeployJob.BUNDLE_DOWNLOAD_COMPLETE_FILE;
39-
import static com.conveyal.datatools.manager.jobs.DeployJob.GRAPH_STATUS_FILE;
39+
import static com.conveyal.datatools.manager.jobs.DeployJob.OTP_RUNNER_STATUS_FILE;
4040

4141
/**
4242
* Job that is dispatched during a {@link DeployJob} that spins up EC2 instances. This handles waiting for the server to
@@ -53,9 +53,9 @@ public class MonitorServerStatusJob extends MonitorableJob {
5353
private final AmazonEC2 ec2;
5454
private final AmazonElasticLoadBalancing elbClient;
5555
private final CloseableHttpClient httpClient = HttpClients.createDefault();
56-
// Delay checks by twenty seconds to give user-data script time to upload the instance's user data log if part of the
56+
// Delay checks by four seconds to give user-data script time to upload the instance's user data log if part of the
5757
// script fails (e.g., uploading or downloading a file).
58-
private static final int DELAY_SECONDS = 20;
58+
private static final int DELAY_SECONDS = 4;
5959
public long graphTaskSeconds;
6060

6161
public MonitorServerStatusJob(Auth0UserProfile owner, DeployJob deployJob, Instance instance, boolean graphAlreadyBuilt) {
@@ -93,86 +93,61 @@ public String getDeploymentId () {
9393

9494
@Override
9595
public void jobLogic() {
96-
String ipUrl = "http://" + instance.getPublicIpAddress();
9796
// Get OTP URL for instance to check for availability.
98-
boolean routerIsAvailable = false, graphIsAvailable = false;
97+
String ipUrl = "http://" + instance.getPublicIpAddress();
9998
if (otpServer.ec2Info == null || otpServer.ec2Info.targetGroupArn == null) {
10099
// Fail the job from the outset if there is no target group defined.
101100
failJob("There is no load balancer under which to register ec2 instance.");
102101
}
103102
try {
104-
if (graphAlreadyBuilt) {
105-
// If graph already build, instance's user data will download Graph.obj automatically instead of bundle.
106-
status.update("Loading graph...", 40);
107-
} else {
108-
// Otherwise, we need to verify that the bundle downloaded successfully.
109-
boolean bundleIsDownloaded = false;
110-
// Progressively check status of OTP server
111-
if (deployment.buildGraphOnly) {
112-
// No need to check that OTP is running. Just check to see that the graph is built.
113-
routerIsAvailable = true;
114-
}
115-
// First, check that OTP has started up.
116-
status.update("Prepping for graph build...", 20);
117-
String bundleUrl = String.join("/", ipUrl, BUNDLE_DOWNLOAD_COMPLETE_FILE);
118-
long bundleDownloadStartTime = System.currentTimeMillis();
119-
while (!bundleIsDownloaded) {
120-
// If the request is successful, the OTP instance has started.
121-
waitAndCheckInstanceHealth("bundle download check: " + bundleUrl);
122-
bundleIsDownloaded = checkForSuccessfulRequest(bundleUrl);
123-
// wait 20 minutes max for the bundle to download
124-
long maxBundleDownloadTimeMillis = 20 * 60 * 1000;
125-
if (taskHasTimedOut(bundleDownloadStartTime, maxBundleDownloadTimeMillis)) {
126-
failJob("Job timed out while checking for server bundle download status.");
127-
return;
128-
}
129-
}
130-
// Check status of bundle download and fail job if there was a failure.
131-
String bundleStatus = getUrlAsString(bundleUrl);
132-
if (bundleStatus == null || !bundleStatus.contains("SUCCESS")) {
133-
failJob("Failure encountered while downloading transit bundle.");
103+
// Wait for otp-runner to produce first status file
104+
long statusCheckStartTime = System.currentTimeMillis();
105+
String statusUrl = String.join("/", ipUrl, OTP_RUNNER_STATUS_FILE);
106+
boolean otpRunnerStatusAvailable = false;
107+
while (!otpRunnerStatusAvailable) {
108+
// If the request is successful, the OTP instance has started.
109+
waitAndCheckInstanceHealth("otp-runner status file availability check: " + statusUrl);
110+
otpRunnerStatusAvailable = checkForSuccessfulRequest(statusUrl);
111+
long maxOtpRunnerStartupTimeMillis = 5 * 60 * 1000;
112+
if (taskHasTimedOut(statusCheckStartTime, maxOtpRunnerStartupTimeMillis)) {
113+
failJob("Job timed out while waiting for otp-runner to produce a status file!");
134114
return;
135115
}
136-
long bundleDownloadSeconds = (System.currentTimeMillis() - bundleDownloadStartTime) / 1000;
137-
LOG.info("Bundle downloaded in {} seconds!", bundleDownloadSeconds);
138-
status.update("Building graph...", 30);
139116
}
140-
// Once bundle is downloaded, we await the build (or download if graph already built) of the graph.
141-
long graphCheckStartTime = System.currentTimeMillis();
142-
String graphStatusUrl = String.join("/", ipUrl, GRAPH_STATUS_FILE);
143-
while (!graphIsAvailable) {
117+
// Wait for otp-runner to write a status that fulfills expectations of this job
118+
statusCheckStartTime = System.currentTimeMillis();
119+
boolean otpRunnerCompleted = false;
120+
while (!otpRunnerCompleted) {
144121
// If the request is successful, the OTP instance has started.
145-
waitAndCheckInstanceHealth("graph build/download check: " + graphStatusUrl);
146-
graphIsAvailable = checkForSuccessfulRequest(graphStatusUrl);
147-
// wait a maximum of 4 hours if building the graph, or 20 minutes if downloading a graph
148-
long maxGraphBuildOrDownloadWaitTimeMillis = graphAlreadyBuilt ? 20 * 60 * 1000 : 4 * 60 * 60 * 1000;
149-
if (taskHasTimedOut(graphCheckStartTime, maxGraphBuildOrDownloadWaitTimeMillis)) {
150-
failJob("Job timed out while waiting for graph build/download. If this was a graph building machine, it may have run out of memory.");
122+
waitAndCheckInstanceHealth("otp-runner completion check: " + statusUrl);
123+
otpRunnerCompleted = checkForOtpRunnerCompletion(statusUrl);
124+
// Check if an otp-runner status file check has already failed this job.
125+
if (status.error) {
126+
return;
127+
}
128+
// wait a maximum of 5 hours if building a graph, or 1 hour if just starting a server
129+
long maxOtpRunnerWaitTimeMillis = (graphAlreadyBuilt ? 5 : 1) * 60 * 60 * 1000;
130+
if (taskHasTimedOut(statusCheckStartTime, maxOtpRunnerWaitTimeMillis)) {
131+
failJob("Job timed out while waiting for otp-runner to finish!");
151132
return;
152133
}
153134
}
154-
// Check graph status and fail job if there was a failure.
155-
String graphStatus = getUrlAsString(graphStatusUrl);
156-
if (graphStatus == null || !graphStatus.contains("SUCCESS")) {
157-
failJob("Failure encountered while building/downloading graph.");
158-
return;
159-
}
160-
graphTaskSeconds = (System.currentTimeMillis() - graphCheckStartTime) / 1000;
135+
graphTaskSeconds = (System.currentTimeMillis() - statusCheckStartTime) / 1000;
161136
String message = String.format("Graph build/download completed in %d seconds!", graphTaskSeconds);
162137
LOG.info(message);
163138
// If only task for this instance is to build the graph (either because that is the deployment purpose or
164139
// because this instance type/image is for graph building only), this machine's job is complete and we can
165140
// consider this job done.
166-
if (deployment.buildGraphOnly || (!graphAlreadyBuilt && otpServer.ec2Info.hasSeparateGraphBuildConfig())) {
141+
if (isBuildOnlyServer()) {
167142
status.completeSuccessfully(message);
168-
LOG.info("View logs at {}", getUserDataLogS3Path());
143+
LOG.info("View logs at {}", getOtpRunnerLogS3Path());
169144
return;
170145
}
171-
status.update("Loading graph...", 70);
172146
// Once this is confirmed, check for the availability of the router, which will indicate that the graph
173147
// load has completed successfully.
174148
String routerUrl = String.join("/", ipUrl, "otp/routers/default");
175149
long routerCheckStartTime = System.currentTimeMillis();
150+
boolean routerIsAvailable = false;
176151
while (!routerIsAvailable) {
177152
// If the request was successful, the graph build is complete!
178153
// TODO: Substitute in specific router ID? Or just default to... "default".
@@ -220,7 +195,7 @@ public void jobLogic() {
220195
routerUrl
221196
)
222197
);
223-
LOG.info("View logs at {}", getUserDataLogS3Path());
198+
LOG.info("View logs at {}", getOtpRunnerLogS3Path());
224199
deployJob.incrementCompletedServers();
225200
} catch (InstanceHealthException e) {
226201
// If at any point during the job, an instance health check indicates that the EC2 instance being monitored
@@ -232,19 +207,23 @@ public void jobLogic() {
232207
}
233208
}
234209

210+
private boolean isBuildOnlyServer() {
211+
return deployment.buildGraphOnly || (!graphAlreadyBuilt && otpServer.ec2Info.hasSeparateGraphBuildConfig());
212+
}
213+
235214
/**
236-
* Gets the expected path to the user data logs that get uploaded to s3
215+
* Gets the expected path to the otp-runner logs that get uploaded to s3
237216
*/
238-
private String getUserDataLogS3Path() {
239-
return String.format("%s/%s.log", deployJob.getS3FolderURI(), instance.getInstanceId());
217+
private String getOtpRunnerLogS3Path() {
218+
return String.format("%s/%s-otp-runner.log", deployJob.getS3FolderURI(), instance.getInstanceId());
240219
}
241220

242221
/**
243222
* Helper that fails with a helpful message about where to find uploaded logs.
244223
*/
245224
private void failJob(String message) {
246225
LOG.error(message);
247-
status.fail(String.format("%s Check logs at: %s", message, getUserDataLogS3Path()));
226+
status.fail(String.format("%s Check logs at: %s", message, getOtpRunnerLogS3Path()));
248227
}
249228

250229
/** Determine if a specific task has passed time limit for its run time. */
@@ -299,15 +278,27 @@ public InstanceHealthException(String instanceStateName) {
299278
}
300279
}
301280

302-
/** Make HTTP request to URL and return the string response. */
303-
private String getUrlAsString(String url) {
281+
private boolean checkForOtpRunnerCompletion(String url) {
304282
HttpGet httpGet = new HttpGet(url);
283+
OtpRunnerStatus otpRunnerStatus;
305284
try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
306-
return EntityUtils.toString(response.getEntity());
285+
otpRunnerStatus = JsonUtil.objectMapper.readValue(response.getEntity().getContent(), OtpRunnerStatus.class);
307286
} catch (IOException e) {
308-
LOG.error("Could not complete request to {}", url);
287+
LOG.error("Could not get otp-runner status from {}", url);
309288
e.printStackTrace();
310-
return null;
289+
return false;
290+
}
291+
if (otpRunnerStatus.error) {
292+
failJob(otpRunnerStatus.message);
293+
return false;
294+
}
295+
status.update(otpRunnerStatus.message, otpRunnerStatus.pctProgress);
296+
if (graphAlreadyBuilt || !isBuildOnlyServer()) {
297+
// server that finishes after OTP server is successfully started
298+
return otpRunnerStatus.serverStarted;
299+
} else {
300+
// server that finishes after graph is uploaded
301+
return otpRunnerStatus.graphUploaded;
311302
}
312303
}
313304

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package com.conveyal.datatools.manager.jobs;
2+
3+
import java.util.List;
4+
5+
/**
6+
* A mapping of the possible values of an otp-runner manifest. For further documentation please see otp-runner docs at
7+
* https://github.com/ibi-group/otp-runner#manifestjson-values
8+
*/
9+
public class OtpRunnerManifest {
10+
public String buildConfigJSON;
11+
public boolean buildGraph;
12+
public String buildLogFile;
13+
public String graphObjUrl;
14+
public String graphsFolder;
15+
public List<String> gtfsAndOsmUrls;
16+
public String jarFile;
17+
public String jarUrl;
18+
public String otpRunnerLogFile;
19+
public boolean prefixLogUploadsWithInstanceId;
20+
public String routerConfigJSON;
21+
public String routerName;
22+
public boolean runServer;
23+
public String s3UploadPath;
24+
public String serverLogFile;
25+
public int serverStartupTimeoutSeconds;
26+
public String statusFileLocation;
27+
public boolean uploadGraphBuildLogs;
28+
public boolean uploadGraphBuildReport;
29+
public boolean uploadGraph;
30+
public boolean uploadOtpRunnerLogs;
31+
public boolean uploadServerStartupLogs;
32+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package com.conveyal.datatools.manager.jobs;
2+
3+
/**
4+
* A mapping of the fields and values that otp-runner writes to a status file. See otp-runner documentation for more
5+
* info: https://github.com/ibi-group/otp-runner#statusjson
6+
*/
7+
public class OtpRunnerStatus {
8+
public boolean error;
9+
public boolean graphBuilt;
10+
public boolean graphUploaded;
11+
public boolean serverStarted;
12+
public String message;
13+
public int numFilesDownloaded;
14+
public double pctProgress;
15+
public int totalFilesToDownload;
16+
}

0 commit comments

Comments
 (0)