22
22
import com .conveyal .datatools .manager .auth .Auth0UserProfile ;
23
23
import com .conveyal .datatools .manager .models .Deployment ;
24
24
import com .conveyal .datatools .manager .models .OtpServer ;
25
+ import com .conveyal .datatools .manager .utils .json .JsonUtil ;
25
26
import com .fasterxml .jackson .annotation .JsonProperty ;
26
27
import org .apache .http .HttpEntity ;
27
28
import org .apache .http .client .methods .CloseableHttpResponse ;
35
36
import java .io .IOException ;
36
37
import java .util .Collections ;
37
38
38
- import static com .conveyal .datatools .manager .jobs .DeployJob .BUNDLE_DOWNLOAD_COMPLETE_FILE ;
39
- import static com .conveyal .datatools .manager .jobs .DeployJob .GRAPH_STATUS_FILE ;
39
+ import static com .conveyal .datatools .manager .jobs .DeployJob .OTP_RUNNER_STATUS_FILE ;
40
40
41
41
/**
42
42
* Job that is dispatched during a {@link DeployJob} that spins up EC2 instances. This handles waiting for the server to
@@ -53,9 +53,9 @@ public class MonitorServerStatusJob extends MonitorableJob {
53
53
private final AmazonEC2 ec2 ;
54
54
private final AmazonElasticLoadBalancing elbClient ;
55
55
private final CloseableHttpClient httpClient = HttpClients .createDefault ();
56
- // Delay checks by twenty seconds to give user-data script time to upload the instance's user data log if part of the
56
+ // Delay checks by four seconds to give user-data script time to upload the instance's user data log if part of the
57
57
// script fails (e.g., uploading or downloading a file).
58
- private static final int DELAY_SECONDS = 20 ;
58
+ private static final int DELAY_SECONDS = 4 ;
59
59
public long graphTaskSeconds ;
60
60
61
61
public MonitorServerStatusJob (Auth0UserProfile owner , DeployJob deployJob , Instance instance , boolean graphAlreadyBuilt ) {
@@ -93,86 +93,61 @@ public String getDeploymentId () {
93
93
94
94
@ Override
95
95
public void jobLogic () {
96
- String ipUrl = "http://" + instance .getPublicIpAddress ();
97
96
// Get OTP URL for instance to check for availability.
98
- boolean routerIsAvailable = false , graphIsAvailable = false ;
97
+ String ipUrl = "http://" + instance . getPublicIpAddress () ;
99
98
if (otpServer .ec2Info == null || otpServer .ec2Info .targetGroupArn == null ) {
100
99
// Fail the job from the outset if there is no target group defined.
101
100
failJob ("There is no load balancer under which to register ec2 instance." );
102
101
}
103
102
try {
104
- if (graphAlreadyBuilt ) {
105
- // If graph already build, instance's user data will download Graph.obj automatically instead of bundle.
106
- status .update ("Loading graph..." , 40 );
107
- } else {
108
- // Otherwise, we need to verify that the bundle downloaded successfully.
109
- boolean bundleIsDownloaded = false ;
110
- // Progressively check status of OTP server
111
- if (deployment .buildGraphOnly ) {
112
- // No need to check that OTP is running. Just check to see that the graph is built.
113
- routerIsAvailable = true ;
114
- }
115
- // First, check that OTP has started up.
116
- status .update ("Prepping for graph build..." , 20 );
117
- String bundleUrl = String .join ("/" , ipUrl , BUNDLE_DOWNLOAD_COMPLETE_FILE );
118
- long bundleDownloadStartTime = System .currentTimeMillis ();
119
- while (!bundleIsDownloaded ) {
120
- // If the request is successful, the OTP instance has started.
121
- waitAndCheckInstanceHealth ("bundle download check: " + bundleUrl );
122
- bundleIsDownloaded = checkForSuccessfulRequest (bundleUrl );
123
- // wait 20 minutes max for the bundle to download
124
- long maxBundleDownloadTimeMillis = 20 * 60 * 1000 ;
125
- if (taskHasTimedOut (bundleDownloadStartTime , maxBundleDownloadTimeMillis )) {
126
- failJob ("Job timed out while checking for server bundle download status." );
127
- return ;
128
- }
129
- }
130
- // Check status of bundle download and fail job if there was a failure.
131
- String bundleStatus = getUrlAsString (bundleUrl );
132
- if (bundleStatus == null || !bundleStatus .contains ("SUCCESS" )) {
133
- failJob ("Failure encountered while downloading transit bundle." );
103
+ // Wait for otp-runner to produce first status file
104
+ long statusCheckStartTime = System .currentTimeMillis ();
105
+ String statusUrl = String .join ("/" , ipUrl , OTP_RUNNER_STATUS_FILE );
106
+ boolean otpRunnerStatusAvailable = false ;
107
+ while (!otpRunnerStatusAvailable ) {
108
+ // If the request is successful, the OTP instance has started.
109
+ waitAndCheckInstanceHealth ("otp-runner status file availability check: " + statusUrl );
110
+ otpRunnerStatusAvailable = checkForSuccessfulRequest (statusUrl );
111
+ long maxOtpRunnerStartupTimeMillis = 5 * 60 * 1000 ;
112
+ if (taskHasTimedOut (statusCheckStartTime , maxOtpRunnerStartupTimeMillis )) {
113
+ failJob ("Job timed out while waiting for otp-runner to produce a status file!" );
134
114
return ;
135
115
}
136
- long bundleDownloadSeconds = (System .currentTimeMillis () - bundleDownloadStartTime ) / 1000 ;
137
- LOG .info ("Bundle downloaded in {} seconds!" , bundleDownloadSeconds );
138
- status .update ("Building graph..." , 30 );
139
116
}
140
- // Once bundle is downloaded, we await the build (or download if graph already built) of the graph.
141
- long graphCheckStartTime = System .currentTimeMillis ();
142
- String graphStatusUrl = String . join ( "/" , ipUrl , GRAPH_STATUS_FILE ) ;
143
- while (!graphIsAvailable ) {
117
+ // Wait for otp-runner to write a status that fulfills expectations of this job
118
+ statusCheckStartTime = System .currentTimeMillis ();
119
+ boolean otpRunnerCompleted = false ;
120
+ while (!otpRunnerCompleted ) {
144
121
// If the request is successful, the OTP instance has started.
145
- waitAndCheckInstanceHealth ("graph build/download check: " + graphStatusUrl );
146
- graphIsAvailable = checkForSuccessfulRequest (graphStatusUrl );
147
- // wait a maximum of 4 hours if building the graph, or 20 minutes if downloading a graph
148
- long maxGraphBuildOrDownloadWaitTimeMillis = graphAlreadyBuilt ? 20 * 60 * 1000 : 4 * 60 * 60 * 1000 ;
149
- if (taskHasTimedOut (graphCheckStartTime , maxGraphBuildOrDownloadWaitTimeMillis )) {
150
- failJob ("Job timed out while waiting for graph build/download. If this was a graph building machine, it may have run out of memory." );
122
+ waitAndCheckInstanceHealth ("otp-runner completion check: " + statusUrl );
123
+ otpRunnerCompleted = checkForOtpRunnerCompletion (statusUrl );
124
+ // Check if an otp-runner status file check has already failed this job.
125
+ if (status .error ) {
126
+ return ;
127
+ }
128
+ // wait a maximum of 5 hours if building a graph, or 1 hour if just starting a server
129
+ long maxOtpRunnerWaitTimeMillis = (graphAlreadyBuilt ? 5 : 1 ) * 60 * 60 * 1000 ;
130
+ if (taskHasTimedOut (statusCheckStartTime , maxOtpRunnerWaitTimeMillis )) {
131
+ failJob ("Job timed out while waiting for otp-runner to finish!" );
151
132
return ;
152
133
}
153
134
}
154
- // Check graph status and fail job if there was a failure.
155
- String graphStatus = getUrlAsString (graphStatusUrl );
156
- if (graphStatus == null || !graphStatus .contains ("SUCCESS" )) {
157
- failJob ("Failure encountered while building/downloading graph." );
158
- return ;
159
- }
160
- graphTaskSeconds = (System .currentTimeMillis () - graphCheckStartTime ) / 1000 ;
135
+ graphTaskSeconds = (System .currentTimeMillis () - statusCheckStartTime ) / 1000 ;
161
136
String message = String .format ("Graph build/download completed in %d seconds!" , graphTaskSeconds );
162
137
LOG .info (message );
163
138
// If only task for this instance is to build the graph (either because that is the deployment purpose or
164
139
// because this instance type/image is for graph building only), this machine's job is complete and we can
165
140
// consider this job done.
166
- if (deployment . buildGraphOnly || (! graphAlreadyBuilt && otpServer . ec2Info . hasSeparateGraphBuildConfig () )) {
141
+ if (isBuildOnlyServer ( )) {
167
142
status .completeSuccessfully (message );
168
- LOG .info ("View logs at {}" , getUserDataLogS3Path ());
143
+ LOG .info ("View logs at {}" , getOtpRunnerLogS3Path ());
169
144
return ;
170
145
}
171
- status .update ("Loading graph..." , 70 );
172
146
// Once this is confirmed, check for the availability of the router, which will indicate that the graph
173
147
// load has completed successfully.
174
148
String routerUrl = String .join ("/" , ipUrl , "otp/routers/default" );
175
149
long routerCheckStartTime = System .currentTimeMillis ();
150
+ boolean routerIsAvailable = false ;
176
151
while (!routerIsAvailable ) {
177
152
// If the request was successful, the graph build is complete!
178
153
// TODO: Substitute in specific router ID? Or just default to... "default".
@@ -220,7 +195,7 @@ public void jobLogic() {
220
195
routerUrl
221
196
)
222
197
);
223
- LOG .info ("View logs at {}" , getUserDataLogS3Path ());
198
+ LOG .info ("View logs at {}" , getOtpRunnerLogS3Path ());
224
199
deployJob .incrementCompletedServers ();
225
200
} catch (InstanceHealthException e ) {
226
201
// If at any point during the job, an instance health check indicates that the EC2 instance being monitored
@@ -232,19 +207,23 @@ public void jobLogic() {
232
207
}
233
208
}
234
209
210
+ private boolean isBuildOnlyServer () {
211
+ return deployment .buildGraphOnly || (!graphAlreadyBuilt && otpServer .ec2Info .hasSeparateGraphBuildConfig ());
212
+ }
213
+
235
214
/**
236
- * Gets the expected path to the user data logs that get uploaded to s3
215
+ * Gets the expected path to the otp-runner logs that get uploaded to s3
237
216
*/
238
- private String getUserDataLogS3Path () {
239
- return String .format ("%s/%s.log" , deployJob .getS3FolderURI (), instance .getInstanceId ());
217
+ private String getOtpRunnerLogS3Path () {
218
+ return String .format ("%s/%s-otp-runner .log" , deployJob .getS3FolderURI (), instance .getInstanceId ());
240
219
}
241
220
242
221
/**
243
222
* Helper that fails with a helpful message about where to find uploaded logs.
244
223
*/
245
224
private void failJob (String message ) {
246
225
LOG .error (message );
247
- status .fail (String .format ("%s Check logs at: %s" , message , getUserDataLogS3Path ()));
226
+ status .fail (String .format ("%s Check logs at: %s" , message , getOtpRunnerLogS3Path ()));
248
227
}
249
228
250
229
/** Determine if a specific task has passed time limit for its run time. */
@@ -299,15 +278,27 @@ public InstanceHealthException(String instanceStateName) {
299
278
}
300
279
}
301
280
302
- /** Make HTTP request to URL and return the string response. */
303
- private String getUrlAsString (String url ) {
281
+ private boolean checkForOtpRunnerCompletion (String url ) {
304
282
HttpGet httpGet = new HttpGet (url );
283
+ OtpRunnerStatus otpRunnerStatus ;
305
284
try (CloseableHttpResponse response = httpClient .execute (httpGet )) {
306
- return EntityUtils . toString (response .getEntity ());
285
+ otpRunnerStatus = JsonUtil . objectMapper . readValue (response .getEntity (). getContent (), OtpRunnerStatus . class );
307
286
} catch (IOException e ) {
308
- LOG .error ("Could not complete request to {}" , url );
287
+ LOG .error ("Could not get otp-runner status from {}" , url );
309
288
e .printStackTrace ();
310
- return null ;
289
+ return false ;
290
+ }
291
+ if (otpRunnerStatus .error ) {
292
+ failJob (otpRunnerStatus .message );
293
+ return false ;
294
+ }
295
+ status .update (otpRunnerStatus .message , otpRunnerStatus .pctProgress );
296
+ if (graphAlreadyBuilt || !isBuildOnlyServer ()) {
297
+ // server that finishes after OTP server is successfully started
298
+ return otpRunnerStatus .serverStarted ;
299
+ } else {
300
+ // server that finishes after graph is uploaded
301
+ return otpRunnerStatus .graphUploaded ;
311
302
}
312
303
}
313
304
0 commit comments