Skip to content

Commit 42b09b2

Browse files
authored
#4685 EventQueueGet shouldn't use llcorehttp's backoff-retry logic
- Event poll already has own backoff logic with 10 retries, using llcorehttp's one on top makes no sense. - Better logging coverage - Better in-code documentation - Add timing-based error detection
1 parent e33b5c5 commit 42b09b2

File tree

2 files changed

+70
-19
lines changed

2 files changed

+70
-19
lines changed

indra/newview/lleventpoll.cpp

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,6 @@ namespace Details
5454
void stop();
5555

5656
private:
57-
// We will wait RETRY_SECONDS + (errorCount * RETRY_SECONDS_INC) before retrying after an error.
58-
// This means we attempt to recover relatively quickly but back off giving more time to recover
59-
// until we finally give up after MAX_EVENT_POLL_HTTP_ERRORS attempts.
60-
static const F32 EVENT_POLL_ERROR_RETRY_SECONDS;
61-
static const F32 EVENT_POLL_ERROR_RETRY_SECONDS_INC;
62-
static const S32 MAX_EVENT_POLL_HTTP_ERRORS;
63-
6457
void eventPollCoro(std::string url);
6558

6659
void handleMessage(const LLSD &content);
@@ -76,9 +69,13 @@ namespace Details
7669
};
7770

7871

79-
const F32 LLEventPollImpl::EVENT_POLL_ERROR_RETRY_SECONDS = 15.f; // ~ half of a normal timeout.
80-
const F32 LLEventPollImpl::EVENT_POLL_ERROR_RETRY_SECONDS_INC = 5.f; // ~ half of a normal timeout.
81-
const S32 LLEventPollImpl::MAX_EVENT_POLL_HTTP_ERRORS = 10; // ~5 minutes, by the above rules.
72+
// We will wait RETRY_SECONDS + (errorCount * RETRY_SECONDS_INC) before retrying after an error.
73+
// This means we attempt to recover relatively quickly but back off giving more time to recover
74+
// until we finally give up after MAX_EVENT_POLL_HTTP_ERRORS attempts.
75+
constexpr F32 EVENT_POLL_ERROR_RETRY_SECONDS = 15.f; // ~ half of a normal timeout.
76+
constexpr F32 EVENT_POLL_ERROR_RETRY_SECONDS_INC = 5.f; // ~ half of a normal timeout.
77+
constexpr S32 MAX_EVENT_POLL_HTTP_ERRORS = 10; // ~5 minutes, by the above rules.
78+
constexpr F64 MIN_SECONDS_PASSED = 10.0; // Minimum time we expect the server to hold the request.
8279

8380
int LLEventPollImpl::sNextCounter = 1;
8481

@@ -151,11 +148,17 @@ namespace Details
151148
LLSD acknowledge;
152149
int errorCount = 0;
153150
int counter = mCounter; // saved on the stack for logging.
151+
LLTimer message_time;
154152

155153
LL_DEBUGS("LLEventPollImpl") << " <" << counter << "> entering coroutine." << LL_ENDL;
156154

157155
mAdapter = httpAdapter;
158156

157+
// This is a loop with its own waitToRetry implementation,
158+
// so disable retries.
159+
LLCore::HttpOptions::ptr_t httpOpts(new LLCore::HttpOptions);
160+
httpOpts->setRetries(0);
161+
159162
LL::WorkQueue::ptr_t main_queue = nullptr;
160163

161164
// HACK -- grab the mainloop workqueue to move execution of the handler
@@ -172,11 +175,13 @@ namespace Details
172175
request["ack"] = acknowledge;
173176
request["done"] = mDone;
174177

178+
message_time.reset();
179+
175180
// LL_DEBUGS("LLEventPollImpl::eventPollCoro") << "<" << counter << "> request = "
176181
// << LLSDXMLStreamer(request) << LL_ENDL;
177182

178183
LL_DEBUGS("LLEventPollImpl") << " <" << counter << "> posting and yielding." << LL_ENDL;
179-
LLSD result = httpAdapter->postAndSuspend(mHttpRequest, url, request);
184+
LLSD result = httpAdapter->postAndSuspend(mHttpRequest, url, request, httpOpts);
180185

181186
// LL_DEBUGS("LLEventPollImpl::eventPollCoro") << "<" << counter << "> result = "
182187
// << LLSDXMLStreamer(result) << LL_ENDL;
@@ -194,25 +199,44 @@ namespace Details
194199

195200
if (!status)
196201
{
197-
if (status == LLCore::HttpStatus(LLCore::HttpStatus::EXT_CURL_EASY, CURLE_OPERATION_TIMEDOUT))
198-
{ // A standard timeout response we get this when there are no events.
199-
LL_DEBUGS("LLEventPollImpl") << "All is very quiet on target server. It may have gone idle?" << LL_ENDL;
200-
errorCount = 0;
201-
continue;
202+
if (status == LLCore::HttpStatus(LLCore::HttpStatus::EXT_CURL_EASY, CURLE_OPERATION_TIMEDOUT) // A standard timeout, no events.
203+
|| status == LLCore::HttpStatus(HTTP_BAD_GATEWAY) // An expected 'No events' case.
204+
|| status == LLCore::HttpStatus(HTTP_INTERNAL_ERROR)
205+
|| status == LLCore::HttpStatus(HTTP_SERVICE_UNAVAILABLE)
206+
|| status == LLCore::HttpStatus(HTTP_GATEWAY_TIME_OUT))
207+
{
208+
if (message_time.getElapsedSeconds() < MIN_SECONDS_PASSED)
209+
{
210+
// Server is supposed to hold request for 20 to 30 seconds.
211+
// If it didn't hold the request at least for 10s, treat as an error.
212+
LL_WARNS("LLEventPollImpl") << "Response arrived too early, status: " << status.toTerseString()
213+
<< ", time passed: " << message_time.getElapsedSeconds() << LL_ENDL;
214+
}
215+
else
216+
{
217+
// Timeout, expected and means 'no events'. Request is to be re-issued immediately.
218+
// Current definition of a timeout is any of :
219+
// - libcurl easy 28 status code
220+
// - Linden 499 special http status code
221+
// - RFC - standard 502 - 504 http status codes
222+
LL_DEBUGS("LLEventPollImpl") << "No events, from: " << mSenderIp <<" status: " << (S32)status.getStatus() << LL_ENDL;
223+
errorCount = 0;
224+
continue;
225+
}
202226
}
203227
else if ((status == LLCore::HttpStatus(LLCore::HttpStatus::LLCORE, LLCore::HE_OP_CANCELED)) ||
204228
(status == LLCore::HttpStatus(HTTP_NOT_FOUND)))
205229
{ // Event polling for this server has been canceled. In
206230
// some cases the server gets ahead of the viewer and will
207231
// return a 404 error (Not Found) before the cancel event
208232
// comes back in the queue
209-
LL_WARNS("LLEventPollImpl") << "Canceling coroutine" << LL_ENDL;
233+
LL_WARNS("LLEventPollImpl") << "<" << counter << "> Canceling coroutine, status: " << status.toTerseString() << LL_ENDL;
210234
break;
211235
}
212236
else if (!status.isHttpStatus())
213237
{
214238
/// Some LLCore or LIBCurl error was returned. This is unlikely to be recoverable
215-
LL_WARNS("LLEventPollImpl") << "Critical error from poll request returned from libraries. Canceling coroutine." << LL_ENDL;
239+
LL_WARNS("LLEventPollImpl") << "<" << counter << "> Critical error from poll request returned from libraries. Canceling coroutine." << LL_ENDL;
216240
break;
217241
}
218242
LL_WARNS("LLEventPollImpl") << "<" << counter << "> Error result from LLCoreHttpUtil::HttpCoroHandler. Code "
@@ -255,6 +279,10 @@ namespace Details
255279
LL_WARNS("LLEventPollImpl") << "< " << counter << "> Forcing disconnect due to stalled main region event poll." << LL_ENDL;
256280
LLAppViewer::instance()->forceDisconnect(LLTrans::getString("AgentLostConnection"));
257281
}
282+
else
283+
{
284+
LL_WARNS("LLEventPollImpl") << "< " << counter << "> Stopping event poll for " << mSenderIp << " due to failures." << LL_ENDL;
285+
}
258286
break;
259287
}
260288
}

indra/newview/lleventpoll.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,30 @@ namespace Details
4040

4141

4242
class LLEventPoll
43-
///< implements the viewer side of server-to-viewer pushed events.
43+
///< Implements the viewer side of server-to-viewer pushed events.
44+
///
45+
/// This class implements the sole consumer of the EventQueueGet capability
46+
/// and delivers data, including llsd-encoded llmessage payloads, from
47+
/// simulator to viewer.
48+
///
49+
/// https://wiki.secondlife.com/wiki/EventQueueGet
50+
/// The wiki page is neither complete nor entirely correct. Request timeouts
51+
/// became the de facto method of returning an empty event set to the viewer.
52+
/// But the timeout behavior was never defined. It was simply whatever
53+
/// behavior a given grid implementation implemented.
54+
///
55+
/// In SL's case, the path may include reverse proxies, http caches, http and
56+
/// socks proxies, transparent hijacking, and other horrors. A pitfall for
57+
/// implementors.
58+
///
59+
/// Current definition of a timeout is any of :
60+
/// - libcurl easy 28 status code
61+
/// - Linden 499 special http status code
62+
/// - RFC - standard 502 - 504 http status codes
63+
/// If requests are failing too quickly with the above errors, they are treated
64+
/// as actual errors and not an empty payload. These will count towards a final
65+
/// error declaration and can lead to disconnection from a simulator or the
66+
/// entire grid.
4467
{
4568
public:
4669
LLEventPoll(const std::string& pollURL, const LLHost& sender);

0 commit comments

Comments
 (0)