RUM-8042 Replace Upload Quality with Batch Blocked telemetry #2230

Draft
wants to merge 2 commits into base: develop
12 changes: 6 additions & 6 deletions Datadog/Datadog.xcodeproj/project.pbxproj
@@ -940,8 +940,8 @@
D22743E729DEB953001A7EF9 /* UIApplicationSwizzlerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 61410166251A661D00E3C2D9 /* UIApplicationSwizzlerTests.swift */; };
D22743EB29DEC9E6001A7EF9 /* Casting+RUM.swift in Sources */ = {isa = PBXBuildFile; fileRef = 61411B0F24EC15AC0012EAB2 /* Casting+RUM.swift */; };
D22743EC29DEC9E6001A7EF9 /* Casting+RUM.swift in Sources */ = {isa = PBXBuildFile; fileRef = 61411B0F24EC15AC0012EAB2 /* Casting+RUM.swift */; };
D22789362D64A0D7007E9DB0 /* UploadQualityMetric.swift in Sources */ = {isa = PBXBuildFile; fileRef = D22789352D64A0D3007E9DB0 /* UploadQualityMetric.swift */; };
D22789372D64A0D7007E9DB0 /* UploadQualityMetric.swift in Sources */ = {isa = PBXBuildFile; fileRef = D22789352D64A0D3007E9DB0 /* UploadQualityMetric.swift */; };
D22789362D64A0D7007E9DB0 /* UploadCycleMetric.swift in Sources */ = {isa = PBXBuildFile; fileRef = D22789352D64A0D3007E9DB0 /* UploadCycleMetric.swift */; };
D22789372D64A0D7007E9DB0 /* UploadCycleMetric.swift in Sources */ = {isa = PBXBuildFile; fileRef = D22789352D64A0D3007E9DB0 /* UploadCycleMetric.swift */; };
D227A0A42C7622EA00C83324 /* BenchmarkProfiler.swift in Sources */ = {isa = PBXBuildFile; fileRef = D227A0A32C7622EA00C83324 /* BenchmarkProfiler.swift */; };
D227A0A52C7622EA00C83324 /* BenchmarkProfiler.swift in Sources */ = {isa = PBXBuildFile; fileRef = D227A0A32C7622EA00C83324 /* BenchmarkProfiler.swift */; };
D22C5BC82A98A0B20024CC1F /* Baggages.swift in Sources */ = {isa = PBXBuildFile; fileRef = D22C5BC52A989D130024CC1F /* Baggages.swift */; };
@@ -2903,7 +2903,7 @@
D2216EC22A96632F00ADAEC8 /* FeatureBaggageTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FeatureBaggageTests.swift; sourceTree = "<group>"; };
D22442C42CA301DA002E71E4 /* UIColor+SessionReplay.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = "UIColor+SessionReplay.swift"; sourceTree = "<group>"; };
D224430C29E95D6600274EC7 /* CrashReportReceiverTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = CrashReportReceiverTests.swift; sourceTree = "<group>"; };
D22789352D64A0D3007E9DB0 /* UploadQualityMetric.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = UploadQualityMetric.swift; sourceTree = "<group>"; };
D22789352D64A0D3007E9DB0 /* UploadCycleMetric.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = UploadCycleMetric.swift; sourceTree = "<group>"; };
D227A0A32C7622EA00C83324 /* BenchmarkProfiler.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BenchmarkProfiler.swift; sourceTree = "<group>"; };
D22C1F5B271484B400922024 /* LogEventMapper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LogEventMapper.swift; sourceTree = "<group>"; };
D22C5BC52A989D130024CC1F /* Baggages.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Baggages.swift; sourceTree = "<group>"; };
@@ -5048,7 +5048,7 @@
children = (
6174D60B2BFDDEDF00EC7469 /* SDKMetricFields.swift */,
A7FA98CD2BA1A6930018D6B5 /* MethodCalledMetric.swift */,
D22789352D64A0D3007E9DB0 /* UploadQualityMetric.swift */,
D22789352D64A0D3007E9DB0 /* UploadCycleMetric.swift */,
);
path = SDKMetrics;
sourceTree = "<group>";
@@ -8573,7 +8573,7 @@
D2160CA229C0DE5700FAA9A5 /* NetworkInstrumentationFeature.swift in Sources */,
D2EBEE1F29BA160F00B15732 /* HTTPHeadersReader.swift in Sources */,
E2AA55E72C32C6D9002FEF28 /* ApplicationNotifications.swift in Sources */,
D22789372D64A0D7007E9DB0 /* UploadQualityMetric.swift in Sources */,
D22789372D64A0D7007E9DB0 /* UploadCycleMetric.swift in Sources */,
D263BCAF29DAFFEB00FA0E21 /* PerformancePresetOverride.swift in Sources */,
D23039E7298D5236001A1FA3 /* NetworkConnectionInfo.swift in Sources */,
D23039E9298D5236001A1FA3 /* TrackingConsent.swift in Sources */,
@@ -9616,7 +9616,7 @@
D2160CA329C0DE5700FAA9A5 /* NetworkInstrumentationFeature.swift in Sources */,
D2EBEE2D29BA161100B15732 /* HTTPHeadersReader.swift in Sources */,
E2AA55E82C32C6D9002FEF28 /* ApplicationNotifications.swift in Sources */,
D22789362D64A0D7007E9DB0 /* UploadQualityMetric.swift in Sources */,
D22789362D64A0D7007E9DB0 /* UploadCycleMetric.swift in Sources */,
D263BCB029DAFFEB00FA0E21 /* PerformancePresetOverride.swift in Sources */,
D2DA2359298D57AA00C6C7E6 /* NetworkConnectionInfo.swift in Sources */,
D2DA235A298D57AA00C6C7E6 /* TrackingConsent.swift in Sources */,
94 changes: 49 additions & 45 deletions DatadogCore/Sources/Core/Upload/DataUploadWorker.swift
Member:
If I read this correctly, we used to send sendUploadQualityMetric in both the success and failure cases. Now we send sendUploadCycleMetric before every upload and track failures with sendBatchBlockedMetric, so we no longer explicitly send telemetry on success. I imagine we can still graph the success rate, but is this intentional?
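
For illustration only (not part of the PR): with success no longer reported explicitly, a blocked-versus-successful ratio would have to be inferred from the two counters, roughly as in the sketch below. The aggregate names are made up, and the differing sample rates (telemetry sampling plus the 1.5% "Batch Blocked" sampling) are ignored.

// Hypothetical aggregates pulled from telemetry dashboards; names are made up for this sketch.
let uploadCycleCount = 10_000.0    // count of "upload_cycle" metric events
let batchBlockedCount = 1_200.0    // count of "Batch Blocked" metric events
// Rough proxy: share of read cycles that were blocked or failed, ignoring sampling differences.
let approxBlockedShare = batchBlockedCount / (uploadCycleCount + batchBlockedCount)
print("Approximate blocked share: \(approxBlockedShare * 100)%")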

@@ -73,20 +73,23 @@ internal class DataUploadWorker: DataUploadWorkerType {
let context = contextProvider.read()
let blockersForUpload = uploadConditions.blockersForUpload(with: context)
let isSystemReady = blockersForUpload.isEmpty
let files = isSystemReady ? fileReader.readFiles(limit: maxBatchesPerUpload) : nil
if let files = files, !files.isEmpty {
let files = fileReader.readFiles(limit: maxBatchesPerUpload)

if !files.isEmpty && isSystemReady {
Comment on lines +76 to +78
Contributor:
Can we try to assess whether this change might have a negative performance impact? I don't think it would, because contentsOfDirectory (the underlying file-system method used) only returns URLs, not the content of each file, but maybe it's worth checking with maxBatchesPerUpload files every time?
In the blocked path we only use files to get their count for the telemetry, so it would be bad if this also had a performance impact.

Contributor:
The files, with their InputStream, will be needed for self.uploadFile(from: files.reversed(), context: context) anyway.
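
If someone wants to sanity-check the cost raised above, a minimal timing sketch could look like the following. It is not part of the PR; the helper name is made up, and the commented usage assumes the worker's own fileReader and maxBatchesPerUpload, as in the diff.

import Dispatch
import Foundation

/// Hypothetical helper: measures how long a closure takes, in seconds.
func measureSeconds(_ work: () -> Void) -> TimeInterval {
    let start = DispatchTime.now()
    work()
    return TimeInterval(DispatchTime.now().uptimeNanoseconds - start.uptimeNanoseconds) / 1_000_000_000
}

// Possible usage inside the worker's read block (sketch only):
// let seconds = measureSeconds { _ = fileReader.readFiles(limit: maxBatchesPerUpload) }
// DD.logger.debug("readFiles(limit:) took \(seconds * 1_000) ms")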

DD.logger.debug("⏳ (\(self.featureName)) Uploading batches...")
self.backgroundTaskCoordinator?.beginBackgroundTask()
self.uploadFile(from: files.reversed(), context: context)
sendUploadCycleMetric()
} else {
let batchLabel = files?.isEmpty == false ? "YES" : (isSystemReady ? "NO" : "NOT CHECKED")
let batchLabel = files.isEmpty ? "NO" : "YES"
Member:
I imagine the answer is no, but does no longer sending "NOT CHECKED" have any consequences for our metrics?

Member (Author):
No, it doesn't; it was just for console printing. The drawback of this change is that we now check for batches even when the upload is blocked, but the impact is minimal because we only list the files.

DD.logger.debug("💡 (\(self.featureName)) No upload. Batch to upload: \(batchLabel), System conditions: \(blockersForUpload.description)")
self.delay.increase()
self.backgroundTaskCoordinator?.endBackgroundTask()
self.scheduleNextCycle()
sendUploadQualityMetric(blockers: blockersForUpload)
sendBatchBlockedMetric(blockers: blockersForUpload, batchCount: files.count)
}
}

self.readWork = readWorkItem

// Start sending batches immediately after initialization:
@@ -106,6 +109,7 @@ internal class DataUploadWorker: DataUploadWorkerType {
return
}

let filesCount = files.count
var files = files
guard let file = files.popLast() else {
self.scheduleNextCycle()
@@ -121,12 +125,12 @@
)

previousUploadStatus = uploadStatus
sendUploadQualityMetric(status: uploadStatus)

if uploadStatus.needsRetry {
DD.logger.debug(" → (\(self.featureName)) not delivered, will be retransmitted: \(uploadStatus.userDebugDescription)")
self.delay.increase()
self.scheduleNextCycle()
sendBatchBlockedMetric(status: uploadStatus, batchCount: filesCount)
return
}

@@ -147,8 +151,7 @@
previousUploadStatus = nil

if let error = uploadStatus.error {
// Throw to report the request error accordingly
throw error
throw error // Throw to report the request error accordingly
}
} catch DataUploadError.httpError(statusCode: .unauthorized), DataUploadError.httpError(statusCode: .forbidden) {
DD.logger.error("⚠️ Make sure that the provided token still exists and you're targeting the relevant Datadog site.")
@@ -166,7 +169,6 @@
self.fileReader.markBatchAsRead(batch, reason: .invalid)
previousUploadStatus = nil
self.telemetry.error("Failed to initiate '\(self.featureName)' data upload", error: error)
sendUploadQualityMetric(failure: "invalid")
}
}

@@ -235,55 +237,57 @@
}
}

private func sendUploadQualityMetric(blockers: [DataUploadConditions.Blocker]) {
guard !blockers.isEmpty else {
return sendUploadQualityMetric()
}

sendUploadQualityMetric(
failure: "blocker",
blockers: blockers.map {
switch $0 {
case .battery: return "low_battery"
case .lowPowerModeOn: return "lpm"
case .networkReachability: return "offline"
}
}
private func sendUploadCycleMetric() {
telemetry.metric(
name: UploadCycleMetric.name,
attributes: [UploadCycleMetric.track: featureName]
)
}

private func sendUploadQualityMetric(status: DataUploadStatus) {
guard let error = status.error else {
return sendUploadQualityMetric()
private func sendBatchBlockedMetric(blockers: [DataUploadConditions.Blocker], batchCount: Int) {
guard !blockers.isEmpty else {
return
}

sendUploadQualityMetric(
failure: {
switch error {
case let .httpError(code): return "\(code)"
case let .networkError(error): return "\(error.code)"
}
}()
)
}

private func sendUploadQualityMetric() {
telemetry.metric(
name: UploadQualityMetric.name,
name: BatchBlockedMetric.name,
attributes: [
UploadQualityMetric.track: featureName
]
SDKMetricFields.typeKey: BatchBlockedMetric.typeValue,
BatchMetric.trackKey: featureName,
BatchBlockedMetric.uploaderDelayKey: delay.current,
BatchBlockedMetric.batchCount: batchCount,
BatchBlockedMetric.blockers: blockers.map {
switch $0 {
case .battery: return "low_battery"
case .lowPowerModeOn: return "lpm"
case .networkReachability: return "offline"
}
}
],
sampleRate: BatchBlockedMetric.sampleRate
)
}

private func sendUploadQualityMetric(failure: String, blockers: [String] = []) {
private func sendBatchBlockedMetric(status: DataUploadStatus, batchCount: Int) {
guard let error = status.error else {
return
}

telemetry.metric(
name: UploadQualityMetric.name,
name: BatchBlockedMetric.name,
attributes: [
UploadQualityMetric.track: featureName,
UploadQualityMetric.failure: failure,
UploadQualityMetric.blockers: blockers
]
SDKMetricFields.typeKey: BatchBlockedMetric.typeValue,
BatchMetric.trackKey: featureName,
BatchBlockedMetric.uploaderDelayKey: delay.current,
BatchBlockedMetric.batchCount: batchCount,
BatchBlockedMetric.failure: {
switch error {
case let .httpError(code): return "intake-code-\(code.rawValue)"
case let .networkError(error): return "network-code-\(error.code)"
}
}()
],
sampleRate: BatchBlockedMetric.sampleRate
)
}
}
20 changes: 20 additions & 0 deletions DatadogCore/Sources/SDKMetrics/BatchMetrics.swift
@@ -130,3 +130,23 @@ internal enum BatchClosedMetric {
/// If the batch was closed by core or after new batch was forced by the feature.
static let forcedNewKey = "forced_new"
}

/// Definition of "Batch Blocked" telemetry.
internal enum BatchBlockedMetric {
/// The name of this metric, included in telemetry log.
/// Note: the "[Mobile Metric]" prefix is added when sending this telemetry in RUM.
static let name = "Batch Blocked"
/// Metric type value.
static let typeValue = "batch blocked"
/// The sample rate for this metric.
/// It is applied in addition to the telemetry sample rate (20% by default).
static let sampleRate: Float = 1.5 // 1.5%
/// The key for uploader's current delay.
static let uploaderDelayKey = "uploader_delay.current"
/// The key for count of bacthes being blocked.
Member:
nit: The key for the number of batches being blocked.

static let batchCount = "batch_count"

/// List of upload blockers
static let blockers = "blockers"
static let failure = "failure"
}
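
For readers skimming the diff, the sketch below shows roughly how these keys are consumed. It mirrors DataUploadWorker.sendBatchBlockedMetric(blockers:batchCount:) from the diff above, with a hard-coded batch count and blocker list for illustration; telemetry, featureName, and delay are the worker's own properties.

// Illustrative emission of a "Batch Blocked" metric, mirroring the worker code above.
telemetry.metric(
    name: BatchBlockedMetric.name,                             // "Batch Blocked"
    attributes: [
        SDKMetricFields.typeKey: BatchBlockedMetric.typeValue, // "batch blocked"
        BatchMetric.trackKey: featureName,                     // e.g. "rum"
        BatchBlockedMetric.uploaderDelayKey: delay.current,    // uploader's current delay
        BatchBlockedMetric.batchCount: 3,                      // hypothetical: 3 batches waiting
        BatchBlockedMetric.blockers: ["offline", "low_battery"]
    ],
    sampleRate: BatchBlockedMetric.sampleRate                  // applied on top of telemetry sampling
)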
76 changes: 52 additions & 24 deletions DatadogCore/Tests/Datadog/Core/Upload/DataUploadWorkerTests.swift
@@ -89,10 +89,8 @@ class DataUploadWorkerTests: XCTestCase {
XCTAssertEqual(try orchestrator.directory.files().count, 0)

XCTAssertEqual(telemetry.messages.count, 3)
let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_quality"), "An upload quality metric should be send to `telemetry`.")
let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_cycle"), "An upload cycle metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["track"] as? String, featureName)
XCTAssertNil(metric.attributes["failure"])
XCTAssertNil(metric.attributes["blockers"])
}

func testItUploadsDataSequentiallyWithoutDelay_whenMaxBatchesPerUploadIsSet() throws {
Expand Down Expand Up @@ -141,11 +139,9 @@ class DataUploadWorkerTests: XCTestCase {
worker.cancelSynchronously()
XCTAssertEqual(try orchestrator.directory.files().count, 1)

XCTAssertEqual(telemetry.messages.count, 2)
let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_quality"), "An upload quality metric should be send to `telemetry`.")
XCTAssertEqual(telemetry.messages.count, 1)
let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_cycle"), "An upload cycle metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["track"] as? String, featureName)
XCTAssertNil(metric.attributes["failure"])
XCTAssertNil(metric.attributes["blockers"])
}

func testGivenDataToUpload_whenUploadFinishesAndDoesNotNeedToBeRetried_thenDataIsDeleted() {
@@ -526,7 +522,7 @@
)
}

func testWhenUploadIsBlocked_itDoesSendUploadQualityTelemetry() throws {
func testWhenUploadIsBlocked_itDoesSendBatchBlockedTelemetry() throws {
// Given
let telemetry = TelemetryMock()

@@ -566,9 +562,9 @@

// Then
XCTAssertEqual(telemetry.messages.count, 1)
let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_quality"), "An upload quality metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["failure"] as? String, "blocker")
let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "Batch Blocked"), "A Batch blocked metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["blockers"] as? [String], ["offline", "low_battery"])
XCTAssertNil(metric.attributes["failure"])
XCTAssertEqual(metric.attributes["track"] as? String, featureName)
}

@@ -614,9 +610,7 @@

// Then
XCTAssertEqual(telemetry.messages.count, 1)
let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_quality"), "An upload quality metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["failure"] as? String, "\(randomStatusCode)")
XCTAssertEqual(metric.attributes["track"] as? String, featureName)
XCTAssertNotNil(telemetry.messages.firstMetric(named: "upload_cycle"), "An upload cycle metric should be send to `telemetry`.")
}

func testWhenDataIsUploadedWithAlertingStatusCode_itSendsErrorTelemetry() throws {
@@ -660,14 +654,9 @@
worker.cancelSynchronously()

// Then
XCTAssertEqual(telemetry.messages.count, 2)

let error = try XCTUnwrap(telemetry.messages.firstError(), "An error should be send to `telemetry`.")
XCTAssertEqual(error.message,"Data upload finished with status code: \(randomStatusCode.rawValue)")

let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_quality"), "An upload quality metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["failure"] as? String, "\(randomStatusCode)")
XCTAssertEqual(metric.attributes["track"] as? String, featureName)
}

func testWhenDataCannotBeUploadedDueToNetworkError_itSendsErrorTelemetry() throws {
@@ -705,13 +694,53 @@
worker.cancelSynchronously()

// Then
XCTAssertEqual(telemetry.messages.count, 2)

let error = try XCTUnwrap(telemetry.messages.firstError(), "An error should be send to `telemetry`.")
XCTAssertEqual(error.message, #"Data upload finished with error - Error Domain=abc Code=0 "(null)""#)
}

func testWhenDataIsUploadedWithRetryableStatusCode_itSendsBatchBlockedTelemetry() throws {
// Given
let telemetry = TelemetryMock()

writer.write(value: ["key": "value"])
let randomStatusCode: HTTPResponseStatusCode = [
.requestTimeout,
.tooManyRequests,
.internalServerError,
.serviceUnavailable
].randomElement()!

// When
let startUploadExpectation = self.expectation(description: "Upload has started")
let mockDataUploader = DataUploaderMock(
uploadStatus: .mockWith(needsRetry: true, error: .httpError(statusCode: randomStatusCode))
)

mockDataUploader.onUpload = { previousUploadStatus in
XCTAssertNil(previousUploadStatus)
startUploadExpectation.fulfill()
}

let featureName: String = .mockRandom()
let worker = DataUploadWorker(
queue: uploaderQueue,
fileReader: reader,
dataUploader: mockDataUploader,
contextProvider: .mockAny(),
uploadConditions: .alwaysUpload(),
delay: DataUploadDelay(performance: UploadPerformanceMock.veryQuickInitialUpload),
featureName: featureName,
telemetry: telemetry,
maxBatchesPerUpload: .mockRandom(min: 1, max: 100)
)

wait(for: [startUploadExpectation], timeout: 0.5)
worker.cancelSynchronously()

let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_quality"), "An upload quality metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["failure"] as? String, "\(nserror.code)")
// Then
let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "Batch Blocked"), "A Batch Blocked metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["failure"] as? String, "intake-code-\(randomStatusCode.rawValue)")
XCTAssertNil(metric.attributes["blockers"])
XCTAssertEqual(metric.attributes["track"] as? String, featureName)
}

@@ -751,8 +780,7 @@
let error = try XCTUnwrap(telemetry.messages.firstError(), "An error should be send to `telemetry`.")
XCTAssertEqual(error.message, #"Failed to initiate 'some-feature' data upload - Failed to prepare upload"#)

let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_quality"), "An upload quality metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["failure"] as? String, "invalid")
let metric = try XCTUnwrap(telemetry.messages.firstMetric(named: "upload_cycle"), "An upload cycle metric should be send to `telemetry`.")
XCTAssertEqual(metric.attributes["track"] as? String, "some-feature")
}
