|
1 | 1 | package org.jetbrains.kotlinx.dataframe.columns
|
2 | 2 |
|
3 | 3 | import org.jetbrains.kotlinx.dataframe.DataColumn
|
| 4 | +import org.jetbrains.kotlinx.dataframe.annotations.DataSchema |
| 5 | +import org.jetbrains.kotlinx.dataframe.api.DataSchemaEnum |
| 6 | +import org.jetbrains.kotlinx.dataframe.api.add |
| 7 | +import org.jetbrains.kotlinx.dataframe.api.aggregate |
| 8 | +import org.jetbrains.kotlinx.dataframe.api.column |
4 | 9 | import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
|
| 10 | +import org.jetbrains.kotlinx.dataframe.api.fillNulls |
5 | 11 | import org.jetbrains.kotlinx.dataframe.api.filter
|
| 12 | +import org.jetbrains.kotlinx.dataframe.api.groupBy |
| 13 | +import org.jetbrains.kotlinx.dataframe.api.print |
| 14 | +import org.jetbrains.kotlinx.dataframe.api.sortBy |
| 15 | +import org.jetbrains.kotlinx.dataframe.api.toDataFrame |
| 16 | +import org.jetbrains.kotlinx.dataframe.api.with |
6 | 17 | import org.jetbrains.kotlinx.dataframe.columns.ColumnDataHolder.ColumnType.BOXED_ARRAY
|
| 18 | +import org.jetbrains.kotlinx.dataframe.columns.ColumnDataHolder.ColumnType.BOXED_ARRAY_WITH_NULL |
7 | 19 | import org.jetbrains.kotlinx.dataframe.columns.ColumnDataHolder.ColumnType.DOUBLE_ARRAY
|
8 | 20 | import org.jetbrains.kotlinx.dataframe.columns.ColumnDataHolder.ColumnType.LIST
|
| 21 | +import org.jetbrains.kotlinx.dataframe.columns.ColumnDataHolder.ColumnType.LIST_WITH_NULL |
| 22 | +import org.jetbrains.kotlinx.dataframe.math.mean |
9 | 23 | import org.junit.Test
|
| 24 | +import org.openjdk.jol.info.GraphLayout |
10 | 25 | import kotlin.random.Random
|
11 | 26 | import kotlin.time.Duration
|
12 | 27 | import kotlin.time.measureTime
|
| 28 | +import kotlin.time.measureTimedValue |
13 | 29 |
|
14 | 30 | class ColumnDataHolder {
|
15 | 31 |
|
/**
 * Kinds of column storage compared by the benchmark in this class.
 *
 * Every entry supplies the [value] string declared by [DataSchemaEnum].
 */
enum class ColumnType(override val value: String) : DataSchemaEnum {
    /** Columns created from a `List` of non-null doubles. */
    LIST("list"),

    /** Columns created from a `List<Double?>` that ends with a single `null`. */
    LIST_WITH_NULL("list with null"),

    /** Columns created from a boxed `Array` of non-null doubles. */
    BOXED_ARRAY("boxed array"),

    /** Columns created from an `Array<Double?>` that ends with a single `null`. */
    BOXED_ARRAY_WITH_NULL("boxed array with null"),

    /** Columns created from a primitive `DoubleArray`. */
    DOUBLE_ARRAY("double array"),
}
|
21 | 39 |
|
/**
 * A single benchmark measurement for one [ColumnType].
 *
 * @property type the storage variant that was measured.
 * @property creationTime time spent building the data frame.
 * @property processingTime time spent running the processing pipeline on the data frame.
 * @property size total size of the created data frame as reported by JOL's `GraphLayout`.
 */
@DataSchema
data class Result(
    val type: ColumnType,
    val creationTime: Duration,
    val processingTime: Duration,
    val size: Long,
)
| 47 | + |
| 48 | + // ⌌-------------------------------------------------------------------⌍ |
| 49 | + // | | type| creation| processing| size| |
| 50 | + // |--|----------------------|-----------|------------|----------------| |
| 51 | + // | 0| BOXED_ARRAY_WITH_NULL| 1.668690ms| 40.072489ms| 14500481.813333| |
| 52 | + // | 1| LIST_WITH_NULL| 9.142612ms| 41.064332ms| 14509001.813333| |
| 53 | + // | 2| LIST| 2.710987ms| 42.268814ms| 11496455.760000| |
| 54 | + // | 3| BOXED_ARRAY| 2.415740ms| 42.270087ms| 11502541.520000| |
| 55 | + // | 4| DOUBLE_ARRAY| 1.840757ms| 42.354001ms| 11499172.666667| |
| 56 | + // ⌎-------------------------------------------------------------------⌏ |
/**
 * Benchmarks data frame creation and processing for every [ColumnType].
 *
 * Each type is measured 300 times in shuffled order. One measurement:
 *  1. builds a data frame of three columns (`a`, `b`, `c`) holding 50 000 entries each; the
 *     `*_WITH_NULL` variants hold 49 999 random values followed by one `null`;
 *  2. times a `fillNulls` -> `filter` -> `add` pipeline on that frame;
 *  3. records the total object-graph size of the created frame via JOL.
 *
 * The per-type means are printed as a table sorted by processing time. The test asserts nothing.
 */
@Test
fun `measuring speed of ColumnDataHolder creation`() {
    val size = 50_000
    // The index is irrelevant: every cell is an independent random double.
    val content = { _: Int -> Random.nextDouble() }
    val tests = buildList {
        repeat(300) {
            add(LIST)
            add(LIST_WITH_NULL)
            add(BOXED_ARRAY)
            add(BOXED_ARRAY_WITH_NULL)
            add(DOUBLE_ARRAY)
        }
    }.shuffled() // interleave the variants so they are not measured in fixed blocks

    val results = mutableListOf<Result>()

    val a by column<Double>()
    val b by column<Double>()
    val c by column<Double>()
    val d by column<Double>()

    for (test in tests) {
        // Creation: only the construction of the columns and the frame is timed here.
        val (df, time1) = measureTimedValue {
            when (test) {
                LIST -> dataFrameOf(
                    DataColumn.createValueColumn(a.name(), List(size, content)),
                    DataColumn.createValueColumn(b.name(), List(size, content)),
                    DataColumn.createValueColumn(c.name(), List(size, content)),
                )

                LIST_WITH_NULL -> dataFrameOf(
                    DataColumn.createValueColumn(a.name(), List<Double?>(size - 1, content) + null),
                    DataColumn.createValueColumn(b.name(), List<Double?>(size - 1, content) + null),
                    DataColumn.createValueColumn(c.name(), List<Double?>(size - 1, content) + null),
                )

                BOXED_ARRAY -> dataFrameOf(
                    DataColumn.createValueColumn(a.name(), Array(size, content)),
                    DataColumn.createValueColumn(b.name(), Array(size, content)),
                    DataColumn.createValueColumn(c.name(), Array(size, content)),
                )

                BOXED_ARRAY_WITH_NULL -> dataFrameOf(
                    DataColumn.createValueColumn(a.name(), Array<Double?>(size - 1, content) + null),
                    DataColumn.createValueColumn(b.name(), Array<Double?>(size - 1, content) + null),
                    DataColumn.createValueColumn(c.name(), Array<Double?>(size - 1, content) + null),
                )

                DOUBLE_ARRAY -> dataFrameOf(
                    DataColumn.createValueColumn(a.name(), DoubleArray(size, content)),
                    DataColumn.createValueColumn(b.name(), DoubleArray(size, content)),
                    DataColumn.createValueColumn(c.name(), DoubleArray(size, content)),
                )
            }
        }

        // Processing: the pipeline result is discarded, only the elapsed time matters.
        val time2 = measureTime {
            df.fillNulls { a and b and c }.with { 0.0 }
                .filter { a() > b() }
                .add(d) { a() + b() + c() }
        }

        // Read the total directly instead of parsing it back out of the textual footprint;
        // a distinct name also avoids shadowing the row count `size` above.
        val footprintSize = GraphLayout.parseInstance(df).totalSize()

        results += Result(test, time1, time2, footprintSize)
    }

    results.toDataFrame()
        .groupBy { type }
        .aggregate {
            creationTime.toList().mean() into "creation"
            processingTime.toList().mean() into "processing"
            this.size.toList().mean() into "size"
        }
        .sortBy { "processing"() }
        .print(borders = true, title = true)
}
|
70 | 141 |
|
/**
 * Arithmetic mean of the durations in this collection.
 *
 * All elements are summed and the total is divided by the element count.
 * The collection must not be empty: `reduce` throws when there is nothing to combine.
 */
fun Collection<Duration>.mean(): Duration {
    val total = reduce { sum, next -> sum + next }
    return total / size
}
|
|
0 commit comments