diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/aggregation/AggregateDsl.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/aggregation/AggregateDsl.kt index ea3226ec79..2c4df46cf7 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/aggregation/AggregateDsl.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/aggregation/AggregateDsl.kt @@ -1,6 +1,7 @@ package org.jetbrains.kotlinx.dataframe.aggregation import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload import org.jetbrains.kotlinx.dataframe.annotations.HasSchema import org.jetbrains.kotlinx.dataframe.annotations.Interpretable import org.jetbrains.kotlinx.dataframe.api.ColumnSelectionDsl @@ -9,6 +10,7 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor import org.jetbrains.kotlinx.dataframe.impl.aggregation.ValueWithDefault import org.jetbrains.kotlinx.dataframe.impl.aggregation.receivers.internal import org.jetbrains.kotlinx.dataframe.impl.columnName +import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API import kotlin.reflect.KProperty import kotlin.reflect.typeOf @@ -21,9 +23,13 @@ public abstract class AggregateDsl : public inline infix fun R.into(name: String): NamedValue = internal().yield(pathOf(name), this, typeOf()) + @Deprecated(DEPRECATED_ACCESS_API) + @AccessApiOverload public inline infix fun R.into(column: ColumnAccessor): NamedValue = internal().yield(pathOf(column.name()), this, typeOf()) + @Deprecated(DEPRECATED_ACCESS_API) + @AccessApiOverload public inline infix fun R.into(column: KProperty): NamedValue = internal().yield(pathOf(column.columnName), this, typeOf()) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/group.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/group.kt index 9367a8e6ad..6357f00e5e 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/group.kt +++ 
b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/group.kt @@ -24,7 +24,8 @@ import kotlin.reflect.KProperty // region DataFrame /** - * Groups the specified [columns\] within the [DataFrame]. + * Groups the specified [columns\] within the [DataFrame] into + * a [column group][ColumnGroup]. * * This function does not immediately group the columns but instead select columns to group and * returns a [GroupClause], diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt index 5c670a77ef..6e364189a6 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt @@ -2,16 +2,31 @@ package org.jetbrains.kotlinx.dataframe.api import org.jetbrains.kotlinx.dataframe.AnyColumnReference import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.ColumnSelector import org.jetbrains.kotlinx.dataframe.ColumnsSelector +import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataFrameExpression import org.jetbrains.kotlinx.dataframe.DataRow +import org.jetbrains.kotlinx.dataframe.RowExpression +import org.jetbrains.kotlinx.dataframe.RowFilter import org.jetbrains.kotlinx.dataframe.Selector import org.jetbrains.kotlinx.dataframe.aggregation.Aggregatable +import org.jetbrains.kotlinx.dataframe.aggregation.AggregateDsl +import org.jetbrains.kotlinx.dataframe.aggregation.ColumnsForAggregateSelectionDsl import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload import org.jetbrains.kotlinx.dataframe.annotations.Interpretable import org.jetbrains.kotlinx.dataframe.annotations.Refine +import org.jetbrains.kotlinx.dataframe.api.GroupByDocs.Grammar import org.jetbrains.kotlinx.dataframe.columns.FrameColumn import org.jetbrains.kotlinx.dataframe.columns.toColumnSet +import 
org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls +import org.jetbrains.kotlinx.dataframe.documentation.DslGrammarLink +import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources +import org.jetbrains.kotlinx.dataframe.documentation.Indent +import org.jetbrains.kotlinx.dataframe.documentation.LineBreak +import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns +import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns.OPERATION import org.jetbrains.kotlinx.dataframe.impl.aggregation.PivotImpl import org.jetbrains.kotlinx.dataframe.impl.api.getPivotColumnPaths import org.jetbrains.kotlinx.dataframe.impl.api.groupByImpl @@ -21,14 +36,289 @@ import kotlin.reflect.KProperty // region DataFrame /** + * Groups the rows of this [DataFrame] based on the values in one or more specified [key columns][cols]. + * Each unique value in a key column — or a unique tuple of values for multiple columns — + * defines the group consisting of all rows where the column(s) contain that value combination. * - * @param cols key columns; Column for grouping can be created inplace + * Returns a [GroupBy] — a dataframe-like structure that contains all unique combinations of key values + * along with the corresponding groups of rows (each represented as a [DataFrame]). * - * `df.groupBy { expr("columnName") { "someColumn"() + 15 } }` + * A [GroupBy] can then be: + * * [transformed][Transformation] into a new [GroupBy]; + * * [reduced][Reducing] into a [DataFrame], where each group is collapsed into a single representative row; + * * [aggregated][Aggregation] into a [DataFrame], where each group is transformed into one or more rows of derived values; + * * [pivoted][Pivoting] into a [PivotGroupBy] structure, which combines [pivot] and [groupBy] operations. 
* - * is equivalent to + * Grouping keys can also be created inline: + * ```kotlin + * // Create a new column "newName" based on existing "oldName" values + * // and use it as a grouping key: + * df.groupBy { expr("newName") { oldName.drop(5) } } + * ``` * - * `df.add("columnName") { "someColumn"() + 15 }.groupBy("columnName")` + * Check out [Grammar]. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See [Selecting Columns][GroupBySelectingOptions]. + * + * For more information: {@include [DocumentationUrls.GroupBy]} + * + * Don't confuse this with [group], which groups column into + * [column group][org.jetbrains.kotlinx.dataframe.columns.ColumnGroup]. + */ +internal interface GroupByDocs { + /** + * ## [groupBy][groupBy] Operation Grammar + * {@include [LineBreak]} + * {@include [DslGrammarLink]} + * {@include [LineBreak]} + * + * ### Create and transform [GroupBy] + * + * [**`groupBy`**][groupBy]**`(`**`moveToTop: `[`Boolean`][Boolean]**` = true) { `**`columns: `[`ColumnsSelector`][ColumnsSelector]**` }`** + * + * {@include [Indent]} + * `\[ `__`.`__[**`sortByGroup`**][GroupBy.sortByGroup]**`() `**`]` + * + * {@include [Indent]} + * `\[ `__`.`__[**`sortByGroupDesc`**][GroupBy.sortByGroupDesc]**`() `**`]` + * + * {@include [Indent]} + * `\[ `__`.`__[**`sortByCount`**][GroupBy.sortByCount]**`() `**`]` + * + * {@include [Indent]} + * `\[ `__`.`__[**`sortByCountAsc`**][GroupBy.sortByCountAsc]**`() `**`]` + * + * {@include [Indent]} + * `\[ `__`.`__[**`sortByKey`**][GroupBy.sortByKey]**`() `**`]` + * + * {@include [Indent]} + * `\[ `__`.`__[**`sortByKeyDesc`**][GroupBy.sortByKeyDesc]**`() `**`]` + * + * {@include [Indent]} + * `\[ `__`.`__[**`sortBy`**][GroupBy.sortBy]**` { `**`columns: `[`ColumnsSelector`][ColumnsSelector]**` } `**`]` + * + * {@include [Indent]} + * `\[ `__`.`__[**`sortByDesc`**][GroupBy.sortByDesc]**` { `**`columns: `[`ColumnsSelector`][ColumnsSelector]**` } `**`]` + * + * {@include [Indent]} + * `\[ 
`__`.`__[**`updateGroups`**][GroupBy.updateGroups]**` { `**`frameExpression`**` } `**`]` + * + * {@include [Indent]} + * `\[ `__`.`__[**`filter`**][GroupBy.filter]**` { `**`predicate: `[`GroupedRowFilter`][GroupedRowFilter]**` } `**`]` + * + * {@include [Indent]} + * `\[ `__`.`__[**`add`**][GroupBy.add]**`(`**`column: `[`DataColumn`][DataColumn]**`) { `**`rowExpression: `[`RowExpression`][RowExpression]**` } `**`]` + * + * ### Reduce [GroupBy] into [DataFrame] + * + * {@include [Indent]} + * [GroupBy][GroupBy]`.`[**`minBy`**][GroupBy.minBy]**` { `**`column: `[`ColumnSelector`][ColumnSelector]**` }`** + * + * {@include [Indent]} + * `| `__`.`__[**`maxBy`**][GroupBy.maxBy]**` { `**`column: `[`ColumnSelector`][ColumnSelector]**` }`** + * + * {@include [Indent]} + * `| `__`.`__[**`first`**][GroupBy.first]` \[ `**` { `**`rowCondition: `[`RowFilter`][RowFilter]**` } `**`]` + * + * {@include [Indent]} + * `| `__`.`__[**`last`**][GroupBy.last]` \[ `**` { `**`rowCondition: `[`RowFilter`][RowFilter]**` } `**`]` + * + * {@include [Indent]} + * __`.`__[**`concat`**][ReducedGroupBy.concat]**`() `** + * + * {@include [Indent]} + * `| `__`.`__[**`into`**][ReducedGroupBy.into]**`(`**`column: `[`String`][String]**`) `**` \[ `**`{ `**`rowExpression: `[`RowExpression`][RowExpression]**` } `**`]` + * + * {@include [Indent]} + * `| `__`.`__[**`values`**][ReducedGroupBy.values]**` { `**`valueColumns: `[`ColumnsSelector`][ColumnsSelector]**` }`** + * + * ### Aggregate [GroupBy] into [DataFrame] + * + * {@include [Indent]} + * [GroupBy][GroupBy]`.`[**`concat`**][GroupBy.concat]**`() `** + * + * {@include [Indent]} + * `| `__`.`__[**`concatWithKeys`**][GroupBy.concatWithKeys]**`() `** + * + * {@include [Indent]} + * `| `__`.`__[**`into`**][GroupBy.into]**`(`**`column: `[`String`][String]**`) `**` \[ `**`{ `**`rowExpression: `[`RowExpression`][RowExpression]**` } `**`]` + * + * {@include [Indent]} + * `| `__`.`__[**`values`**][Grouped.values]**` { `**`valueColumns: 
`[`ColumnsSelector`][ColumnsSelector]**` }`** + * + * {@include [Indent]} + * `| `__`.`__[**`count`**][Grouped.count]**`() `** + * + * {@include [Indent]} + * `| `__`.`__[**`aggregate`**][Grouped.aggregate]**` { `**`aggregations: `[`AggregateDsl`][AggregateDsl]**` }`** + * + * {@include [Indent]} + * `| `__`.`__[][AggregationStatistics] + * + * ### Pivot [GroupBy] into [PivotGroupBy] and reduce / aggregate it + * + * {@include [Indent]} + * `| `__`.`__[**`pivot`**][GroupBy.pivot]**` { `**`columns: `[`ColumnsSelector`][ColumnsSelector]**` }`** + * + * {@include [Indent]} + * ` \[ `__`.`__[**`default`**][PivotGroupBy.default]**`(`**`defaultValue`**`) `**`]` + * + * {@include [Indent]} + * `| `__`.`__[][PivotGroupByDocs.Reducing] + * + * {@include [Indent]} + * `| `__`.`__[][PivotGroupByDocs.Aggregation] + * + * Check out [PivotGroupBy Grammar][PivotGroupByDocs.Grammar] for more information. + */ + interface Grammar + + /** + * {@comment Version of [SelectingColumns] with correctly filled in examples} + * @include [SelectingColumns] {@include [SetGroupByOperationArg]} + */ + interface GroupBySelectingOptions + + /** + * ### [GroupBy] aggregation statistics + * * [count][Grouped.count] + * * [max][Grouped.max]/[maxOf][Grouped.maxOf]/[maxFor][Grouped.maxFor] + * * [min][Grouped.min]/[minOf][Grouped.minOf]/[minFor][Grouped.minFor] + * * [sum][Grouped.sum]/[sumOf][Grouped.sumOf]/[sumFor][Grouped.sumFor] + * * [mean][Grouped.mean]/[meanOf][Grouped.meanOf]/[meanFor][Grouped.meanFor] + * * [std][Grouped.std]/[stdOf][Grouped.stdOf]/[stdFor][Grouped.stdFor] + * * [median][Grouped.median]/[medianOf][Grouped.medianOf]/[medianFor][Grouped.medianFor] + * * [percentile][Grouped.percentile]/[percentileOf][Grouped.percentileOf]/[percentileFor][Grouped.percentileFor] + */ + interface AggregationStatistics + + /** + * ### [GroupBy] transformations + * + * A [GroupBy] can be transformed into a new [GroupBy] using one of the following methods: + * * [sortByGroup][GroupBy.sortByGroup] / 
[sortByGroupDesc][GroupBy.sortByGroupDesc] — sorts the **order + * of groups** (and their corresponding keys) by values computed with a [DataFrameExpression] applied to each group; + * * [sortByCount][GroupBy.sortByCount] / [sortByCountAsc][GroupBy.sortByCountAsc] — sorts the **order + * of groups** (and their corresponding keys) by the number of rows they contain; + * * [sortByKey][GroupBy.sortByKey] / [sortByKeyDesc][GroupBy.sortByKeyDesc] — sorts the **order + * of groups** (and their corresponding keys) by the grouping key values; + * * [sortBy][GroupBy.sortBy] / [sortByDesc][GroupBy.sortByDesc] — sorts the **order of rows within each group** + * by one or more column values; + * * [updateGroups][GroupBy.updateGroups] — transforms each group into a new one; + * * [filter][GroupBy.filter] — filters the key–group pairs by the given predicate (similar to [DataFrame.filter], but with access to each [group][GroupedDataRow.group]); + * * [add][GroupBy.add] — adds a new column to each group. + * + * Each method returns a new [GroupBy] with updated group order or modified group content. + * + * For more information: {@include [DocumentationUrls.GroupByTransformation]} + */ + interface Transformation + + /** + * ### [GroupBy] reducing + * + * Each [GroupBy] group can be collapsed into a single row and then concatenated + * into a new [DataFrame] composed of these rows. + * + * Reducing is a specific case of [aggregation][Aggregation]. 
+ * + * First, choose a [GroupBy] reducing method: + * * [first][GroupBy.first], [last][GroupBy.last] — take the first or last row + * (optionally, the first or last one that satisfies a predicate) of each group; + * * [minBy][GroupBy.minBy] / [maxBy][GroupBy.maxBy] — take the row with the minimum or maximum value + * of the given [RowExpression] calculated on rows within each group; + * * [medianBy][GroupBy.medianBy] / [percentileBy][GroupBy.percentileBy] — take the row with + * the median or specific percentile value of the given [RowExpression] calculated on rows within each group; + * + * These functions return a [ReducedGroupBy], which can then be transformed into a new [DataFrame] + * containing the reduced rows (either original or transformed) using one of the following methods: + * * [concat][ReducedGroupBy.concat] — simply concatenates all reduced rows; + * * [values][ReducedGroupBy.values] — creates a [DataFrame] with new rows by transforming each reduced row + * using [ColumnsForAggregateSelectionDsl]; + * * [into][ReducedGroupBy.into] — creates a new column with values computed with [RowExpression] on each row, + * or a new [column group][org.jetbrains.kotlinx.dataframe.columns.ColumnGroup] + * containing each group reduced to a single row; + * + * Each method returns a new [DataFrame] that includes the grouping key columns, + * containing all unique grouping key values (or value combinations for multiple keys) + * along with their corresponding reduced rows. + * + * Check out [`GroupBy grammar`][Grammar]. + * + * For more information: {@include [DocumentationUrls.GroupByReducing]} + */ + interface Reducing + + /** + * ### [GroupBy] aggregation + * + * Each [GroupBy] can be directly transformed into a new [DataFrame] by applying one or more + * aggregation operations to its groups. + * + * Aggregation is a generalization of [reducing][Reducing]. 
+ * + * The following aggregation methods are available: + * * [concat][GroupBy.concat] — concatenates all rows from all groups into a single [DataFrame], + * without preserving grouping keys; + * * [concatWithKeys][GroupBy.concatWithKeys] — a variant of [concat][GroupBy.concat] that also includes + * grouping keys that were not present in the original [DataFrame]; + * * [into][GroupBy.into] — creates a new column containing a list of values computed with a [RowExpression] + * for each group, or a new [frame column][org.jetbrains.kotlinx.dataframe.columns.FrameColumn] + * containing the groups themselves; + * * [values][Grouped.values] — creates a [DataFrame] with new rows produced by transforming + * each group using [ColumnsForAggregateSelectionDsl]; + * * [count][Grouped.count] — returns a [DataFrame] containing the grouping key columns and an additional column + * with the number of rows in each corresponding group; + * * [aggregate][Grouped.aggregate] — performs a set of custom aggregations using [AggregateDsl], + * allowing you to compute one or more derived values per group; + * * [Various aggregation statistics][AggregationStatistics] — predefined shortcuts + * for common statistical aggregations such as [sum][Grouped.sum], [mean][Grouped.mean], + * [median][Grouped.median], and others. + * + * Each of these methods returns a new [DataFrame] that includes the grouping key columns + * (except for [concat][GroupBy.concat]) along with the columns of values aggregated + * from the corresponding groups. + * + * Check out [`GroupBy grammar`][Grammar]. + * + * For more information: {@include [DocumentationUrls.GroupByAggregation]} + */ + interface Aggregation + + /** + * ### [GroupBy] pivoting + * + * [GroupBy] can be pivoted with the [pivot][GroupBy.pivot] method. It will produce a [PivotGroupBy]. 
+ * + * @include [PivotGroupByDocs.CommonDescription] + */ + interface Pivoting +} + +/** {@set [SelectingColumns.OPERATION] [groupBy][groupBy]} */ +@ExcludeFromSources +private interface SetGroupByOperationArg + +/** + * {@include [GroupByDocs]} + * ### This `groupBy` Overload + */ +@ExcludeFromSources +private interface CommonGroupByDocs + +/** + * @include [CommonGroupByDocs] + * @include [SelectingColumns.Dsl.WithExample] {@include [SetGroupByOperationArg]} + * + * @param [moveToTop] Specifies whether nested grouping columns should be moved to the top level + * or kept inside a [ColumnGroup][org.jetbrains.kotlinx.dataframe.columns.ColumnGroup]. + * Defaults to `true`. + * @param [cols] The [Columns Selector][ColumnsSelector] that defines which columns are used + * as keys for grouping. + * @return A new [GroupBy] containing the unique combinations of values from the provided [key columns][cols], + * together with their corresponding groups of rows. */ @Refine @Interpretable("DataFrameGroupBy") @@ -39,6 +329,15 @@ public fun DataFrame.groupBy(moveToTop: Boolean = true, cols: ColumnsSele @AccessApiOverload public fun DataFrame.groupBy(vararg cols: KProperty<*>): GroupBy = groupBy { cols.toColumnSet() } +/** + * @include [CommonGroupByDocs] + * @include [SelectingColumns.ColumnNames.WithExample] {@include [SetGroupByOperationArg]} + * + * @param [cols] The [Column names][String] that define which columns are used + * as keys for grouping. + * @return A new [GroupBy] containing the unique combinations of values from the provided [key columns][cols], + * together with their corresponding groups of rows. 
+ */ public fun DataFrame.groupBy(vararg cols: String): GroupBy = groupBy { cols.toColumnSet() } @Deprecated(DEPRECATED_ACCESS_API) @@ -48,8 +347,54 @@ public fun DataFrame.groupBy(vararg cols: AnyColumnReference, moveToTop: // endregion +/** + * Groups the rows of this [Pivot] into [PivotGroupBy] + * based on the values in one or more specified [key columns][\columns]. + * + * Works like regular [DataFrame.groupBy] on pivot groups. + * + * Grouping keys can also be created inline: + * ```kotlin + * // Create a new column "newName" based on existing "oldName" values + * // and use it as a grouping key: + * pivot.groupBy { expr("newName") { oldName.drop(5) } } + * ``` + * + * @include [PivotGroupByDocs.CommonDescription] + */ +@ExcludeFromSources +private interface GroupByForPivotDocs + +/** + * {@include [GroupByForPivotDocs]} + * ### This `groupBy` Overload + */ +@ExcludeFromSources +private interface CommonGroupByForPivotDocs + // region Pivot +/** + * {@include [CommonGroupByForPivotDocs]} + * @include [SelectingColumns.Dsl] + * + * #### For example: + * + * `pivot.`{@get [OPERATION]}` { length `[and][ColumnsSelectionDsl.and]` age }` + * + * `pivot.`{@get [OPERATION]}` { `[cols][ColumnsSelectionDsl.cols]`(1..5) }` + * + * `pivot.`{@get [OPERATION]}` { `[colsOf][ColumnsSelectionDsl.colsOf]`<`[Double][Double]`>() }` + * + * {@include [SetGroupByOperationArg]} + * @param moveToTop Specifies whether nested grouping columns should be moved to the top level + * or kept inside a [ColumnGroup][org.jetbrains.kotlinx.dataframe.columns.ColumnGroup]. + * Defaults to `true`. + * @param [columns] The [Columns Selector][ColumnsSelector] that defines which columns are used + * as keys for grouping. + * @return A new [PivotGroupBy] that preserves the original [pivot] key columns + * and uses the provided columns as [groupBy] keys. 
+ */ public fun Pivot.groupBy(moveToTop: Boolean = true, columns: ColumnsSelector): PivotGroupBy = (this as PivotImpl).toGroupedPivot(moveToTop, columns) @@ -57,12 +402,36 @@ public fun Pivot.groupBy(moveToTop: Boolean = true, columns: ColumnsSelec @AccessApiOverload public fun Pivot.groupBy(vararg columns: AnyColumnReference): PivotGroupBy = groupBy { columns.toColumnSet() } +/** + * {@include [CommonGroupByForPivotDocs]} + * @include [SelectingColumns.ColumnNames] + * + * #### For example: + * + * `pivot.`{@get [OPERATION]}`("length", "age")` + * + * {@include [SetGroupByOperationArg]} + * @param [columns] The [Column names][String] that define which columns are used + * as keys for grouping. + * @return A new [PivotGroupBy] that preserves the original [pivot] key columns + * and uses the provided columns as [groupBy] keys. + */ public fun Pivot.groupBy(vararg columns: String): PivotGroupBy = groupBy { columns.toColumnSet() } @Deprecated(DEPRECATED_ACCESS_API) @AccessApiOverload public fun Pivot.groupBy(vararg columns: KProperty<*>): PivotGroupBy = groupBy { columns.toColumnSet() } +/** + * Groups the rows of this [Pivot] into a [PivotGroupBy] + * based on the values of all columns except the pivot key columns. + * For example, if a [DataFrame] has columns `"a"`, `"b"`, `"c"`, `"d"` and is pivoted by + * `"a"` and `"c"`, then this [Pivot] will be grouped by the remaining columns `"b"` and `"d"`. + * + * @include [PivotGroupByDocs.CommonDescription] + * @return A new [PivotGroupBy] that preserves the original [pivot] key columns + * and uses the remaining columns as [groupBy] keys. + */ public fun Pivot.groupByOther(): PivotGroupBy { val impl = this as PivotImpl val pivotColumns = df.getPivotColumnPaths(columns).toColumnSet() @@ -71,33 +440,125 @@ public fun Pivot.groupByOther(): PivotGroupBy { // endregion +/** + * A specialized lambda that provides a [GroupedDataRow] both as the receiver and as the argument (`this` and `it`) + * and produces a result of type [R]. 
+ */ public typealias GroupedRowSelector = GroupedDataRow.(GroupedDataRow) -> R +/** + * A specialized lambda that provides a [GroupedDataRow] both as the receiver and as the argument (`this` and `it`) + * and returns a [Boolean] value used for filtering. + */ public typealias GroupedRowFilter = GroupedRowSelector +/** + * A specialized form of [DataRow] representing a single row of a [GroupBy]. + * Each instance contains the key values and a reference to the corresponding [group]. + */ public interface GroupedDataRow : DataRow { + /** + * The [DataFrame] representing the group corresponding to the current key values. + */ public fun group(): DataFrame } -public val GroupedDataRow.group: DataFrame get() = group() +/** + * The [DataFrame] representing the group corresponding to the current key values. + */ +public val GroupedDataRow.group: DataFrame + get() = group() +/** + * An alternative representation of a [GroupBy.Entry], holding a key–group pair. + * + * @property key The key represented as a [DataRow]. + * @property group The [DataFrame] containing the rows belonging to this group. + */ public data class GroupWithKey(val key: DataRow, val group: DataFrame) +/** + * A dataframe-like structure that contains all unique combinations of key-values + * along with the corresponding groups of rows (each represented as a [DataFrame]). + * + * Consists of two main parts: + * * [groups] — represents the groups as a [FrameColumn], where each cell contains a [DataFrame] + * with the rows that belong to a specific group. + * * [keys] — represents the grouping keys as a [DataFrame], containing one column for each key column. + * Each row in [keys] corresponds to a group in [groups]. + * + * Together, the rows of [keys] and [groups] define one-to-one **key–group pairs**. + * + * @param G The schema of the groups (same as the schema of the original [DataFrame]). + * @param T The schema of the grouping keys. 
+ */ public interface GroupBy : Grouped { + /** + * A [FrameColumn] representing all groups of rows. + * Each cell contains a [DataFrame] with the subset of rows that share the same key values. + */ public val groups: FrameColumn + /** + * A [DataFrame] representing the grouping keys. + * Each column corresponds to a key column, and each row corresponds to a unique group. + */ public val keys: DataFrame + /** + * Creates a new [GroupBy] by transforming each group’s [DataFrame] + * using the provided [transform] function. + * + * @param [transform] A lambda that takes each group as a [DataFrame] + * (available both as a receiver and as a parameter) and returns a transformed [DataFrame]. + * @return A new [GroupBy] instance containing the transformed groups. + */ public fun updateGroups(transform: Selector, DataFrame>): GroupBy + /** + * Filters the rows of this [GroupBy] — that is, the key–group pairs — based on the specified [predicate]. + * + * The [predicate] is a [GroupedRowFilter], which behaves similarly to a [RowFilter] used in [DataFrame.filter], + * but also provides access to the [group][GroupedDataRow.group] in the current row. + * + * ### Example + * ```kotlin + * // Keep only key–group pairs where the "category" key equals "Engineer" + * // or where the group contains at least 5 rows + * gb.filter { category == "Engineer" || group.rowsCount() >= 5 } + * ``` + * + * @param [predicate] A [GroupedRowFilter] used to determine which groups should be retained. + * @return A new [GroupBy] containing only the key–group pairs that satisfy the [predicate]. + */ public fun filter(predicate: GroupedRowFilter): GroupBy + /** + * Converts this [GroupBy] into a [DataFrame]. + * + * Each row of the resulting [DataFrame] represents a unique key–group pair: + * a row from [keys] and its corresponding group of rows (as [DataFrame]). 
+ * + * If [groupedColumnName] is provided, the groups will be stored + * in a [FrameColumn] with that name; otherwise, a default name "group" is used. + * + * @param groupedColumnName The name of the column in which to store grouped data; + * if `null`, a default name will be used. + * @return A new [DataFrame] that includes the grouping key columns together + * with a [FrameColumn] containing the corresponding groups. + */ @Refine @Interpretable("GroupByToDataFrame") public fun toDataFrame(groupedColumnName: String? = null): DataFrame + /** + * Represents a single key–group pair in a [GroupBy]. + * + * @property key The key of the group, represented as a [DataRow]. + * @property group The [DataFrame] containing all rows that belong to the group. + */ public data class Entry(val key: DataRow, val group: DataFrame) public companion object { @@ -105,8 +566,34 @@ public interface GroupBy : Grouped { } } +/** + * Represents a dataframe-like structure with grouped values, offering aggregation capabilities. + */ public interface Grouped : Aggregatable +/** + * An intermediate class used in [`GroupBy` reducing][GroupByDocs.Reducing] operations. + * + * Serves as a transitional step between performing a reduction on groups + * and specifying how the resulting reduced rows should be represented + * in a new [DataFrame]. + * + * Available transformation methods: + * * [concat][ReducedGroupBy.concat] — concatenates all reduced rows into a single [DataFrame]; + * * [values][ReducedGroupBy.values] — creates a [DataFrame] with new rows by transforming each reduced row + * using [ColumnsForAggregateSelectionDsl]; + * * [into][ReducedGroupBy.into] — creates a new column with values computed using a [RowExpression] for each row, + * or a new [column group][org.jetbrains.kotlinx.dataframe.columns.ColumnGroup] + * containing each group reduced to a single row. 
+ * + * Each method returns a new [DataFrame] that includes the grouping key columns, + * containing all unique grouping key values (or value combinations for multiple keys) + * together with their corresponding reduced rows. + * + * See also: [`GroupBy grammar`][Grammar]. + * + * For more information, refer to: {@include [DocumentationUrls.GroupByReducing]} + */ public class ReducedGroupBy( @PublishedApi internal val groupBy: GroupBy, @PublishedApi internal val reducer: Selector, DataRow?>, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/pivot.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/pivot.kt index 286a40e5b7..5f056d6283 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/pivot.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/pivot.kt @@ -12,6 +12,7 @@ import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload import org.jetbrains.kotlinx.dataframe.columns.ColumnSet import org.jetbrains.kotlinx.dataframe.columns.ColumnsResolver import org.jetbrains.kotlinx.dataframe.columns.toColumnSet +import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls import org.jetbrains.kotlinx.dataframe.impl.aggregation.PivotGroupByImpl import org.jetbrains.kotlinx.dataframe.impl.aggregation.PivotImpl import org.jetbrains.kotlinx.dataframe.impl.aggregation.PivotInAggregateImpl @@ -275,6 +276,37 @@ internal fun Pivot.reduce(reducer: Selector, DataRow?>): internal inline fun Pivot.delegate(crossinline body: PivotGroupBy.() -> DataFrame): DataRow = body(groupBy { none() })[0] +/** + * TODO (#1536) + */ +internal interface PivotGroupByDocs { + + /** + * [PivotGroupBy] is a dataframe-like structure, combining [Pivot] and [GroupBy] + * and representing a matrix table with vertical [Pivot] groups (as columns) + * and horizontal [GroupBy] groups (as rows), and each cell + * represents a group corresponding both to [GroupBy] and [Pivot] key. 
+ * + * Reversed order of `pivot` and `groupBy` + * (i.e., [DataFrame.pivot] + [Pivot.groupBy] or [DataFrame.groupBy] + [GroupBy.pivot]) + * will produce the same result. + * + * [PivotGroupBy] can be [reduced][PivotGroupByDocs.Reducing] + * or [aggregated][PivotGroupByDocs.Aggregation]. + * + * Check out [PivotGroupBy Grammar][PivotGroupByDocs.Grammar]. + * + * For more information: {@include [DocumentationUrls.PivotGroupBy]} + */ + interface CommonDescription + + interface Grammar + + interface Reducing + + interface Aggregation +} + public interface PivotGroupBy : Aggregatable { public fun aggregate(separate: Boolean = false, body: AggregateBody): DataFrame diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt index e6a3882281..12a74b9bf1 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt @@ -140,4 +140,19 @@ internal interface DocumentationUrls { /** [See `rename` on the documentation website.]({@include [Url]}/rename.html) */ interface Rename + + /** [See `groupBy` on the documentation website.]({@include [Url]}/groupby.html) */ + interface GroupBy + + /** [See "`GroupBy` Transformation" on the documentation website.]({@include [Url]}/groupby.html#transformation) */ + interface GroupByTransformation + + /** [See "`GroupBy` Reducing" on the documentation website.]({@include [Url]}/groupby.html#reducing) */ + interface GroupByReducing + + /** [See "`GroupBy` Aggregation" on the documentation website.]({@include [Url]}/groupby.html#aggregation) */ + interface GroupByAggregation + + /** [See "`pivot` + `groupBy`" on the documentation website.]({@include [Url]}/groupby.html#pivot-groupby) */ + interface PivotGroupBy } diff --git a/docs/StardustDocs/topics/groupBy.md 
b/docs/StardustDocs/topics/groupBy.md index 3844ae5b0f..5055d2d7a2 100644 --- a/docs/StardustDocs/topics/groupBy.md +++ b/docs/StardustDocs/topics/groupBy.md @@ -142,7 +142,7 @@ df.groupBy { name }.concat() -## Aggregation + To compute one or several [statistics](summaryStatistics.md) per every group of `GroupBy` use `aggregate` function. Its body will be executed for every data group and has a receiver of type [`DataFrame`](DataFrame.md) that represents current data group being aggregated.