Skip to content

Commit

Permalink
Merge pull request #1082 from Kotlin/dynamic_df_builder
Browse files Browse the repository at this point in the history
DynamicDataFrameBuilder improvements
  • Loading branch information
AndreiKingsley authored Mar 5, 2025
2 parents e760eea + a65a5c9 commit a84c30c
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 26 deletions.
3 changes: 3 additions & 0 deletions core/api/core.api
Original file line number Diff line number Diff line change
Expand Up @@ -2200,7 +2200,10 @@ public final class org/jetbrains/kotlinx/dataframe/api/DuplicateKt {

public final class org/jetbrains/kotlinx/dataframe/api/DynamicDataFrameBuilder {
public fun <init> ()V
public fun <init> (Z)V
public synthetic fun <init> (ZILkotlin/jvm/internal/DefaultConstructorMarker;)V
public final fun add (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Ljava/lang/String;
public final fun get (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public final fun toDataFrame ()Lorg/jetbrains/kotlinx/dataframe/DataFrame;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,29 +413,78 @@ public class DataFrameBuilder(private val header: List<String>) {
}

/**
* Helper class for implementing operations when column names can be potentially duplicated.
* For example, operations involving multiple dataframes, computed columns or parsing some third-party data
* A builder class for dynamically constructing a DataFrame with provided columns.
* Allows adding columns manually while automatically handling duplicate column names by assigning unique names.
*
* @property checkDuplicateValues Whether to check for duplicate columns (with identical names and values)
* when adding new columns. `true` by default.
*/
public class DynamicDataFrameBuilder {
private var cols: MutableList<AnyCol> = mutableListOf()
public class DynamicDataFrameBuilder(private val checkDuplicateValues: Boolean = true) {
private var cols: MutableMap<String, AnyCol> = mutableMapOf()
private val generator = ColumnNameGenerator()

/**
* Adds a column to the builder, ensuring its name is unique.
*
* - If a column with the same name already exists, the new column is renamed to a unique name.
* - If [checkDuplicateValues] is `true`, the method checks whether the new column has identical values
* to an existing column with the same name. If the values match, the column is not added.
*
* @param col The column to add to the DataFrame builder.
* @return The final unique name assigned to the column.
*/
// NOTE(review): this hunk looks like a diff rendering with the +/- markers stripped, so a replaced line and its
// replacement sit next to each other (e.g. `cols.add(renamed)` followed by `cols.put(uniqueName, renamed)`).
// Confirm against the real file before treating it as compilable code.
public fun add(col: AnyCol): String {
val uniqueName = if (col.name().isEmpty()) {
val originalName = col.name()
// Duplicate short-circuit: only consulted when the flag is set and the generator already contains this name.
if (checkDuplicateValues && generator.contains(originalName)) {
// The column stored under the original name is `==` to the incoming one: nothing is added, the name is reused.
if (cols[originalName] == col) return originalName
}
// An empty name falls back to UNNAMED_COLUMN_PREFIX; in both branches the generator supplies the final name.
val uniqueName = if (originalName.isEmpty()) {
generator.addUnique(UNNAMED_COLUMN_PREFIX)
} else {
generator.addUnique(col.name())
generator.addUnique(originalName)
}
// Rename only when the generator returned something different from the column's own name.
val renamed = if (uniqueName != col.name()) {
val renamed = if (uniqueName != originalName) {
col.rename(uniqueName)
} else {
col
}
cols.add(renamed)
// Stored under the final name, which is also what `get` looks columns up by.
cols.put(uniqueName, renamed)
return uniqueName
}

public fun toDataFrame(): DataFrame<*> = dataFrameOf(cols)
/**
 * Builds a column out of [values] and hands it to the column-based `add` overload.
 *
 * The column is created with [name] and `Infer.Type`, then goes through the same path as any other
 * column passed to the builder, so the name-uniqueness handling and the [checkDuplicateValues]
 * behaviour of that overload apply here as well.
 *
 * @param T The element type of [values].
 * @param values The values that make up the new column.
 * @param name The requested column name; defaults to an empty string.
 * @return The name returned by the column-based `add` overload for the created column.
 */
public inline fun <reified T> add(values: Iterable<T>, name: String = ""): String {
    val column = values.toColumn(name, Infer.Type)
    return add(column)
}

/**
 * Looks up a column stored in the builder.
 *
 * @param column The name under which the column is stored.
 * @return The stored column, or `null` when the builder holds nothing under that name.
 */
public fun get(column: String): AnyCol? {
    return cols[column]
}

/**
 * Produces a `DataFrame` from the columns currently held by this builder.
 *
 * @return A `DataFrame` built from the builder's stored columns.
 */
public fun toDataFrame(): DataFrame<*> {
    val storedColumns = cols.values
    return storedColumns.toDataFrame()
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,39 @@ class ConstructorsTests {
// Two columns that share a name but not their values must both end up in the resulting frame,
// the second one under a suffixed name.
// NOTE(review): rendered from a diff without +/- markers; the `column` based lines appear to be the
// pre-change versions of the `columnA` / `columnB` lines — confirm against the real file.
@Test
fun `duplicated name`() {
val builder = DynamicDataFrameBuilder()
val column by columnOf(1, 2, 3)
builder.add(column)
builder.add(column)
val columnName = "columnName"
val columnA = columnOf(1, 2, 3) named columnName
val columnB = columnOf(4, 5, 6) named columnName
builder.add(columnA)
builder.add(columnB)
val df = builder.toDataFrame()
// Both columns are kept ...
df.columnsCount() shouldBe 2
df.columnNames() shouldBe listOf(column.name(), "${column.name()}1")
// ... and the second one is expected under the original name with a "1" suffix.
df.columnNames() shouldBe listOf(columnName, "${columnName}1")
}

// The name returned by `add` for a renamed column must be usable to fetch that column back.
@Test
fun `get by new name`() {
    val columnName = "columnName"
    val first = columnOf(1, 2, 3) named columnName
    val second = columnOf(4, 5, 6) named columnName

    val builder = DynamicDataFrameBuilder()
    builder.add(first)
    val assignedName = builder.add(second)

    val stored = builder.get(assignedName)!!
    stored.values shouldBe second.values
}

// Adding the very same column a second time must not create a third column in the result.
@Test
fun `duplicated column`() {
    val columnName = "columnName"
    val columnA = columnOf(1, 2, 3) named columnName
    val columnB = columnOf(4, 5, 6) named columnName

    val builder = DynamicDataFrameBuilder().apply {
        add(columnA)
        add(columnB)
        add(columnA)
    }

    val result = builder.toDataFrame()
    result.columnsCount() shouldBe 2
    result.columnNames() shouldBe listOf(columnName, "${columnName}1")
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,29 +413,79 @@ public class DataFrameBuilder(private val header: List<String>) {
}

/**
* Helper class for implementing operations when column names can be potentially duplicated.
* For example, operations involving multiple dataframes, computed columns or parsing some third-party data
* A builder class for dynamically constructing a DataFrame with provided columns.
* Allows adding columns manually while automatically handling duplicate column names by assigning unique names.
*
* @property checkDuplicateValues Whether to check for duplicate columns (with identical names and values)
* when adding new columns. If `true`, a new column is not added if an identical one is already in the builder.
* `true` by default.
*/
public class DynamicDataFrameBuilder {
private var cols: MutableList<AnyCol> = mutableListOf()
public class DynamicDataFrameBuilder(private val checkDuplicateValues: Boolean = true) {
private var cols: MutableMap<String, AnyCol> = mutableMapOf()
private val generator = ColumnNameGenerator()

/**
* Adds a column to the builder, ensuring its name is unique.
*
* - If a column with the same name already exists, the new column is renamed to a unique name.
* - If [checkDuplicateValues] is `true`, the method checks whether the new column has identical values
* to an existing column with the same name. If the values match, the column is not added.
*
* @param col The column to add to the DataFrame builder.
* @return The final unique name assigned to the column.
*/
// NOTE(review): this hunk looks like a diff rendering with the +/- markers stripped, so a replaced line and its
// replacement sit next to each other (e.g. `cols.add(renamed)` followed by `cols.put(uniqueName, renamed)`).
// Confirm against the real file before treating it as compilable code.
public fun add(col: AnyCol): String {
val uniqueName = if (col.name().isEmpty()) {
val originalName = col.name()
// Duplicate short-circuit: only consulted when the flag is set and the generator already contains this name.
if (checkDuplicateValues && generator.contains(originalName)) {
// The column stored under the original name is `==` to the incoming one: nothing is added, the name is reused.
if (cols[originalName] == col) return originalName
}
// An empty name falls back to UNNAMED_COLUMN_PREFIX; in both branches the generator supplies the final name.
val uniqueName = if (originalName.isEmpty()) {
generator.addUnique(UNNAMED_COLUMN_PREFIX)
} else {
generator.addUnique(col.name())
generator.addUnique(originalName)
}
// Rename only when the generator returned something different from the column's own name.
val renamed = if (uniqueName != col.name()) {
val renamed = if (uniqueName != originalName) {
col.rename(uniqueName)
} else {
col
}
cols.add(renamed)
// Stored under the final name, which is also what `get` looks columns up by.
cols.put(uniqueName, renamed)
return uniqueName
}

public fun toDataFrame(): DataFrame<*> = dataFrameOf(cols)
/**
 * Builds a column out of [values] and hands it to the column-based `add` overload.
 *
 * The column is created with [name] and `Infer.Type`, then goes through the same path as any other
 * column passed to the builder, so the name-uniqueness handling and the [checkDuplicateValues]
 * behaviour of that overload apply here as well.
 *
 * @param T The element type of [values].
 * @param values The values that make up the new column.
 * @param name The requested column name; defaults to an empty string.
 * @return The name returned by the column-based `add` overload for the created column.
 */
public inline fun <reified T> add(values: Iterable<T>, name: String = ""): String {
    val column = values.toColumn(name, Infer.Type)
    return add(column)
}

/**
 * Looks up a column stored in the builder.
 *
 * @param column The name under which the column is stored.
 * @return The stored column, or `null` when the builder holds nothing under that name.
 */
public fun get(column: String): AnyCol? {
    return cols[column]
}

/**
 * Produces a [DataFrame] from the columns currently held by this [DynamicDataFrameBuilder].
 *
 * @return A [DataFrame] built from the builder's stored columns.
 */
public fun toDataFrame(): DataFrame<*> {
    val storedColumns = cols.values
    return storedColumns.toDataFrame()
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,39 @@ class ConstructorsTests {
// Two columns that share a name but not their values must both end up in the resulting frame,
// the second one under a suffixed name.
// NOTE(review): rendered from a diff without +/- markers; the `column` based lines appear to be the
// pre-change versions of the `columnA` / `columnB` lines — confirm against the real file.
@Test
fun `duplicated name`() {
val builder = DynamicDataFrameBuilder()
val column by columnOf(1, 2, 3)
builder.add(column)
builder.add(column)
val columnName = "columnName"
val columnA = columnOf(1, 2, 3) named columnName
val columnB = columnOf(4, 5, 6) named columnName
builder.add(columnA)
builder.add(columnB)
val df = builder.toDataFrame()
// Both columns are kept ...
df.columnsCount() shouldBe 2
df.columnNames() shouldBe listOf(column.name(), "${column.name()}1")
// ... and the second one is expected under the original name with a "1" suffix.
df.columnNames() shouldBe listOf(columnName, "${columnName}1")
}

// The name returned by `add` for a renamed column must be usable to fetch that column back.
@Test
fun `get by new name`() {
    val columnName = "columnName"
    val first = columnOf(1, 2, 3) named columnName
    val second = columnOf(4, 5, 6) named columnName

    val builder = DynamicDataFrameBuilder()
    builder.add(first)
    val assignedName = builder.add(second)

    val stored = builder.get(assignedName)!!
    stored.values shouldBe second.values
}

// Adding the very same column a second time must not create a third column in the result.
@Test
fun `duplicated column`() {
    val columnName = "columnName"
    val columnA = columnOf(1, 2, 3) named columnName
    val columnB = columnOf(4, 5, 6) named columnName

    val builder = DynamicDataFrameBuilder().apply {
        add(columnA)
        add(columnB)
        add(columnA)
    }

    val result = builder.toDataFrame()
    result.columnsCount() shouldBe 2
    result.columnNames() shouldBe listOf(columnName, "${columnName}1")
}

@Test
Expand Down

0 comments on commit a84c30c

Please sign in to comment.