diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 20f2560153..69075f2f6c 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -36,6 +36,7 @@ import org.jetbrains.kotlinx.dataframe.impl.api.toLocalDateTime import org.jetbrains.kotlinx.dataframe.impl.api.toLocalTime import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl import org.jetbrains.kotlinx.dataframe.impl.headPlusArray +import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import org.jetbrains.kotlinx.dataframe.io.toDataFrame import java.math.BigDecimal import java.math.BigInteger @@ -226,8 +227,8 @@ public fun DataColumn.convertToDouble(locale: Locale? = null): DataColum * @include [DataColumnStringConvertToDoubleDoc] * @param nullStrings a set of strings that should be treated as `null` values. * The default in [DataFrame.parser][DataFrame.Companion.parser] is ["null", "NULL", "NA", "N/A"]. - * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser. - * The default in [DataFrame.parser][DataFrame.Companion.parser] is `false` for now. + * @param useFastDoubleParser whether to use [FastDoubleParser]. + * The default in [DataFrame.parser][DataFrame.Companion.parser] is `true`. */ @JvmName("convertToDoubleFromString") public fun DataColumn.convertToDouble( @@ -246,8 +247,8 @@ public fun DataColumn.convertToDouble(locale: Locale? = null): DataColu * @include [DataColumnStringConvertToDoubleDoc] * @param nullStrings a set of strings that should be treated as `null` values. * The default in [DataFrame.parser][DataFrame.Companion.parser] is ["null", "NULL", "NA", "N/A"]. - * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser. - * The default in [DataFrame.parser][DataFrame.Companion.parser] is `false` for now. 
+ * @param useFastDoubleParser whether to use [FastDoubleParser]. + * The default in [DataFrame.parser][DataFrame.Companion.parser] is `true`. */ @JvmName("convertToDoubleFromStringNullable") public fun DataColumn.convertToDouble( diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index c208e2a4ac..b68f234a80 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -11,6 +11,7 @@ import org.jetbrains.kotlinx.dataframe.impl.api.Parsers import org.jetbrains.kotlinx.dataframe.impl.api.StringParser import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl +import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import org.jetbrains.kotlinx.dataframe.io.readCSV import org.jetbrains.kotlinx.dataframe.typeClass import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS @@ -45,6 +46,12 @@ public fun DataFrame.parse(vararg columns: ColumnReference, options public fun DataFrame.parse(vararg columns: KProperty, options: ParserOptions? = null): DataFrame = parse(options) { columns.toColumnSet() } +/** + * Global counterpart of [ParserOptions]. + * Settings changed here will affect the defaults for all parsing operations. + * + * The default values are set by [Parsers.resetToDefault]. + */ public interface GlobalParserOptions { public fun addDateTimePattern(pattern: String) @@ -54,7 +61,7 @@ public interface GlobalParserOptions { /** This function can be called to skip some types. Parsing will be attempted for all other types. */ public fun addSkipType(type: KType) - /** Whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. */ + /** Whether to use [FastDoubleParser], defaults to `true`. Please report any issues you encounter. 
*/ public var useFastDoubleParser: Boolean public fun resetToDefault() @@ -91,7 +98,7 @@ public interface GlobalParserOptions { * `["null", "NULL", "NA", "N/A"]`. * @param skipTypes a set of types that should be skipped during parsing. Parsing will be attempted for all other types. * By default, it's an empty set. To skip all types except a specified one, use [convertTo] instead. - * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. + * @param useFastDoubleParser whether to use [FastDoubleParser], defaults to `true`. Please report any issues you encounter. */ public class ParserOptions( public val locale: Locale? = null, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 239c22d5c4..d2da7201f7 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -25,6 +25,7 @@ import org.jetbrains.kotlinx.dataframe.api.isColumnGroup import org.jetbrains.kotlinx.dataframe.api.isFrameColumn import org.jetbrains.kotlinx.dataframe.api.isSubtypeOf import org.jetbrains.kotlinx.dataframe.api.map +import org.jetbrains.kotlinx.dataframe.api.parser import org.jetbrains.kotlinx.dataframe.api.to import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion import org.jetbrains.kotlinx.dataframe.columns.size @@ -47,6 +48,7 @@ import java.time.format.DateTimeFormatterBuilder import java.time.temporal.Temporal import java.time.temporal.TemporalQuery import java.util.Locale +import kotlin.properties.Delegates import kotlin.reflect.KClass import kotlin.reflect.KType import kotlin.reflect.full.withNullability @@ -114,6 +116,13 @@ internal class StringParserWithFormat( } } +/** + * Central implementation for [GlobalParserOptions]. 
+ * + * Can be obtained by a user by calling [DataFrame.parser][DataFrame.Companion.parser]. + * + * Defaults are set by [resetToDefault]. + */ internal object Parsers : GlobalParserOptions { private val formatters: MutableList = mutableListOf() @@ -140,7 +149,7 @@ internal object Parsers : GlobalParserOptions { skipTypesSet.add(type) } - override var useFastDoubleParser: Boolean = false + override var useFastDoubleParser by Delegates.notNull() private var _locale: Locale? = null @@ -165,7 +174,7 @@ internal object Parsers : GlobalParserOptions { .toFormatter() .let { formatters.add(it) } - useFastDoubleParser = false + useFastDoubleParser = true _locale = null nullStrings.addAll(listOf("null", "NULL", "NA", "N/A")) } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt index 47361dd0a5..815b404cb8 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt @@ -5,7 +5,6 @@ import ch.randelshofer.fastdoubleparser.NumberFormatSymbols import io.github.oshai.kotlinlogging.KotlinLogging import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.ParserOptions -import org.jetbrains.kotlinx.dataframe.api.parser import org.jetbrains.kotlinx.dataframe.impl.api.Parsers import java.nio.charset.Charset import java.text.DecimalFormatSymbols @@ -15,19 +14,24 @@ import java.util.Locale private val logger = KotlinLogging.logger {} -// (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales -private val INFINITIES = arrayOf("∞", "inf", "infinity", "infty") -private val PLUS_INFINITIES = INFINITIES.map { "+$it" } -private val MINUS_INFINITIES = INFINITIES.map { "-$it" } -private val NANS = arrayOf("nan", "na", "n/a") - /** * Parses a [String]/[CharSequence], [CharArray], 
or [ByteArray] into a [Double]. * - * If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with an _EXPERIMENTAL_ - * fast double parser, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser). + * If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with the + * fast double parser library, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser). * If not, or if it fails, it will use [NumberFormat] to parse the input. * + * The [locale][locale] used by the double parser is defined like: + * + * [parserOptions][parserOptions]`?.`[locale][ParserOptions.locale]` ?: `[Parsers.locale][Parsers.locale]` ?: `[Locale.getDefault()][Locale.getDefault] + * + * [FastDoubleParser] has a fallback mechanism; in practice, this means it can recognize symbols and notations + * of any locale recognized by Java as long as that symbol does not conflict with the given locale. + * + * For example, if your locale uses ',' as decimal separator, it will NOT recognize ',' as thousands separator, + * but it will recognize ' ', '٬', '_', ' ', etc. as such. + * The same holds for characters like "e", "inf", "×10^", "NaN", etc. + * * Public, so it can be used in other modules. * * @param parserOptions can be supplied to configure the parser. @@ -41,106 +45,103 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? 
= null) private val useFastDoubleParser = parserOptions?.useFastDoubleParser ?: Parsers.useFastDoubleParser private val locale = parserOptions?.locale ?: Parsers.locale - private val fallbackLocale = Locale.ROOT - - private val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale) - private val fallbackDecimalFormatSymbols = DecimalFormatSymbols.getInstance(fallbackLocale) private val parser = ConfigurableDoubleParser(/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true) /** * Sets up the [NumberFormatSymbols] for the [ConfigurableDoubleParser] based on - * [localDecimalFormatSymbols] with fallbacks from [fallbackDecimalFormatSymbols]. + * the [locale] with fallbacks from all other locales. * * Fallback characters/strings are only added if they're not clashing with local characters/strings. */ - private fun setupNumberFormatSymbols(): NumberFormatSymbols { - // collect all chars and strings that are locale-specific such that we can check whether - // fallback chars and strings are safe to add - val localChars = with(localDecimalFormatSymbols) { - buildSet { - add(decimalSeparator.lowercaseChar()) - add(groupingSeparator.lowercaseChar()) - add(minusSign.lowercaseChar()) - add('+') - add(zeroDigit.lowercaseChar()) + private fun setupNumberFormatSymbols(): NumberFormatSymbols = + numberFormatSymbolsCache.getOrPut(locale) { + val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale) + + // collect all chars and strings that are locale-specific such that we can check whether + // fallback chars and strings are safe to add + val localChars = with(localDecimalFormatSymbols) { + buildSet { + add(decimalSeparator.lowercaseChar()) + add(groupingSeparator.lowercaseChar()) + add(minusSign.lowercaseChar()) + add('+') + // we don't include zeroDigit here, for notations like ×10^ + } } - } - val localStrings = with(localDecimalFormatSymbols) { - buildSet { - add(exponentSeparator.lowercase()) - add(infinity.lowercase()) - 
add(naN.lowercase()) + val localStrings = with(localDecimalFormatSymbols) { + buildSet { + add(exponentSeparator.lowercase()) + add(infinity.lowercase()) + add(naN.lowercase()) + } } - } - /** - * Builds a set with the specified char from [localDecimalFormatSymbols] and - * its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so. - * [additionals] will be added to the set too, when they're safe to add. - */ - fun ((DecimalFormatSymbols) -> Char).fromLocalWithFallBack(vararg additionals: Char): Set = - buildSet { - val getChar = this@fromLocalWithFallBack - val char = getChar(localDecimalFormatSymbols).lowercaseChar() - add(char) - - // add fallback char if it's safe to do so - val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar() - if (fallbackChar !in localChars && !localStrings.any { fallbackChar in it }) { - add(fallbackChar) - } + /** + * Builds a set with the specified char from [this] and + * [fallbackChars] will be added to the set too, when they're safe to add. + */ + fun Char.withFallback(fallbackChars: CharArray): Set = + buildSet { + val char = this@withFallback.lowercaseChar() + add(char) - // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead. - if (char.isWhitespace()) add(' ') + // Treat NBSP and other whitespace characters the same. + if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable()) - // add additional chars if needed - for (additional in additionals) { - val lowercase = additional.lowercaseChar() - if (lowercase !in localChars && !localStrings.any { lowercase in it }) { - add(lowercase) + // add fallback chars if needed + for (char in fallbackChars) { + val lowercase = char.lowercaseChar() + if (lowercase !in localChars && !localStrings.any { lowercase in it }) { + add(lowercase) + } + + // Treat NBSP and other whitespace characters the same. 
+ if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable()) } } - } - /** - * Builds a set with the specified string from [localDecimalFormatSymbols] and - * its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so. - * [additionals] will be added to the set too, when they're safe to add. - */ - fun ((DecimalFormatSymbols) -> String).fromLocalWithFallBack(vararg additionals: String): Set = - buildSet { - val getString = this@fromLocalWithFallBack - val string = getString(localDecimalFormatSymbols).lowercase() - add(string) - - // add fallback string if it's safe to do so - val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase() - if (!fallbackString.any { it in localChars } && fallbackString !in localStrings) { - add(fallbackString) - } + /** + * Builds a set with the specified string from [this] and + * [fallbackStrings] will be added to the set too, when they're safe to add. + */ + fun String.withFallback(fallbackStrings: Array): Set = + buildSet { + val string = this@withFallback.lowercase() + add(string) + + // Treat NBSP and other whitespace characters the same. + if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() }) - // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead. - if (string.isBlank()) add(" ") + // add fallback strings if needed + for (string in fallbackStrings) { + val lowercase = string.lowercase() + if (!lowercase.any { it in localChars } && lowercase !in localStrings) { + add(lowercase) + } - // add additional strings if needed - for (additional in additionals) { - val lowercase = additional.lowercase() - if (!lowercase.any { it in localChars } && lowercase !in localStrings) { - add(lowercase) + // Treat NBSP and other whitespace characters the same. 
+ if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() }) } } - } - return NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols) - .withPlusSign(setOf('+')) - .withDecimalSeparator(DecimalFormatSymbols::getDecimalSeparator.fromLocalWithFallBack()) - .withGroupingSeparator(DecimalFormatSymbols::getGroupingSeparator.fromLocalWithFallBack()) - .withExponentSeparator(DecimalFormatSymbols::getExponentSeparator.fromLocalWithFallBack()) - .withMinusSign(DecimalFormatSymbols::getMinusSign.fromLocalWithFallBack()) - .withInfinity(DecimalFormatSymbols::getInfinity.fromLocalWithFallBack(*INFINITIES)) - .withNaN(DecimalFormatSymbols::getNaN.fromLocalWithFallBack(*NANS)) - } + NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols) + .withPlusSign( + setOf('+'), + ).withDecimalSeparator( + localDecimalFormatSymbols.decimalSeparator.withFallback(DECIMAL_SEPARATORS), + ).withGroupingSeparator( + localDecimalFormatSymbols.groupingSeparator.withFallback(GROUPING_SEPARATORS), + ).withExponentSeparator( + localDecimalFormatSymbols.exponentSeparator.withFallback(EXPONENTS), + ).withMinusSign( + localDecimalFormatSymbols.minusSign.withFallback(MINUS_SIGNS), + ).withInfinity( + localDecimalFormatSymbols.infinity.withFallback(INFINITIES), + ).withNaN( + localDecimalFormatSymbols.naN.withFallback(NANS), + ) + } /** Fallback method for parsing doubles. */ private fun String.parseToDoubleOrNullFallback(): Double? = @@ -152,7 +153,7 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null) in NANS -> Double.NaN else -> { - // not thread safe; must be created here + // NumberFormat is not thread safe; must be created in the function body val numberFormat = NumberFormat.getInstance(locale) val parsePosition = ParsePosition(0) val result = numberFormat.parse(this, parsePosition)?.toDouble() @@ -235,4 +236,49 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? 
= null) } return String(chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback() } + + /** + * Here we store all possible decimal format symbols of all locales on the system. + * These will be used as fallbacks for the selected locale. + * They are only added by [withFallback] if they don't interfere with symbols already in the provided [locale] + * (so ',' is not added as grouping separator if '.' is already the locale's decimal separator). + */ + internal companion object { + private val allDecimalFormatSymbols by lazy { + Locale.getAvailableLocales().map { DecimalFormatSymbols.getInstance(it) } + } + val MINUS_SIGNS by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.minusSign }.toCharArray() + } + val INFINITIES by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.infinity } + .plus(arrayOf("∞", "inf", "infinity", "infty")) + .toTypedArray() + } + val PLUS_INFINITIES by lazy { INFINITIES.map { "+$it" }.toTypedArray() } + val MINUS_INFINITIES by lazy { + INFINITIES.flatMap { inf -> MINUS_SIGNS.map { min -> min + inf } }.toTypedArray() + } + val NANS by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.naN } + .plus(arrayOf("nan", "na", "n/a")) + .toTypedArray() + } + val WHITE_SPACES = charArrayOf(' ', '\u00A0', '\u2009', '\u202F', '\t') + val GROUPING_SEPARATORS by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.groupingSeparator } + .plus(arrayOf('\'', '˙', *WHITE_SPACES.toTypedArray())) + .toCharArray() + } + val DECIMAL_SEPARATORS by lazy { + allDecimalFormatSymbols.flatMapTo(mutableSetOf()) { + listOfNotNull(it.decimalSeparator, it.monetaryDecimalSeparator) + }.plus(arrayOf('·', '⎖')) + .toCharArray() + } + val EXPONENTS by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.exponentSeparator }.toTypedArray() + } + val numberFormatSymbolsCache = mutableMapOf() + } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt 
b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 64d2ced7b1..553fac5961 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -145,9 +145,9 @@ class ParserTests { parsed.toList() shouldBe listOf(1, 2, null, 3, null, null, 4.0, 5.0) } - @Test // This does not yet use fastDoubleParser! + @Test fun `converting String to Double in different locales`() { - val currentLocale = Locale.getDefault() + val systemLocale = Locale.getDefault() try { // Test 45 behaviour combinations: @@ -157,11 +157,14 @@ class ParserTests { val columnMixed = columnOf("12.345", "67,890") // * // (3 locales as converting parameter + original converting + original converting to nullable) - val parsingLocaleNotDefined: Locale? = null + val parsingLocaleNotDefined: Locale? = null // takes parserOptions.locale ?: Locale.getDefault() + // uses dot as decimal separator, comma as grouping separator val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US") + // uses comma as decimal separator, NBSP as grouping separator val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU") // * // 3 system locales + // -------------------------------------------------------------------------------- Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) @@ -181,9 +184,13 @@ class ParserTests { columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) - shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + // uses fallback mechanism + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) - shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + // uses fallback mechanism + 
columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) + + // -------------------------------------------------------------------------------- Locale.setDefault(Locale.forLanguageTag("en-US")) @@ -203,33 +210,246 @@ class ParserTests { columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) - shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + // uses fallback mechanism + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) - shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) + + // -------------------------------------------------------------------------------- Locale.setDefault(Locale.forLanguageTag("ru-RU")) columnDot.convertTo() shouldBe columnOf(12.345, 67.89) columnComma.convertTo() shouldBe columnOf(12.345, 67.89) - columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) + // uses fallback mechanism + columnMixed.convertTo() shouldBe columnOf(12.345, 67.89) columnDot.convertTo() shouldBe columnOf(12.345, 67.89) columnComma.convertTo() shouldBe columnOf(12.345, 67.89) - columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) + // uses fallback mechanism + columnMixed.convertTo() shouldBe columnOf(12.345, 67.89) columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) - columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67890.0) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe 
columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) - shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + // uses fallback mechanism + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) + } finally { + Locale.setDefault(systemLocale) + } + } + + @Test + fun `converting String to Double in different locales with NBSP grouping`() { + val systemLocale = Locale.getDefault() + try { + // Test 45 behaviour combinations: + + // 3 source columns + val columnDot = columnOf("123 456.789", "0 987 654.321") + val columnComma = columnOf("123 456,789", "0 987 654,321") + val columnMixed = columnOf( + "123 456.789", + "0'987 654,321", // note the use of two different thousands grouping characters + ) + // * + // (3 locales as converting parameter + original converting + original converting to nullable) + val parsingLocaleNotDefined: Locale? 
= null // takes parserOptions.locale ?: Locale.getDefault() + // uses dot as decimal separator, comma as grouping separator + val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US") + // uses comma as decimal separator, NBSP as grouping separator + val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU") + // * + // 3 system locales + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654_321.0) + + // uses fallback mechanism + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + + // -------------------------------------------------------------------------------- + + 
Locale.setDefault(Locale.forLanguageTag("en-US")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654_321.0) + + // uses fallback mechanism + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("ru-RU")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + 
columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + // parses correctly but may be surprising + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654_321.0) + + // uses fallback mechanism + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + } finally { + Locale.setDefault(systemLocale) + } + } + + @Test + fun `converting String to Double in different locales with comma grouping`() { + val systemLocale = Locale.getDefault() + try { + // Test 45 behaviour combinations: + + // 3 source columns + val columnDot = columnOf("123,456.789", "0,987,654.321") + val columnComma = columnOf("123.456,789", "0.987.654,321") + val columnMixed = columnOf( + "123,456.789", + "0'987.654,321", // note the use of two different thousands grouping characters + ) + // * + // (3 locales as converting parameter + original converting + original converting to nullable) + val parsingLocaleNotDefined: Locale? 
= null // takes parserOptions.locale ?: Locale.getDefault() + val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US") + val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("nl-NL") + // * + // 3 system locales + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertTo() } + shouldThrow { columnMixed.convertTo() } + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertTo() } + shouldThrow { columnMixed.convertTo() } + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertToDouble(parsingLocaleNotDefined) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleNotDefined) } + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertToDouble(parsingLocaleUsesDot) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesDot) } + + shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("en-US")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertTo() } + shouldThrow { columnMixed.convertTo() } + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertTo() } + shouldThrow { columnMixed.convertTo() } + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { 
columnComma.convertToDouble(parsingLocaleNotDefined) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleNotDefined) } + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertToDouble(parsingLocaleUsesDot) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesDot) } + + shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("nl-NL")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertTo() } + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertTo() } + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertToDouble(parsingLocaleNotDefined) } + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertToDouble(parsingLocaleUsesDot) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesDot) } + + shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } } finally { - Locale.setDefault(currentLocale) + Locale.setDefault(systemLocale) } } diff --git 
a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DelimParams.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DelimParams.kt index aa75feb6a2..c0f39c79c3 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DelimParams.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DelimParams.kt @@ -138,8 +138,6 @@ internal object DelimParams { * ([DataFrame.parser][DataFrame.Companion.parser]) will be queried. * * The only exceptions are: - * - [useFastDoubleParser][ParserOptions.useFastDoubleParser], which will default to `true`, - * regardless of the global setting. * - [nullStrings][ParserOptions.nullStrings], which, if `null`, * will take the global setting + {@include [DefaultNullStringsContentLink]}. * - [skipTypes][ParserOptions.skipTypes], which will always add [typesDeephavenAlreadyParses] to diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt index bf37bc86f3..dec844836c 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt @@ -112,11 +112,7 @@ internal fun readDelimImpl( ): DataFrame<*> { // set up the csv specs val csvSpecs = with(CsvSpecs.builder()) { - // turn on fast double parser if not explicitly set regardless of the global parser options - @Suppress("NullableBooleanElvis") - val adjustedParserOptions = (parserOptions ?: ParserOptions()) - .copy(useFastDoubleParser = parserOptions?.useFastDoubleParser ?: true) - customDoubleParser(DataFrameCustomDoubleParser(adjustedParserOptions)) + customDoubleParser(DataFrameCustomDoubleParser(parserOptions)) // use the given nullStrings if provided, else take the global ones + some extras val nullStrings = parserOptions?.nullStrings ?: 
(DataFrame.parser.nulls + DEFAULT_DELIM_NULL_STRINGS) diff --git a/docs/StardustDocs/topics/convert.md b/docs/StardustDocs/topics/convert.md index bd7e2088e5..6cbe4e8e5f 100644 --- a/docs/StardustDocs/topics/convert.md +++ b/docs/StardustDocs/topics/convert.md @@ -44,7 +44,7 @@ df.convert { name }.asFrame { it.add("fullName") { "$firstName $lastName" } } * `Int` (and `Char`) * `Long` * `Float` -* `Double` +* `Double` (See [parsing doubles](parse.md#parsing-doubles) for `String` to `Double` conversion) * `BigDecimal` * `BigInteger` * `LocalDateTime` (kotlinx.datetime and java.time) diff --git a/docs/StardustDocs/topics/parse.md b/docs/StardustDocs/topics/parse.md index a8dbd5806e..7a4460b071 100644 --- a/docs/StardustDocs/topics/parse.md +++ b/docs/StardustDocs/topics/parse.md @@ -5,6 +5,10 @@ Returns a [`DataFrame`](DataFrame.md) in which the given `String` columns are pa This is a special case of the [convert](convert.md) operation. +This parsing operation is sometimes executed implicitly, for example, when [reading from CSV](read.md) or +[type converting from `String` columns](convert.md). +You can recognize this by the `locale` or `parserOptions` arguments in these functions. + ```kotlin @@ -25,6 +29,8 @@ df.parse { age and weight } +### Parsing Order + `parse` tries to parse every `String` column into one of supported types in the following order: * `Int` * `Long` @@ -34,16 +40,35 @@ df.parse { age and weight } * `Duration` (`kotlin.time` and `java.time`) * `LocalTime` (`java.time`) * `URL` (`java.net`) -* `Double` (with optional locale settings) +* [`Double` (with optional locale settings)](#parsing-doubles) * `Boolean` * `BigDecimal` * `JSON` (arrays and objects) +### Parser Options + +DataFrame supports multiple parser options that can be used to customize the parsing behavior. +These can be supplied to the `parse` function (or any other function that can implicitly parse `Strings`) +as an argument. 
+ +For each option you don't supply (or supply `null`), DataFrame will take the value from the +[Global Parser Options](#global-parser-options). + Available parser options: -* `locale: Locale` is used to parse doubles +* `locale: Locale` is used to [parse doubles](#parsing-doubles) + * Global default locale is `Locale.getDefault()` * `dateTimePattern: String` is used to parse date and time + * Global default supports ISO (local) date-time * `dateTimeFormatter: DateTimeFormatter` is used to parse date and time -* `nullStrings: List` is used to treat particular strings as `null` value. Default null strings are **"null"** and **"NULL"** + * Is derived from `dateTimePattern` and/or `locale` if `null` +* `nullStrings: List<String>` is used to treat particular strings as `null` value + * Global default null strings are **"null"**, **"NULL"**, **"NA"**, and **"N/A"** + * When [reading from CSV](read.md), we include even more defaults, like **""**. + See the KDocs there for the exact details +* `skipTypes: Set<KType>` types that should be skipped during parsing + * Empty set by global default; parsing can result in any supported type +* `useFastDoubleParser: Boolean` is used to enable or disable the [new fast double parser](#parsing-doubles) + * Enabled by global default @@ -54,8 +79,14 @@ df.parse(options = ParserOptions(locale = Locale.CHINA, dateTimeFormatter = Date -You can also set global parser options that will be used by default in [`read`](read.md), [`convert`](convert.md), -and `parse` operations: +### Global Parser Options + +As mentioned before, you can change the default global parser options that will be used by [`read`](read.md), +[`convert`](convert.md), and other `parse` operations. +Whenever you don't explicitly provide [parser options](#parser-options) to a function call, +DataFrame will use these global options instead. 
+ +For example, to change the locale to French and add a custom date-time pattern for all following DataFrame calls, do: @@ -64,4 +95,43 @@ DataFrame.parser.locale = Locale.FRANCE DataFrame.parser.addDateTimePattern("dd.MM.uuuu HH:mm:ss") ``` +For `locale`, this means that the one being used by the parser is defined as: + +↪ The locale given as function argument directly, or in `parserOptions`, if it is not `null`, else + +    ↪ The locale set by `DataFrame.parser.locale = ...`, if it is not `null`, else + +        ↪ `Locale.getDefault()`, which is the system's default locale that can be changed with `Locale.setDefault()`. + +### Parsing Doubles + +DataFrame has a new fast and powerful double parser enabled by default. +It is based on [the FastDoubleParser library](https://github.com/wrandelshofer/FastDoubleParser) for its +high performance and configurability +(in the future, we might expand this support to `Float`, `BigDecimal`, and `BigInteger` as well). + +The parser is locale-aware; it will use the locale set by the +[(global)](#global-parser-options) [parser options](#parser-options) to parse the doubles. +It also has a fallback mechanism built in, meaning it can recognize characters from +all other locales (and some from [Wikipedia](https://en.wikipedia.org/wiki/Decimal_separator)) +and parse them correctly as long as they don't conflict with the current locale. + +For example, if your locale uses ',' as decimal separator, it will not recognize ',' as thousands separator, but it will +recognize ''', ' ', '٬', '_', ' ', etc. as such. +The same holds for characters like "e", "inf", "×10^", "NaN", etc. (ignoring case). + +This means you can safely parse `"123'456 789,012.345×10^6"` with a US locale but not `"1.234,5"`. 
+ +Aside from this, DataFrame also explicitly recognizes "∞", "inf", "infinity", and "infty" as `Double.POSITIVE_INFINITY` +(as well as their negative counterparts), "nan", "na", and "n/a" as `Double.NaN`, +and all forms of whitespace are treated equally. + +If `FastDoubleParser` fails to parse a `String` as `Double`, DataFrame will try +to parse it using the standard `NumberFormat.parse()` function as a last resort. + +If you experience any issues with the new parser, you can turn it off by setting +`useFastDoubleParser = false`, which will use the old `NumberFormat.parse()` function instead. + +Please [report](https://github.com/Kotlin/dataframe/issues) any issues you encounter. +