Add spark.sql.execution.arrow.pyspark.validateSchema.enabled flag to gate Arrow batch schema validation

wengh · wengh · commit bf9e7079fa1a · 2025-04-08T15:36:35.000-07:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3548,6 +3548,15 @@ object SQLConf {
       // show full stacktrace in tests but hide in production by default.
       .createWithDefault(!Utils.isTesting)
 
+  val PYSPARK_ARROW_VALIDATE_SCHEMA =
+    buildConf("spark.sql.execution.arrow.pyspark.validateSchema.enabled")
+      .doc(
+        "When true, validate the schema of Arrow batches returned by mapInArrow, mapInPandas " +
+        "and DataSource against the expected schema to ensure that they are compatible.")
+      .version("4.1.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val PYTHON_UDF_ARROW_ENABLED =
     buildConf("spark.sql.execution.pythonUDF.arrow.enabled")
       .doc("Enable Arrow optimization in regular Python UDFs. This optimization " +
@@ -6448,6 +6457,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
 
   def pysparkSimplifiedTraceback: Boolean = getConf(PYSPARK_SIMPLIFIED_TRACEBACK)
 
+  def pysparkArrowValidateSchema: Boolean = getConf(PYSPARK_ARROW_VALIDATE_SCHEMA)
+
   def pandasGroupedMapAssignColumnsByName: Boolean =
     getConf(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInBatchEvaluatorFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInBatchEvaluatorFactory.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
 
@@ -77,17 +78,14 @@ class MapInBatchEvaluatorFactory(
       val unsafeProj = UnsafeProjection.create(output, output)
 
       columnarBatchIter.flatMap { batch =>
-        // Ensure the schema matches the expected schema
-        val actualSchema = batch.column(0).dataType()
-        val strictCheck = true
-        val isCompatible = if (strictCheck) {
-          DataType.equalsIgnoreNullability(actualSchema, outputSchema)
-        } else {
-          outputSchema.sameType(actualSchema)
-        }
-        if (!isCompatible) {
-          throw QueryExecutionErrors.arrowDataTypeMismatchError(
-            PythonEvalType.toString(pythonEvalType), Seq(outputSchema), Seq(actualSchema))
+        if (SQLConf.get.pysparkArrowValidateSchema) {
+          // Ensure the batch schema matches the expected schema, ignoring compatible nullability
+          val actualSchema = batch.column(0).dataType()
+          val isCompatible = DataType.equalsIgnoreCompatibleNullability(actualSchema, outputSchema)
+          if (!isCompatible) {
+            throw QueryExecutionErrors.arrowDataTypeMismatchError(
+              PythonEvalType.toString(pythonEvalType), Seq(outputSchema), Seq(actualSchema))
+          }
         }
 
         // Scalar Iterator UDF returns a StructType column in ColumnarBatch, select