[SPARK-52544][SQL] Allow configuring Json datasource string length limit through SQLConf #51235

Open · wants to merge 1 commit into master

@@ -55,7 +55,7 @@ class JSONOptions(
  private val maxStringLen: Int = parameters
    .get("maxStringLen")
    .map(_.toInt)
-   .getOrElse(StreamReadConstraints.DEFAULT_MAX_STRING_LEN)
+   .getOrElse(SQLConf.get.getConf(SQLConf.JSON_MAX_STRING_LENGTH))

Contributor:

can we follow ParquetOptions and pass a SQLConf instance to construct JSONOptions?


def this(
parameters: Map[String, String],
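For illustration, a minimal sketch of that suggestion follows: the SQLConf is captured when the options object is built (as ParquetOptions does) rather than read from SQLConf.get inside the field initializer. The class and constructor shape here are simplified assumptions, not the PR's actual signatures.

// Sketch only: pass a SQLConf at construction time; names other than
// SQLConf and JSON_MAX_STRING_LENGTH are simplified for illustration.
import org.apache.spark.sql.internal.SQLConf

class JsonOptionsSketch(parameters: Map[String, String], sqlConf: SQLConf) {

  // Secondary constructor keeps existing call sites working by defaulting
  // to the active session conf.
  def this(parameters: Map[String, String]) = this(parameters, SQLConf.get)

  // The per-read "maxStringLen" option still wins; the conf only supplies
  // the default.
  val maxStringLen: Int = parameters
    .get("maxStringLen")
    .map(_.toInt)
    .getOrElse(sqlConf.getConf(SQLConf.JSON_MAX_STRING_LENGTH))
}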

@@ -5016,6 +5016,14 @@ object SQLConf {
      .booleanConf
      .createWithDefault(false)

  val JSON_MAX_STRING_LENGTH =
    buildConf("spark.sql.json.defaultMaxStringLength")
      .doc("Global default maximum string length limit when reading JSON data. It will be " +
        "overridden if a JSONOption maxStringLen is provided.")
      .version("3.5.0")

Member:

4.1.0

Contributor:

This needs to be backported to 3.5 to fix a regression, how about 3.5.7?

      .intConf
      .createWithDefault(Int.MaxValue)

  val VARIANT_ALLOW_DUPLICATE_KEYS =
    buildConf("spark.sql.variant.allowDuplicateKeys")
      .internal()
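As a usage note, here is a small sketch of how the new conf and the existing per-read option would interact, assuming the key spark.sql.json.defaultMaxStringLength from this diff; the input path is a placeholder.

// Raise the session-wide default string length limit for JSON reads.
spark.conf.set("spark.sql.json.defaultMaxStringLength", Int.MaxValue.toString)

// A per-read maxStringLen option still overrides the session default.
val df = spark.read
  .schema("longString string")
  .option("maxStringLen", "20000000")
  .json("/path/to/input.json")  // placeholder path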

@@ -25,7 +25,7 @@
import java.time.{Duration, Instant, LocalDate, LocalDateTime, Period, ZoneId}
import java.util.Locale
import java.util.concurrent.atomic.AtomicLong

-import com.fasterxml.jackson.core.JsonFactory
+import com.fasterxml.jackson.core.{JsonFactory, StreamReadConstraints}
import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.hadoop.io.SequenceFile.CompressionType

@@ -4096,6 +4096,49 @@ abstract class JsonSuite
Row("{null, bad json}"), Row("{[1,2,3], null}"))
)
}

test("Test JSON data source maxStringLen option") {
// Create a JSON string that is way longer than DEFAULT_MAX_STRING_LEN.
val longStringSize = StreamReadConstraints.DEFAULT_MAX_STRING_LEN * 10

Contributor:

does * 2 work?

    val longString: String = "a" * longStringSize
    val longStringJson = s"""{ "longString": "$longString" }"""

    withTempDir { tmpDir =>
      // Create a JSON file with a string that is longer than DEFAULT_MAX_STRING_LEN.
      val inputPath = new File(tmpDir, "input.json").toPath
      Files.write(inputPath, longStringJson.getBytes)

      // With JSON_MAX_STRING_LENGTH set to Int.max, should be able to read and write
      // the long string.
      withSQLConf(SQLConf.JSON_MAX_STRING_LENGTH.key -> s"${Int.MaxValue}") {
        val df = spark.read.schema("longString string")
          .json(inputPath.toString)
        assert(df.collect() === Row(longString) :: Nil)

        val e = intercept[SparkException] {
          spark.read.schema("longString string")
            .option("maxStringLen", 10)
            .option("mode", "FAILFAST")
            .json(inputPath.toString)
            .collect()
        }
        assert(e.getCause.getMessage.contains(
          "Malformed records are detected in record parsing"))
      }

      withSQLConf(SQLConf.JSON_MAX_STRING_LENGTH.key -> s"${longStringSize - 1}") {
        val e = intercept[SparkException] {
          spark.read.schema("longString string")
            .option("mode", "FAILFAST")
            .json(inputPath.toString)
            .collect()
        }
        assert(e.getCondition.startsWith("FAILED_READ_FILE"))
        assert(e.getCause.getMessage.contains(
          "Malformed records are detected in record parsing"))
      }
    }
  }
}

class JsonV1Suite extends JsonSuite {
Expand Down