Commit c5ed408

dengziming authored and cloud-fan committed
[SPARK-52545][SQL] Standardize double-quote escaping to follow SQL specification
### What changes were proposed in this pull request?

This PR standardizes Spark's string literal parsing to align with other database systems (MySQL, Hive, etc.) by implementing proper quote-escaping rules.

Previous behavior (inconsistent):

```
spark.sql("""SELECT " ""aa"" " AS f1, " \"aa\" " AS f2, '""aa""' as f3""".stripMargin).show()
+----+------+------+
|  f1|    f2|    f3|
+----+------+------+
| aa | "aa" |""aa""|
+----+------+------+
```

New behavior (standard-compliant):

```
spark.sql("""SELECT " ""aa"" " AS f1, " \"aa\" " AS f2, '""aa""' as f3""".stripMargin).show()
+------+------+------+
|    f1|    f2|    f3|
+------+------+------+
| "aa" | "aa" |""aa""|
+------+------+------+
```

### Why are the changes needed?

The current behavior incorrectly treats consecutive quotes as string concatenation rather than escaping. The new behavior aligns with major databases:

- MySQL: [Reference](https://www.atlassian.com/data/sql/single-double-quote-and-backticks-in-mysql-queries)
- Hive: [Hplsql.g4 implementation](https://github.com/apache/hive/blob/master/hplsql/src/main/antlr4/org/apache/hive/hplsql/Hplsql.g4#L1997-L1999)

### Does this PR introduce _any_ user-facing change?

Yes.

1. New consecutive-quote parsing: `'a''b'` → `a'b` (quotes treated as escapes); previously `'a''b'` → `ab` (quotes treated as literal separators for string concatenation). The legacy behavior can be restored via `SET spark.sql.legacy.consecutiveStringLiterals.enabled=true`.
2. Double-quote escaping (`""`, `''`) is now consistently supported in all string-literal contexts, such as column comments, partition directory names, and other SQL string literals.
3. ANSI identifier enhancement: double-quoted identifiers now support quote escaping in ANSI mode, e.g. `CREATE TABLE "t" ("I""d" INT)`. Note that while parsing allows escaped quotes, some downstream analysis may reject identifiers containing special characters (e.g., `"` in table names).

### How was this patch tested?

Added SQL tests based on the existing test files.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #51242 from dengziming/SPARK-52545.

Authored-by: dengziming <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 591e1c3 · commit c5ed408

File tree

18 files changed: +393 −12 lines changed

sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4

Lines changed: 2 additions & 2 deletions
```diff
@@ -548,13 +548,13 @@ HENT_END: '*/';
 QUESTION: '?';
 
 STRING_LITERAL
-    : '\'' ( ~('\''|'\\') | ('\\' .) )* '\''
+    : '\'' ( ~('\''|'\\') | ('\\' .) | ('\'' '\'') )* '\''
     | 'R\'' (~'\'')* '\''
     | 'R"'(~'"')* '"'
     ;
 
 DOUBLEQUOTED_STRING
-    :'"' ( ~('"'|'\\') | ('\\' .) )* '"'
+    :'"' ( ~('"'|'\\') | '""' | ('\\' .) )* '"'
     ;
 
 // NOTE: If you move a numeric literal, you should modify `ParserUtils.toExprAlias()`
```
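The practical effect of the two new alternatives: a doubled quote inside a literal is now consumed as part of the same token and later unescaped to a single quote. A quick sanity check, assuming a spark-shell session bound to `spark` (expected output per the PR description):

```scala
// A doubled quote no longer terminates the string token; it unescapes to one
// literal quote character.
spark.sql("SELECT 'a''b' AS single_quoted, \"a\"\"b\" AS double_quoted").show()
// +-------------+-------------+
// |single_quoted|double_quoted|
// +-------------+-------------+
// |          a'b|          a"b|
// +-------------+-------------+
```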

sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkParserUtils.scala

Lines changed: 40 additions & 4 deletions
```diff
@@ -26,8 +26,19 @@ import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin}
 
 trait SparkParserUtils {
 
-  /** Unescape backslash-escaped string enclosed by quotes. */
-  def unescapeSQLString(b: String): String = {
+  /**
+   * Unescape escaped string enclosed by quotes, with support for:
+   * 1. Double-quote escaping (`""`, `''`)
+   * 2. Traditional backslash escaping (\n, \t, \", etc.)
+   *
+   * @param b
+   *   The input string
+   * @param ignoreQuoteQuote
+   *   If true, consecutive quotes (`''` or `""`) are treated as string concatenation and will be
+   *   removed directly (e.g., `'a''b'` → `ab`). If false, they are treated as escape sequences
+   *   (e.g., `'a''b'` → `a'b`). Default is false (standard SQL escaping).
+   */
+  def unescapeSQLString(b: String, ignoreQuoteQuote: Boolean = false): String = {
     def appendEscapedChar(n: Char, sb: JStringBuilder): Unit = {
       n match {
         case '0' => sb.append('\u0000')
@@ -71,10 +82,20 @@ trait SparkParserUtils {
       firstChar == 'r' || firstChar == 'R'
     }
 
+    val isDoubleQuotedString = {
+      b.charAt(0) == '"'
+    }
+
+    val isSingleQuotedString = {
+      b.charAt(0) == '\''
+    }
+
     if (isRawString) {
       // Skip the 'r' or 'R' and the first and last quotations enclosing the string literal.
       b.substring(2, b.length - 1)
-    } else if (b.indexOf('\\') == -1) {
+    } else if (b.indexOf('\\') == -1 &&
+      (!isDoubleQuotedString || b.indexOf("\"\"") == -1) &&
+      (!isSingleQuotedString || b.indexOf("''") == -1)) {
       // Fast path for the common case where the string has no escaped characters,
       // in which case we just skip the first and last quotations enclosing the string literal.
       b.substring(1, b.length - 1)
@@ -85,7 +106,19 @@ trait SparkParserUtils {
       val length = b.length - 1
       while (i < length) {
         val c = b.charAt(i)
-        if (c != '\\' || i + 1 == length) {
+        // First check for double-quote escaping (`""`, `''`)
+        if (isDoubleQuotedString && c == '"' && i + 1 < length && b.charAt(i + 1) == '"') {
+          if (!ignoreQuoteQuote) {
+            sb.append('"')
+          }
+          i += 2
+        } else if (isSingleQuotedString && c == '\'' && i + 1 < length && b.charAt(
+            i + 1) == '\'') {
+          if (!ignoreQuoteQuote) {
+            sb.append('\'')
+          }
+          i += 2
+        } else if (c != '\\' || i + 1 == length) {
           // Either a regular character or a backslash at the end of the string:
           sb.append(c)
           i += 1
@@ -138,6 +171,9 @@ trait SparkParserUtils {
   /** Convert a string token into a string. */
   def string(token: Token): String = unescapeSQLString(token.getText)
 
+  /** Convert a string token into a string and remove `""` and `''`. */
+  def stringIgnoreQuoteQuote(token: Token): String = unescapeSQLString(token.getText, true)
+
   /** Convert a string node into a string. */
   def string(node: TerminalNode): String = unescapeSQLString(node.getText)
```
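For intuition, here is a minimal self-contained sketch of just the doubled-quote branch above. `unescapeQuoteQuote` is an illustrative name rather than Spark API, and the backslash, raw-string, and unicode handling of the real `unescapeSQLString` is omitted:

```scala
// Minimal sketch of the doubled-quote handling only. The real method also
// handles backslash escapes, raw strings, and unicode escapes.
def unescapeQuoteQuote(b: String, ignoreQuoteQuote: Boolean = false): String = {
  val quote = b.charAt(0) // '\'' or '"', matching the enclosing quote
  val sb = new StringBuilder
  var i = 1
  val length = b.length - 1 // stop before the closing quote
  while (i < length) {
    val c = b.charAt(i)
    if (c == quote && i + 1 < length && b.charAt(i + 1) == quote) {
      // Doubled quote: emit one quote (escape), or nothing (legacy concat).
      if (!ignoreQuoteQuote) sb.append(quote)
      i += 2
    } else {
      sb.append(c)
      i += 1
    }
  }
  sb.toString
}

assert(unescapeQuoteQuote("'a''b'") == "a'b")
assert(unescapeQuoteQuote("'a''b'", ignoreQuoteQuote = true) == "ab")
assert(unescapeQuoteQuote("\"a\"\"b\"") == "a\"b")
```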

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 6 additions & 4 deletions
```diff
@@ -44,14 +44,14 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin}
 import org.apache.spark.sql.catalyst.trees.TreePattern.PARAMETER
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
-import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, CollationFactory, DateTimeUtils, IntervalUtils, SparkParserUtils}
+import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, CollationFactory, DateTimeUtils, IntervalUtils}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, convertSpecialTimestamp, convertSpecialTimestampNTZ, getZoneId, stringToDate, stringToTime, stringToTimestamp, stringToTimestampWithoutTimeZone}
 import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsNamespaces, TableCatalog, TableWritePrivilege}
 import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition
 import org.apache.spark.sql.connector.expressions.{ApplyTransform, BucketTransform, DaysTransform, Expression => V2Expression, FieldReference, HoursTransform, IdentityTransform, LiteralValue, MonthsTransform, Transform, YearsTransform}
 import org.apache.spark.sql.errors.{DataTypeErrorsBase, QueryCompilationErrors, QueryParsingErrors, SqlScriptingErrors}
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.internal.SQLConf.LEGACY_BANG_EQUALS_NOT
+import org.apache.spark.sql.internal.SQLConf.{LEGACY_BANG_EQUALS_NOT, LEGACY_CONSECUTIVE_STRING_LITERALS}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
 import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
@@ -130,7 +130,7 @@ class AstBuilder extends DataTypeAstBuilder
    * @return The original input text, including all whitespaces and formatting.
    */
   private def getOriginalText(ctx: ParserRuleContext): String = {
-    SparkParserUtils.source(ctx)
+    source(ctx)
   }
 
   /**
@@ -1825,7 +1825,7 @@ class AstBuilder extends DataTypeAstBuilder
       // syntax error here accordingly.
       val error: String = (if (n.name != null) n.name else n.identifierList).getText
       throw new ParseException(
-        command = Some(SparkParserUtils.command(n)),
+        command = Some(command(n)),
         start = Origin(),
         errorClass = "PARSE_SYNTAX_ERROR",
         messageParameters = Map(
@@ -3642,6 +3642,8 @@ class AstBuilder extends DataTypeAstBuilder
   private def createString(ctx: StringLiteralContext): String = {
     if (conf.escapedStringLiterals) {
       ctx.stringLit.asScala.map(x => stringWithoutUnescape(visitStringLit(x))).mkString
+    } else if (conf.getConf(LEGACY_CONSECUTIVE_STRING_LITERALS)) {
+      ctx.stringLit.asScala.map(x => stringIgnoreQuoteQuote(visitStringLit(x))).mkString
     } else {
       ctx.stringLit.asScala.map(x => string(visitStringLit(x))).mkString
     }
```
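Note that `createString` still joins adjacent literals with `mkString` in every branch, so whitespace-separated literal concatenation is unchanged; only the meaning of a doubled quote inside a single literal differs. A sketch, assuming a spark-shell session (expected output per the golden files below):

```scala
// Adjacent string literals still concatenate in both modes; only doubled
// quotes *inside one literal* changed meaning.
spark.sql("""SELECT "S" "par" "k" AS c1, 'S' "par" 'k' AS c2""").show()
// +-----+-----+
// |   c1|   c2|
// +-----+-----+
// |Spark|Spark|
// +-----+-----+
```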

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 10 additions & 0 deletions
```diff
@@ -4096,6 +4096,16 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
+  val LEGACY_CONSECUTIVE_STRING_LITERALS =
+    buildConf("spark.sql.legacy.consecutiveStringLiterals.enabled")
+      .internal()
+      .doc("When true, consecutive string literals separated by double quotes (e.g. 'a''b') " +
+        "will be parsed as concatenated strings. This preserves pre-Spark 4.0 behavior where " +
+        "'a''b' would be parsed as 'ab' instead of 'a'b'.")
+      .version("4.1.0")
+      .booleanConf
+      .createWithDefault(false)
+
   val ANSI_RELATION_PRECEDENCE = buildConf("spark.sql.ansi.relationPrecedence")
     .doc(s"When true and '${ANSI_ENABLED.key}' is true, JOIN takes precedence over comma when " +
       "combining relation. For example, `t1, t2 JOIN t3` should result to `t1 X (t2 X t3)`. If " +
```

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala

Lines changed: 10 additions & 0 deletions
```diff
@@ -143,6 +143,16 @@ class ParserUtilsSuite extends SparkFunSuite {
     // Guard against off-by-one errors in the "all chars are hex" routine:
     assert(unescapeSQLString("\"abc\\uAAAXa\"") == "abcuAAAXa")
 
+    // Double-quote escaping ("", '')
+    assert(unescapeSQLString(""" "a""a" """.trim) == """ a"a """.trim)
+    assert(unescapeSQLString(""" "a""a" """.trim, true) == "aa")
+    assert(unescapeSQLString(""" 'a''a' """.trim) == "a'a")
+    assert(unescapeSQLString(""" 'a''a' """.trim, true) == "aa")
+    // Single-quoted double quote string or double-quoted single quote string isn't affected
+    assert(unescapeSQLString(""" 'a""a' """.trim) == """ a""a """.trim)
+    assert(unescapeSQLString(""" 'a""a' """.trim, true) == """ a""a """.trim)
+    assert(unescapeSQLString(""" "a''a" """.trim) == "a''a")
+    assert(unescapeSQLString(""" "a''a" """.trim, true) == "a''a")
     // scalastyle:on nonascii
   }
```

sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers-enabled.sql.out

Lines changed: 20 additions & 0 deletions
```diff
@@ -154,6 +154,26 @@ org.apache.spark.sql.AnalysisException
 }
 
 
+-- !query
+select 1 from "not_""exists"
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "TABLE_OR_VIEW_NOT_FOUND",
+  "sqlState" : "42P01",
+  "messageParameters" : {
+    "relationName" : "`not_\"exists`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 15,
+    "stopIndex" : 28,
+    "fragment" : "\"not_\"\"exists\""
+  } ]
+}
+
+
 -- !query
 SELECT 1 FROM `hello`
 -- !query analysis
```
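For context, a sketch of what this golden file exercises, assuming a spark-shell session; `spark.sql.ansi.doubleQuotedIdentifiers` is the existing conf that makes double-quoted tokens parse as identifiers:

```scala
spark.sql("SET spark.sql.ansi.doubleQuotedIdentifiers=true")
// With the flag on, "not_""exists" now lexes as the single identifier
// not_"exists (escaped quote), so parsing succeeds and the failure moves
// to analysis:
spark.sql("select 1 from \"not_\"\"exists\"")
// org.apache.spark.sql.catalyst.ExtendedAnalysisException: TABLE_OR_VIEW_NOT_FOUND
```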

sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers.sql.out

Lines changed: 14 additions & 0 deletions
```diff
@@ -111,6 +111,20 @@ org.apache.spark.sql.catalyst.parser.ParseException
 }
 
 
+-- !query
+select 1 from "not_""exists"
+-- !query analysis
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "PARSE_SYNTAX_ERROR",
+  "sqlState" : "42601",
+  "messageParameters" : {
+    "error" : "'\"not_\"\"exists\"'",
+    "hint" : ""
+  }
+}
+
+
 -- !query
 SELECT 1 FROM `hello`
 -- !query analysis
```
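The contrast with the previous golden file: without the ANSI identifier flag, a double-quoted token in FROM position is a string literal, which is not a valid relation, so the same query is rejected by the parser rather than the analyzer. A sketch under the same assumptions:

```scala
// Default mode (no ANSI double-quoted identifiers): the double-quoted token
// is a string literal and cannot name a relation.
spark.sql("select 1 from \"not_\"\"exists\"")
// org.apache.spark.sql.catalyst.parser.ParseException: PARSE_SYNTAX_ERROR
```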

sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out

Lines changed: 48 additions & 0 deletions
```diff
@@ -706,3 +706,51 @@ select -0, -0.0
 -- !query analysis
 Project [0 AS 0#x, 0.0 AS 0.0#x]
 +- OneRowRelation
+
+
+-- !query
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3
+-- !query analysis
+Project [S"par"k AS c1#x, S"par"k AS c2#x, S""par""k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3
+-- !query analysis
+Project [S'par'k AS c1#x, S'par'k AS c2#x, S''par''k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6
+-- !query analysis
+Project [Spark AS c1#x, Spark AS c2#x, Spark AS c3#x, Spark AS c4#x, Spark AS c5#x, Spark AS c6#x]
++- OneRowRelation
+
+
+-- !query
+SET spark.sql.legacy.consecutiveStringLiterals.enabled=true
+-- !query analysis
+SetCommand (spark.sql.legacy.consecutiveStringLiterals.enabled,Some(true))
+
+
+-- !query
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3
+-- !query analysis
+Project [Spark AS c1#x, S"par"k AS c2#x, S""par""k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3
+-- !query analysis
+Project [Spark AS c1#x, S'par'k AS c2#x, S''par''k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6
+-- !query analysis
+Project [Spark AS c1#x, Spark AS c2#x, Spark AS c3#x, Spark AS c4#x, Spark AS c5#x, Spark AS c6#x]
++- OneRowRelation
```
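One invariant worth noting in these results: only same-quote doubling changed meaning. Backslash escapes (c2) and opposite-quote sequences (c3) produce identical results before and after the SET. A sketch assuming a spark-shell session:

```scala
// Backslash escapes and opposite-quote sequences are parsed the same in
// both modes; only 'S''par''k'-style doubling is affected by the flag.
spark.sql("""SELECT "S\"par\"k" AS c2, 'S""par""k' AS c3""").show()
// +-------+---------+
// |     c2|       c3|
// +-------+---------+
// |S"par"k|S""par""k|
// +-------+---------+
```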

sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/double-quoted-identifiers.sql.out

Lines changed: 14 additions & 0 deletions
```diff
@@ -111,6 +111,20 @@ org.apache.spark.sql.catalyst.parser.ParseException
 }
 
 
+-- !query
+select 1 from "not_""exists"
+-- !query analysis
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "PARSE_SYNTAX_ERROR",
+  "sqlState" : "42601",
+  "messageParameters" : {
+    "error" : "'\"not_\"\"exists\"'",
+    "hint" : ""
+  }
+}
+
+
 -- !query
 SELECT 1 FROM `hello`
 -- !query analysis
```

sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/literals.sql.out

Lines changed: 48 additions & 0 deletions
```diff
@@ -706,3 +706,51 @@ select -0, -0.0
 -- !query analysis
 Project [0 AS 0#x, 0.0 AS 0.0#x]
 +- OneRowRelation
+
+
+-- !query
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3
+-- !query analysis
+Project [S"par"k AS c1#x, S"par"k AS c2#x, S""par""k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3
+-- !query analysis
+Project [S'par'k AS c1#x, S'par'k AS c2#x, S''par''k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6
+-- !query analysis
+Project [Spark AS c1#x, Spark AS c2#x, Spark AS c3#x, Spark AS c4#x, Spark AS c5#x, Spark AS c6#x]
++- OneRowRelation
+
+
+-- !query
+SET spark.sql.legacy.consecutiveStringLiterals.enabled=true
+-- !query analysis
+SetCommand (spark.sql.legacy.consecutiveStringLiterals.enabled,Some(true))
+
+
+-- !query
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3
+-- !query analysis
+Project [Spark AS c1#x, S"par"k AS c2#x, S""par""k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3
+-- !query analysis
+Project [Spark AS c1#x, S'par'k AS c2#x, S''par''k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6
+-- !query analysis
+Project [Spark AS c1#x, Spark AS c2#x, Spark AS c3#x, Spark AS c4#x, Spark AS c5#x, Spark AS c6#x]
++- OneRowRelation
```
