@@ -548,13 +548,13 @@ HENT_END: '*/';
QUESTION: '?';

STRING_LITERAL
-    : '\'' ( ~('\''|'\\') | ('\\' .) )* '\''
+    : '\'' ( ~('\''|'\\') | ('\\' .) | ('\'' '\'') )* '\''
    | 'R\'' (~'\'')* '\''
    | 'R"'(~'"')* '"'
    ;

DOUBLEQUOTED_STRING
-    :'"' ( ~('"'|'\\') | ('\\' .) )* '"'
+    :'"' ( ~('"'|'\\') | '""' | ('\\' .) )* '"'
    ;

// NOTE: If you move a numeric literal, you should modify `ParserUtils.toExprAlias()`
================================================================================

@@ -26,8 +26,19 @@ import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin}

trait SparkParserUtils {

-  /** Unescape backslash-escaped string enclosed by quotes. */
-  def unescapeSQLString(b: String): String = {
+  /**
+   * Unescape escaped string enclosed by quotes, with support for:
+   * 1. Double-quote escaping (`""`, `''`)
+   * 2. Traditional backslash escaping (\n, \t, \", etc.)
+   *
+   * @param b
+   *   The input string
+   * @param ignoreQuoteQuote
+   *   If true, consecutive quotes (`''` or `""`) are treated as string concatenation and will be
+   *   removed directly (e.g., `'a''b'` → `ab`). If false, they are treated as escape sequences
+   *   (e.g., `'a''b'` → `a'b`). Default is false (standard SQL escaping).
+   */
+  def unescapeSQLString(b: String, ignoreQuoteQuote: Boolean = false): String = {
    def appendEscapedChar(n: Char, sb: JStringBuilder): Unit = {
      n match {
        case '0' => sb.append('\u0000')
@@ -71,10 +82,20 @@ trait SparkParserUtils {
      firstChar == 'r' || firstChar == 'R'
    }

+    val isDoubleQuotedString = {
+      b.charAt(0) == '"'
+    }
+
+    val isSingleQuotedString = {
+      b.charAt(0) == '\''
+    }
+
    if (isRawString) {
      // Skip the 'r' or 'R' and the first and last quotations enclosing the string literal.
      b.substring(2, b.length - 1)
-    } else if (b.indexOf('\\') == -1) {
+    } else if (b.indexOf('\\') == -1 &&
+      (!isDoubleQuotedString || b.indexOf("\"\"") == -1) &&
+      (!isSingleQuotedString || b.indexOf("''") == -1)) {
      // Fast path for the common case where the string has no escaped characters,
      // in which case we just skip the first and last quotations enclosing the string literal.
      b.substring(1, b.length - 1)
@@ -85,7 +106,19 @@ trait SparkParserUtils {
      val length = b.length - 1
      while (i < length) {
        val c = b.charAt(i)
-        if (c != '\\' || i + 1 == length) {
+        // First check for double-quote escaping (`""`, `''`)
+        if (isDoubleQuotedString && c == '"' && i + 1 < length && b.charAt(i + 1) == '"') {
+          if (!ignoreQuoteQuote) {
+            sb.append('"')
+          }
+          i += 2
+        } else if (isSingleQuotedString && c == '\'' && i + 1 < length && b.charAt(
+            i + 1) == '\'') {
+          if (!ignoreQuoteQuote) {
+            sb.append('\'')
+          }
+          i += 2
+        } else if (c != '\\' || i + 1 == length) {
          // Either a regular character or a backslash at the end of the string:
          sb.append(c)
          i += 1
@@ -138,6 +171,9 @@ trait SparkParserUtils {
  /** Convert a string token into a string. */
  def string(token: Token): String = unescapeSQLString(token.getText)

+  /** Convert a string token into a string and remove `""` and `''`. */
+  def stringIgnoreQuoteQuote(token: Token): String = unescapeSQLString(token.getText, true)
+
  /** Convert a string node into a string. */
  def string(node: TerminalNode): String = unescapeSQLString(node.getText)

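To make the two modes concrete, here is a minimal sketch (not part of the diff) that drives `unescapeSQLString` directly. It assumes a throwaway object mixing in `SparkParserUtils`; the expected values mirror the `ParserUtilsSuite` additions further down.

```scala
import org.apache.spark.sql.catalyst.util.SparkParserUtils

// Minimal sketch of the two unescaping modes added above.
object UnescapeSketch extends SparkParserUtils {
  def main(args: Array[String]): Unit = {
    // Default (standard SQL escaping): a doubled quote collapses to one quote.
    assert(unescapeSQLString("'a''b'") == "a'b")
    assert(unescapeSQLString("\"a\"\"b\"") == "a\"b")
    // Legacy concatenation mode: the doubled quote is removed entirely.
    assert(unescapeSQLString("'a''b'", ignoreQuoteQuote = true) == "ab")
  }
}
```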
================================================================================

@@ -42,14 +42,14 @@ import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin}
import org.apache.spark.sql.catalyst.trees.TreePattern.PARAMETER
import org.apache.spark.sql.catalyst.types.DataTypeUtils
-import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, CollationFactory, DateTimeUtils, IntervalUtils, SparkParserUtils}
+import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, CollationFactory, DateTimeUtils, IntervalUtils}
import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, convertSpecialTimestamp, convertSpecialTimestampNTZ, getZoneId, stringToDate, stringToTime, stringToTimestamp, stringToTimestampWithoutTimeZone}
import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsNamespaces, TableCatalog, TableWritePrivilege}
import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition
import org.apache.spark.sql.connector.expressions.{ApplyTransform, BucketTransform, DaysTransform, Expression => V2Expression, FieldReference, HoursTransform, IdentityTransform, LiteralValue, MonthsTransform, Transform, YearsTransform}
import org.apache.spark.sql.errors.{DataTypeErrorsBase, QueryCompilationErrors, QueryParsingErrors, SqlScriptingErrors}
import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.internal.SQLConf.LEGACY_BANG_EQUALS_NOT
+import org.apache.spark.sql.internal.SQLConf.{LEGACY_BANG_EQUALS_NOT, LEGACY_CONSECUTIVE_STRING_LITERALS}
import org.apache.spark.sql.types._
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
@@ -128,7 +128,7 @@ class AstBuilder extends DataTypeAstBuilder
   * @return The original input text, including all whitespaces and formatting.
   */
  private def getOriginalText(ctx: ParserRuleContext): String = {
-    SparkParserUtils.source(ctx)
+    source(ctx)
  }

  /**
@@ -1823,7 +1823,7 @@ class AstBuilder extends DataTypeAstBuilder
      // syntax error here accordingly.
      val error: String = (if (n.name != null) n.name else n.identifierList).getText
      throw new ParseException(
-        command = Some(SparkParserUtils.command(n)),
+        command = Some(command(n)),
        start = Origin(),
        errorClass = "PARSE_SYNTAX_ERROR",
        messageParameters = Map(
@@ -3622,6 +3622,8 @@ class AstBuilder extends DataTypeAstBuilder
  private def createString(ctx: StringLiteralContext): String = {
    if (conf.escapedStringLiterals) {
      ctx.stringLit.asScala.map(x => stringWithoutUnescape(visitStringLit(x))).mkString
+    } else if (conf.getConf(LEGACY_CONSECUTIVE_STRING_LITERALS)) {
+      ctx.stringLit.asScala.map(x => stringIgnoreQuoteQuote(visitStringLit(x))).mkString
    } else {
      ctx.stringLit.asScala.map(x => string(visitStringLit(x))).mkString
    }
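For orientation, a hedged summary of what the three branches above produce for the same literal. The default and legacy rows come from the golden files further down; the `escapedStringLiterals` row is inferred from `stringWithoutUnescape`, which strips only the enclosing quotes.

```scala
// Illustrative only: how createString renders the literal 'S''par''k'.
//   conf.escapedStringLiterals = true            -> S''par''k  (no unescaping; inferred)
//   LEGACY_CONSECUTIVE_STRING_LITERALS = true    -> Spark      ('' dropped, pre-4.0 behavior)
//   default                                      -> S'par'k    ('' is an escaped quote)
```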
================================================================================

@@ -4078,6 +4078,16 @@ object SQLConf {
    .booleanConf
    .createWithDefault(false)

+  val LEGACY_CONSECUTIVE_STRING_LITERALS =
+    buildConf("spark.sql.legacy.consecutiveStringLiterals.enabled")
+      .internal()
+      .doc("When true, consecutive string literals separated by double quotes (e.g. 'a''b') " +
+        "will be parsed as concatenated strings. This preserves pre-Spark 4.0 behavior where " +
+        "'a''b' would be parsed as 'ab' instead of 'a'b'.")
+      .version("4.1.0")
+      .booleanConf
+      .createWithDefault(false)
+
  val ANSI_RELATION_PRECEDENCE = buildConf("spark.sql.ansi.relationPrecedence")
    .doc(s"When true and '${ANSI_ENABLED.key}' is true, JOIN takes precedence over comma when " +
      "combining relation. For example, `t1, t2 JOIN t3` should result to `t1 X (t2 X t3)`. If " +
================================================================================

@@ -143,6 +143,16 @@ class ParserUtilsSuite extends SparkFunSuite {
    // Guard against off-by-one errors in the "all chars are hex" routine:
    assert(unescapeSQLString("\"abc\\uAAAXa\"") == "abcuAAAXa")

+    // Double-quote escaping ("", '')
+    assert(unescapeSQLString(""" "a""a" """.trim) == """ a"a """.trim)
+    assert(unescapeSQLString(""" "a""a" """.trim, true) == "aa")
+    assert(unescapeSQLString(""" 'a''a' """.trim) == "a'a")
+    assert(unescapeSQLString(""" 'a''a' """.trim, true) == "aa")
+    // Single-quoted double quote string or double-quoted single quote string isn't affected
+    assert(unescapeSQLString(""" 'a""a' """.trim) == """ a""a """.trim)
+    assert(unescapeSQLString(""" 'a""a' """.trim, true) == """ a""a """.trim)
+    assert(unescapeSQLString(""" "a''a" """.trim) == "a''a")
+    assert(unescapeSQLString(""" "a''a" """.trim, true) == "a''a")
    // scalastyle:on nonascii
  }

================================================================================

@@ -154,6 +154,26 @@ org.apache.spark.sql.AnalysisException
}


+-- !query
+select 1 from "not_""exists"

> Review comment (Contributor): This PR updates the string literal parsing. We don't need to
> test so many different SQL statements that contain string literals, but simply SELECT string
> literals and focus more on the string literal itself:
>
>     SELECT 'a''b'
>     SELECT 'a' 'b'
>     ...
>
> Reply (Member Author): It's indeed overkill to test these double-quoted identifiers here, so
> I removed most of the unrelated tests, but I still kept `select 1 from "not_""exists"` since
> this PR also changed the behavior when handling double-quoted identifiers.

+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "TABLE_OR_VIEW_NOT_FOUND",
+  "sqlState" : "42P01",
+  "messageParameters" : {
+    "relationName" : "`not_\"exists`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 15,
+    "stopIndex" : 28,
+    "fragment" : "\"not_\"\"exists\""
+  } ]
+}
+
+
-- !query
SELECT 1 FROM `hello`
-- !query analysis
================================================================================

@@ -111,6 +111,20 @@ org.apache.spark.sql.catalyst.parser.ParseException
}


+-- !query
+select 1 from "not_""exists"

> Review comment (Contributor): do we have similar behavior for the backtick quoting?
>
> Reply (Member Author, @dengziming, Jun 27, 2025): Yes, Spark already supports using a double
> backtick inside a backquoted identifier. Here is the grammar in SqlBaseLexer.g4:
>
>     BACKQUOTED_IDENTIFIER : '`' ( ~'`' | '``' )* '`' ;

+-- !query analysis
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "PARSE_SYNTAX_ERROR",
+  "sqlState" : "42601",
+  "messageParameters" : {
+    "error" : "'\"not_\"\"exists\"'",
+    "hint" : ""
+  }
+}
+
+
-- !query
SELECT 1 FROM `hello`
-- !query analysis
================================================================================

@@ -706,3 +706,51 @@ select -0, -0.0
-- !query analysis
Project [0 AS 0#x, 0.0 AS 0.0#x]
+- OneRowRelation
+
+
+-- !query
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3
+-- !query analysis
+Project [S"par"k AS c1#x, S"par"k AS c2#x, S""par""k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3
+-- !query analysis
+Project [S'par'k AS c1#x, S'par'k AS c2#x, S''par''k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6
+-- !query analysis
+Project [Spark AS c1#x, Spark AS c2#x, Spark AS c3#x, Spark AS c4#x, Spark AS c5#x, Spark AS c6#x]
++- OneRowRelation
+
+
+-- !query
+SET spark.sql.legacy.consecutiveStringLiterals.enabled=true
+-- !query analysis
+SetCommand (spark.sql.legacy.consecutiveStringLiterals.enabled,Some(true))
+
+
+-- !query
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3
+-- !query analysis
+Project [Spark AS c1#x, S"par"k AS c2#x, S""par""k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3
+-- !query analysis
+Project [Spark AS c1#x, S'par'k AS c2#x, S''par''k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6
+-- !query analysis
+Project [Spark AS c1#x, Spark AS c2#x, Spark AS c3#x, Spark AS c4#x, Spark AS c5#x, Spark AS c6#x]
++- OneRowRelation
================================================================================

@@ -111,6 +111,20 @@ org.apache.spark.sql.catalyst.parser.ParseException
}


+-- !query
+select 1 from "not_""exists"
+-- !query analysis
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "PARSE_SYNTAX_ERROR",
+  "sqlState" : "42601",
+  "messageParameters" : {
+    "error" : "'\"not_\"\"exists\"'",
+    "hint" : ""
+  }
+}
+
+
-- !query
SELECT 1 FROM `hello`
-- !query analysis
================================================================================

@@ -706,3 +706,51 @@ select -0, -0.0
-- !query analysis
Project [0 AS 0#x, 0.0 AS 0.0#x]
+- OneRowRelation
+
+
+-- !query
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3
+-- !query analysis
+Project [S"par"k AS c1#x, S"par"k AS c2#x, S""par""k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3
+-- !query analysis
+Project [S'par'k AS c1#x, S'par'k AS c2#x, S''par''k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6
+-- !query analysis
+Project [Spark AS c1#x, Spark AS c2#x, Spark AS c3#x, Spark AS c4#x, Spark AS c5#x, Spark AS c6#x]
++- OneRowRelation
+
+
+-- !query
+SET spark.sql.legacy.consecutiveStringLiterals.enabled=true
+-- !query analysis
+SetCommand (spark.sql.legacy.consecutiveStringLiterals.enabled,Some(true))
+
+
+-- !query
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3
+-- !query analysis
+Project [Spark AS c1#x, S"par"k AS c2#x, S""par""k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3
+-- !query analysis
+Project [Spark AS c1#x, S'par'k AS c2#x, S''par''k AS c3#x]
++- OneRowRelation
+
+
+-- !query
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6
+-- !query analysis
+Project [Spark AS c1#x, Spark AS c2#x, Spark AS c3#x, Spark AS c4#x, Spark AS c5#x, Spark AS c6#x]
++- OneRowRelation
================================================================================

@@ -1,4 +1,4 @@
--- All these should error out in the parser
+-- All these should error out in the parser in non-ansi mode, error out in the analyzer in ansi mode
SELECT 1 FROM "not_exist";

USE SCHEMA "not_exist";
@@ -15,6 +15,8 @@ SELECT "not_exist"();

SELECT "not_exist".not_exist();

+select 1 from "not_""exists";
+
-- All these should error out in analysis
SELECT 1 FROM `hello`;

@@ -35,6 +37,7 @@ SELECT `not_exist`.not_exist();
-- Strings in various situations all work
SELECT "hello";

+-- Ok for non-ansi mode, error for ansi-mode
CREATE TEMPORARY VIEW v(c1 COMMENT "hello") AS SELECT 1;
DROP VIEW v;

================================================================================
sql/core/src/test/resources/sql-tests/inputs/literals.sql (10 additions, 0 deletions)

@@ -121,3 +121,13 @@ select -x'2379ACFe';

-- normalize -0 and -0.0
select -0, -0.0;
+
+-- Double-quote escaping ("", '')
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3;
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3;
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6;
+
+SET spark.sql.legacy.consecutiveStringLiterals.enabled=true;
+SELECT "S""par""k" AS c1, "S\"par\"k" AS c2, 'S""par""k' AS c3;
+SELECT 'S''par''k' AS c1, 'S\'par\'k' AS c2, "S''par''k" AS c3;
+SELECT "S" "par" "k" AS c1, 'S' 'par' 'k' AS c2, "S" 'par' "k" AS c3, 'S' "par" 'k' AS c4, "S"'par'"k" AS c5, 'S'"par"'k' AS c6;