Skip to content
Open
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ statement
| SHOW TABLE EXTENDED ((FROM | IN) ns=identifierReference)?
LIKE pattern=stringLit partitionSpec? #showTableExtended
| SHOW TBLPROPERTIES table=identifierReference
(LEFT_PAREN key=propertyKey RIGHT_PAREN)? #showTblProperties
(LEFT_PAREN key=propertyKeyOrStringLit RIGHT_PAREN)? #showTblProperties
| SHOW COLUMNS (FROM | IN) table=identifierReference
((FROM | IN) ns=multipartIdentifier)? #showColumns
| SHOW VIEWS ((FROM | IN) identifierReference)?
Expand Down Expand Up @@ -602,14 +602,24 @@ propertyList
;

property
: key=propertyKey (EQ? value=propertyValue)?
: key=propertyKeyOrStringLit EQ value=propertyValue #propertyWithKeyAndEquals
| key=propertyKeyOrStringLitNoCoalesce value=propertyValue? #propertyWithKeyNoEquals
;

// A property key written as identifiers: one identifier, optionally followed by further
// identifiers that are each preceded by a DOT token.
propertyKey
: errorCapturingIdentifier (DOT errorCapturingIdentifier)*
;

// A property key given either as a dotted identifier path (propertyKey) or as a stringLit.
// NOTE(review): stringLit appears to accept several adjacent literals, so a key written as
// adjacent string literals would be read as one key here - confirm.
propertyKeyOrStringLit
: propertyKey
| stringLit
;

// A property key given either as a dotted identifier path (propertyKey) or as a
// singleStringLit, i.e. exactly one string literal or parameter marker.
// It is the key form used by the property alternatives that have no EQ between key and value.
// NOTE(review): presumably this keeps `'k' 'v'` as a key followed by a value instead of two
// literals merged into the key - confirm.
propertyKeyOrStringLitNoCoalesce
: propertyKey
| singleStringLit
;

propertyValue
: INTEGER_VALUE
| DECIMAL_VALUE
Expand All @@ -622,7 +632,8 @@ expressionPropertyList
;

// A key/value entry whose value is an arbitrary expression.
//   - With EQ, the key may be any propertyKeyOrStringLit.
//   - Without EQ, the key is a propertyKeyOrStringLitNoCoalesce and the value is optional,
//     so a bare key stays valid. This mirrors the no-EQ alternative of `property`
//     (`value=propertyValue?`) and the earlier form of this rule,
//     `key=propertyKey (EQ? value=expression)?`, which also accepted a key on its own.
// NOTE(review): the visitor for #expressionPropertyWithKeyNoEquals must tolerate a null
// `value` - confirm.
expressionProperty
    : key=propertyKeyOrStringLit EQ value=expression            #expressionPropertyWithKeyAndEquals
    | key=propertyKeyOrStringLitNoCoalesce value=expression?    #expressionPropertyWithKeyNoEquals
    ;

constantList
Expand Down Expand Up @@ -1266,10 +1277,10 @@ constant
| QUESTION #posParameterLiteral
| namedParameterMarker #namedParameterLiteral
| interval #intervalLiteral
| literalType stringLitWithoutMarker #typeConstructor
| literalType singleStringLitWithoutMarker #typeConstructor
| number #numericLiteral
| booleanValue #booleanLiteral
| stringLit+ #stringLiteral
| stringLit #stringLiteral
;

namedParameterMarker
Expand Down Expand Up @@ -1682,19 +1693,32 @@ alterColumnAction
| dropDefault=DROP DEFAULT
;

// Matches one or more string literals (single or double-quoted) without parameter markers.
// Multiple consecutive literals are coalesced into a single string.
stringLitWithoutMarker
: STRING_LITERAL #stringLiteralValue
| {!double_quoted_identifiers}? DOUBLEQUOTED_STRING #doubleQuotedStringLiteralValue
;
: singleStringLitWithoutMarker+
;

// Matches exactly one string literal without coalescing or parameter markers.
// Used in type constructors where coalescing is not allowed.
// The DOUBLEQUOTED_STRING alternative is guarded by a semantic predicate and is only viable
// while `double_quoted_identifiers` is false.
singleStringLitWithoutMarker
: STRING_LITERAL #singleStringLiteralValue
| {!double_quoted_identifiers}? DOUBLEQUOTED_STRING #singleDoubleQuotedStringLiteralValue
;

// Matches one string literal or parameter marker (no coalescing).
// The rule has no repetition operator, so adjacent literals are never merged at this level.
singleStringLit
: singleStringLitWithoutMarker
| parameterMarker
;

// A parameter marker: either a namedParameterMarker or a QUESTION token.
// Both alternatives are guarded by the `parameter_substitution_enabled` semantic predicate,
// so the rule can only match while that flag is true.
parameterMarker
: {parameter_substitution_enabled}? namedParameterMarker #namedParameterMarkerRule
| {parameter_substitution_enabled}? QUESTION #positionalParameterMarkerRule
;

stringLit
: stringLitWithoutMarker #stringLiteralInContext
| parameterMarker #parameterStringValue
: singleStringLit+
;

comment
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ import java.util.Locale

import scala.jdk.CollectionConverters._

import org.antlr.v4.runtime.Token
import org.antlr.v4.runtime.{CharStream, CommonToken, Token, TokenSource}
import org.antlr.v4.runtime.misc.Pair
import org.antlr.v4.runtime.tree.ParseTree

import org.apache.spark.SparkException
Expand All @@ -45,18 +46,188 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
withOrigin(ctx)(StructType(visitColTypeList(ctx.colTypeList)))
}

override def visitStringLiteralValue(ctx: StringLiteralValueContext): Token =
Option(ctx).map(_.STRING_LITERAL.getSymbol).orNull
/**
 * Converts a `stringLit` parse node into a single token.
 *
 * A `stringLit` holds `singleStringLit` children; each child is visited to obtain its token.
 * One child yields that child's token unchanged, while several children are merged into one
 * token by `createCoalescedStringToken`.
 *
 * @param ctx
 *   The stringLit context to process; may be null.
 * @return
 *   The (possibly coalesced) token, or null when ctx is null or has no children.
 */
override def visitStringLit(ctx: StringLitContext): Token = {
  // `asScala` comes from the file-level scala.jdk.CollectionConverters import, so no local
  // import is needed. A null ctx simply produces an empty token list.
  val tokens = Option(ctx).toList
    .flatMap(_.singleStringLit().asScala)
    .map(visit(_).asInstanceOf[Token])

  // NOTE(review): a reviewer asked whether token coalescing should live in AstBuilder rather
  // than in this class.
  tokens match {
    case Seq() => null
    // Fast path: a single token is returned unchanged.
    case Seq(only) => only
    case several => createCoalescedStringToken(several)
  }
}

/**
 * Converts a `stringLitWithoutMarker` parse node into a single token.
 *
 * The string literal terminals under the node are gathered with `collectStringTerminals`.
 * One terminal yields its original token unchanged; several terminals are merged into one
 * token by `createCoalescedStringToken`.
 *
 * @param ctx
 *   The stringLitWithoutMarker context to process; may be null.
 * @return
 *   The (possibly coalesced) token, or null when ctx is null or holds no literals.
 */
override def visitStringLitWithoutMarker(ctx: StringLitWithoutMarkerContext): Token = {
  // A null ctx is treated the same as a context without any literals.
  val symbols: Seq[Token] =
    Option(ctx).map(collectStringTerminals(_).map(_.getSymbol)).getOrElse(Seq.empty)

  symbols match {
    case Seq() => null
    // Fast path: a single literal keeps its original token.
    case Seq(only) => only
    case several => createCoalescedStringToken(several)
  }
}

/**
 * Handles the STRING_LITERAL alternative of singleStringLitWithoutMarker.
 *
 * @param ctx
 *   The context wrapping a single STRING_LITERAL terminal.
 * @return
 *   The token of that terminal; no coalescing takes place.
 */
override def visitSingleStringLiteralValue(ctx: SingleStringLiteralValueContext): Token =
  ctx.STRING_LITERAL.getSymbol

/**
 * Handles the DOUBLEQUOTED_STRING alternative of singleStringLitWithoutMarker.
 *
 * @param ctx
 *   The context wrapping a single DOUBLEQUOTED_STRING terminal.
 * @return
 *   The token of that terminal; no coalescing takes place.
 */
override def visitSingleDoubleQuotedStringLiteralValue(
    ctx: SingleDoubleQuotedStringLiteralValueContext): Token =
  ctx.DOUBLEQUOTED_STRING.getSymbol

/**
 * Extracts the INTEGER_VALUE token from an integerVal alternative.
 *
 * @param ctx
 *   The integerVal context to process; may be null.
 * @return
 *   The token of the INTEGER_VALUE terminal, or null when ctx is null.
 */
override def visitIntegerVal(ctx: IntegerValContext): Token = ctx match {
  case null => null
  case present => present.INTEGER_VALUE.getSymbol
}

override def visitStringLiteralInContext(ctx: StringLiteralInContextContext): Token = {
visit(ctx.stringLitWithoutMarker).asInstanceOf[Token]
/**
 * Collects the string literal terminals below a stringLitWithoutMarker context, in the order
 * in which the `singleStringLitWithoutMarker` children are listed.
 *
 * Each `singleStringLitWithoutMarker` alternative consists of exactly one terminal
 * (STRING_LITERAL or DOUBLEQUOTED_STRING), so the terminal is the child at index 0.
 *
 * @param ctx
 *   The stringLitWithoutMarker context to process.
 * @return
 *   The terminal nodes of the string literals.
 */
private def collectStringTerminals(
    ctx: StringLitWithoutMarkerContext): Seq[org.antlr.v4.runtime.tree.TerminalNode] =
  // The terminal is read straight from the parse tree. The earlier extra `visit(child)` call
  // produced a token that was never used, so it has been dropped.
  // NOTE(review): this assumes no subclass relies on that visit for a side effect - confirm.
  ctx
    .singleStringLitWithoutMarker()
    .asScala
    .map(_.getChild(0).asInstanceOf[org.antlr.v4.runtime.tree.TerminalNode])
    .toSeq

/**
 * Creates a CoalescedStringToken from multiple string literal tokens.
 *
 * The raw content of every token (outer quotes and any R prefix removed, inner escape
 * sequences and doubled quotes left untouched) is concatenated. The new token spans from the
 * start of the first token to the stop of the last one and reuses the first token's source,
 * type and channel.
 *
 * The quote character of the result is the delimiter of the first non-raw literal, or of the
 * first literal when all of them are raw. The delimiter is read at index 0 for regular
 * literals and at index 1 (after the R prefix) for raw ones. A regular literal whose content
 * merely begins with `"`, such as `'"x'`, therefore keeps `'` as its delimiter.
 *
 * NOTE(review): as soon as one literal is raw the whole result is flagged raw, so escape
 * sequences coming from non-raw literals are presumably left unprocessed downstream -
 * confirm this is intended.
 *
 * @param tokens
 *   A sequence of string literal tokens to coalesce (must be non-empty).
 * @return
 *   A CoalescedStringToken representing the concatenated value.
 */
private def createCoalescedStringToken(tokens: Seq[Token]): Token = {
  val firstToken = tokens.head
  val lastToken = tokens.last
  val texts = tokens.map(_.getText)

  val hasRString = texts.exists(isRawStringLiteral)

  // Prefer a regular literal as the source of the quote style; fall back to the first literal.
  val quoteSource = texts.find(text => !isRawStringLiteral(text)).getOrElse(texts.head)
  val quoteIndex = if (isRawStringLiteral(quoteSource)) 1 else 0
  val quoteChar =
    if (quoteSource.length > quoteIndex && quoteSource.charAt(quoteIndex) == '"') '"' else '\''

  // Strip the opening delimiter (plus the R prefix for raw literals) and the closing quote.
  // Everything in between is kept verbatim for later unescaping.
  val coalescedRawContent = texts.map { text =>
    val contentStart = if (isRawStringLiteral(text)) 2 else 1
    text.substring(contentStart, text.length - 1)
  }.mkString

  new CoalescedStringToken(
    new org.antlr.v4.runtime.misc.Pair(firstToken.getTokenSource, firstToken.getInputStream),
    firstToken.getType,
    firstToken.getChannel,
    firstToken.getStartIndex,
    lastToken.getStopIndex,
    coalescedRawContent,
    hasRString,
    quoteChar)
}

/**
 * Tells whether the text of a string literal token is a raw string (R-string), i.e. starts
 * with `R` or `r` immediately followed by a single or double quote.
 */
private def isRawStringLiteral(text: String): Boolean =
  text.length >= 2 &&
    (text.charAt(0) == 'R' || text.charAt(0) == 'r') &&
    (text.charAt(1) == '\'' || text.charAt(1) == '"')

override def visitNamedParameterMarkerRule(ctx: NamedParameterMarkerRuleContext): Token = {
Expand Down Expand Up @@ -99,12 +270,6 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
ctx.getText.toInt
}

/**
* Visit a stringLit context by delegating to the appropriate labeled visitor.
*/
def visitStringLit(ctx: StringLitContext): Token =
Option(ctx).map(visit(_).asInstanceOf[Token]).orNull

/**
* Create a multi-part identifier.
*/
Expand Down Expand Up @@ -399,3 +564,54 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
(start.getOrElse(defaultStart), step.getOrElse(defaultStep))
}
}

/**
 * A synthetic token that stands for several consecutive string literals merged into one.
 *
 * The token carries the concatenated raw content of the literals (outer quotes removed,
 * escape sequences not processed) and re-creates a literal from it on demand: `getText`
 * wraps the content in `quoteChar` and, for raw strings, prefixes it with `R`.
 *
 * @param source
 *   The token source and input stream
 * @param tokenType
 *   The ANTLR token type
 * @param channel
 *   The token channel
 * @param start
 *   The start index of the first literal in the input stream
 * @param stop
 *   The stop index of the last literal in the input stream
 * @param coalescedValue
 *   The raw concatenated content (without outer quotes, escape sequences NOT processed)
 * @param isRawString
 *   Whether `getText` renders the token as an R-string; defaults to false
 * @param quoteChar
 *   The quote character `getText` puts around the content; defaults to a single quote
 */
private[parser] class CoalescedStringToken(
    source: Pair[TokenSource, CharStream],
    tokenType: Int,
    channel: Int,
    start: Int,
    stop: Int,
    private val coalescedValue: String,
    private val isRawString: Boolean = false,
    private val quoteChar: Char = '\'')
    extends CommonToken(source, tokenType, channel, start, stop) {

  /** Renders the coalesced content as one literal: an optional `R` prefix plus quoted content. */
  override def getText: String = {
    // The R prefix is what marks the literal as raw for whoever unescapes the text later.
    val rawPrefix = if (isRawString) "R" else ""
    s"$rawPrefix$quoteChar$coalescedValue$quoteChar"
  }

  /** Same as `getText`, so the token prints as the literal it represents. */
  override def toString: String = getText
}
Loading