Commit 35d286b

beliefer authored and HyukjinKwon committed
[SPARK-31228][DSTREAMS] Add version information to the configuration of Kafka
### What changes were proposed in this pull request?

Add version information to the configuration of Kafka. I sorted out the information shown below.

Item name | Since version | JIRA ID | Commit ID | Note
-- | -- | -- | -- | --
spark.streaming.kafka.consumer.cache.enabled | 2.2.1 | SPARK-19185 | 02cf178#diff-c465bbcc83b2ecc7530d1c0128e4432b |
spark.streaming.kafka.consumer.poll.ms | 2.0.1 | SPARK-12177 | 3134f11#diff-4597d93a0e951f7199697dba7dd0dc32 |
spark.streaming.kafka.consumer.cache.initialCapacity | 2.0.1 | SPARK-12177 | 3134f11#diff-4597d93a0e951f7199697dba7dd0dc32 |
spark.streaming.kafka.consumer.cache.maxCapacity | 2.0.1 | SPARK-12177 | 3134f11#diff-4597d93a0e951f7199697dba7dd0dc32 |
spark.streaming.kafka.consumer.cache.loadFactor | 2.0.1 | SPARK-12177 | 3134f11#diff-4597d93a0e951f7199697dba7dd0dc32 |
spark.streaming.kafka.maxRatePerPartition | 1.3.0 | SPARK-4964 | a119cae#diff-26cb4369f86050dc2e75cd16291b2844 |
spark.streaming.kafka.minRatePerPartition | 2.4.0 | SPARK-25233 | 135ff16#diff-815f6ec5caf9e4beb355f5f981171f1f |
spark.streaming.kafka.allowNonConsecutiveOffsets | 2.3.1 | SPARK-24067 | 1d598b7#diff-4597d93a0e951f7199697dba7dd0dc32 |
spark.kafka.producer.cache.timeout | 2.2.1 | SPARK-19968 | f6730a7#diff-ac8844e8d791a75aaee3d0d10bfc1f2a |
spark.kafka.producer.cache.evictorThreadRunInterval | 3.0.0 | SPARK-21869 | 7bff2db#diff-ea8349d528fe8d1b0a8ffa2840ff4bcd |
spark.kafka.consumer.cache.capacity | 3.0.0 | SPARK-27687 | efa3035#diff-ea8349d528fe8d1b0a8ffa2840ff4bcd |
spark.kafka.consumer.cache.jmx.enable | 3.0.0 | SPARK-25151 | 594c9c5#diff-ea8349d528fe8d1b0a8ffa2840ff4bcd |
spark.kafka.consumer.cache.timeout | 3.0.0 | SPARK-25151 | 594c9c5#diff-ea8349d528fe8d1b0a8ffa2840ff4bcd |
spark.kafka.consumer.cache.evictorThreadRunInterval | 3.0.0 | SPARK-25151 | 594c9c5#diff-ea8349d528fe8d1b0a8ffa2840ff4bcd |
spark.kafka.consumer.fetchedData.cache.timeout | 3.0.0 | SPARK-25151 | 594c9c5#diff-ea8349d528fe8d1b0a8ffa2840ff4bcd |
spark.kafka.consumer.fetchedData.cache.evictorThreadRunInterval | 3.0.0 | SPARK-25151 | 594c9c5#diff-ea8349d528fe8d1b0a8ffa2840ff4bcd |
spark.kafka.clusters.${cluster}.auth.bootstrap.servers | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |
spark.kafka.clusters.${cluster}.target.bootstrap.servers.regex | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |
spark.kafka.clusters.${cluster}.security.protocol | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |
spark.kafka.clusters.${cluster}.sasl.kerberos.service.name | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |
spark.kafka.clusters.${cluster}.ssl.truststore.location | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |
spark.kafka.clusters.${cluster}.ssl.truststore.password | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |
spark.kafka.clusters.${cluster}.ssl.keystore.location | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |
spark.kafka.clusters.${cluster}.ssl.keystore.password | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |
spark.kafka.clusters.${cluster}.ssl.key.password | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |
spark.kafka.clusters.${cluster}.sasl.token.mechanism | 3.0.0 | SPARK-27294 | 2f55809#diff-7df71bd47f5a3428ebdb05ced3c31f49 |

### Why are the changes needed?

To supplement the configurations with the version in which each was introduced.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Existing UTs.

Closes apache#27989 from beliefer/add-version-to-kafka-config.

Authored-by: beliefer <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
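For readers skimming the diffs below, the change is mechanical: each Spark config entry gains a `.version(...)` call. A condensed sketch of the pattern, mirroring the kafka-0-10-sql diff in this commit (`ConfigBuilder` is Spark-internal API; the import path shown is an assumption based on Spark's internal config package):

```scala
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import org.apache.spark.internal.config.ConfigBuilder

// Condensed sketch of the pattern this PR applies: every ConfigBuilder
// entry records the release in which the config first shipped.
package object kafka010 {

  private[kafka010] val PRODUCER_CACHE_TIMEOUT =
    ConfigBuilder("spark.kafka.producer.cache.timeout")
      .doc("The expire time to remove the unused producers.")
      .version("2.2.1") // traced back to SPARK-19968 per the table above
      .timeConf(TimeUnit.MILLISECONDS)
      .createWithDefaultString("10m")
}
```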
1 parent: 8b798c1 | commit: 35d286b

File tree

4 files changed: +54 additions, -18 deletions

docs/configuration.md

Lines changed: 2 additions & 0 deletions

@@ -2661,6 +2661,7 @@ Spark subsystems.
     <a href="streaming-kafka-0-10-integration.html">Kafka Integration guide</a>
     for more details.
   </td>
+  <td>1.3.0</td>
 </tr>
 <tr>
   <td><code>spark.streaming.kafka.minRatePerPartition</code></td>
@@ -2669,6 +2670,7 @@ Spark subsystems.
     Minimum rate (number of records per second) at which data will be read from each Kafka
     partition when using the new Kafka direct stream API.
   </td>
+  <td>2.4.0</td>
 </tr>
 <tr>
   <td><code>spark.streaming.ui.retainedBatches</code></td>

docs/structured-streaming-kafka-integration.md

Lines changed: 22 additions & 4 deletions

@@ -525,28 +525,32 @@ The caching key is built up from the following information:
 The following properties are available to configure the consumer pool:
 
 <table class="table">
-<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr>
 <tr>
   <td>spark.kafka.consumer.cache.capacity</td>
   <td>The maximum number of consumers cached. Please note that it's a soft limit.</td>
   <td>64</td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td>spark.kafka.consumer.cache.timeout</td>
   <td>The minimum amount of time a consumer may sit idle in the pool before it is eligible for eviction by the evictor.</td>
   <td>5m (5 minutes)</td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td>spark.kafka.consumer.cache.evictorThreadRunInterval</td>
   <td>The interval of time between runs of the idle evictor thread for consumer pool. When non-positive, no idle evictor thread will be run.</td>
   <td>1m (1 minute)</td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td>spark.kafka.consumer.cache.jmx.enable</td>
   <td>Enable or disable JMX for pools created with this configuration instance. Statistics of the pool are available via JMX instance.
   The prefix of JMX name is set to "kafka010-cached-simple-kafka-consumer-pool".
   </td>
   <td>false</td>
+  <td>3.0.0</td>
 </tr>
 </table>
 
@@ -571,16 +575,18 @@ Note that it doesn't leverage Apache Commons Pool due to the difference of chara
 The following properties are available to configure the fetched data pool:
 
 <table class="table">
-<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr>
 <tr>
   <td>spark.kafka.consumer.fetchedData.cache.timeout</td>
   <td>The minimum amount of time a fetched data may sit idle in the pool before it is eligible for eviction by the evictor.</td>
   <td>5m (5 minutes)</td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td>spark.kafka.consumer.fetchedData.cache.evictorThreadRunInterval</td>
   <td>The interval of time between runs of the idle evictor thread for fetched data pool. When non-positive, no idle evictor thread will be run.</td>
   <td>1m (1 minute)</td>
+  <td>3.0.0</td>
 </tr>
 </table>
 
@@ -816,16 +822,18 @@ It will use different Kafka producer when delegation token is renewed; Kafka pro
 The following properties are available to configure the producer pool:
 
 <table class="table">
-<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr>
 <tr>
   <td>spark.kafka.producer.cache.timeout</td>
   <td>The minimum amount of time a producer may sit idle in the pool before it is eligible for eviction by the evictor.</td>
   <td>10m (10 minutes)</td>
+  <td>2.2.1</td>
 </tr>
 <tr>
   <td>spark.kafka.producer.cache.evictorThreadRunInterval</td>
   <td>The interval of time between runs of the idle evictor thread for producer pool. When non-positive, no idle evictor thread will be run.</td>
   <td>1m (1 minute)</td>
+  <td>3.0.0</td>
 </tr>
 </table>
 
@@ -935,14 +943,15 @@ When none of the above applies then unsecure connection assumed.
 Delegation tokens can be obtained from multiple clusters and <code>${cluster}</code> is an arbitrary unique identifier which helps to group different configurations.
 
 <table class="table">
-<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.auth.bootstrap.servers</code></td>
   <td>None</td>
   <td>
     A list of coma separated host/port pairs to use for establishing the initial connection
     to the Kafka cluster. For further details please see Kafka documentation. Only used to obtain delegation token.
   </td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.target.bootstrap.servers.regex</code></td>
@@ -953,6 +962,7 @@ Delegation tokens can be obtained from multiple clusters and <code>${cluster}</c
     If multiple clusters match the address, an exception will be thrown and the query won't be started.
     Kafka's secure and unsecure listeners are bound to different ports. When both used the secure listener port has to be part of the regular expression.
   </td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.security.protocol</code></td>
@@ -962,6 +972,7 @@ Delegation tokens can be obtained from multiple clusters and <code>${cluster}</c
     <code>bootstrap.servers</code> config matches (for further details please see <code>spark.kafka.clusters.${cluster}.target.bootstrap.servers.regex</code>),
     and can be overridden by setting <code>kafka.security.protocol</code> on the source or sink.
   </td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.sasl.kerberos.service.name</code></td>
@@ -970,13 +981,15 @@ Delegation tokens can be obtained from multiple clusters and <code>${cluster}</c
     The Kerberos principal name that Kafka runs as. This can be defined either in Kafka's JAAS config or in Kafka's config.
     For further details please see Kafka documentation. Only used to obtain delegation token.
   </td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.ssl.truststore.location</code></td>
   <td>None</td>
   <td>
     The location of the trust store file. For further details please see Kafka documentation. Only used to obtain delegation token.
   </td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.ssl.truststore.password</code></td>
@@ -985,6 +998,7 @@ Delegation tokens can be obtained from multiple clusters and <code>${cluster}</c
     The store password for the trust store file. This is optional and only needed if <code>spark.kafka.clusters.${cluster}.ssl.truststore.location</code> is configured.
     For further details please see Kafka documentation. Only used to obtain delegation token.
   </td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.ssl.keystore.location</code></td>
@@ -993,6 +1007,7 @@ Delegation tokens can be obtained from multiple clusters and <code>${cluster}</c
     The location of the key store file. This is optional for client and can be used for two-way authentication for client.
     For further details please see Kafka documentation. Only used to obtain delegation token.
   </td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.ssl.keystore.password</code></td>
@@ -1001,6 +1016,7 @@ Delegation tokens can be obtained from multiple clusters and <code>${cluster}</c
     The store password for the key store file. This is optional and only needed if <code>spark.kafka.clusters.${cluster}.ssl.keystore.location</code> is configured.
     For further details please see Kafka documentation. Only used to obtain delegation token.
   </td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.ssl.key.password</code></td>
@@ -1009,6 +1025,7 @@ Delegation tokens can be obtained from multiple clusters and <code>${cluster}</c
     The password of the private key in the key store file. This is optional for client.
     For further details please see Kafka documentation. Only used to obtain delegation token.
   </td>
+  <td>3.0.0</td>
 </tr>
 <tr>
   <td><code>spark.kafka.clusters.${cluster}.sasl.token.mechanism</code></td>
@@ -1017,6 +1034,7 @@ Delegation tokens can be obtained from multiple clusters and <code>${cluster}</c
     SASL mechanism used for client connections with delegation token. Because SCRAM login module used for authentication a compatible mechanism has to be set here.
     For further details please see Kafka documentation (<code>sasl.mechanism</code>). Only used to authenticate against Kafka broker with delegation token.
   </td>
+  <td>3.0.0</td>
 </tr>
 </table>

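For orientation (not part of the diff): the pool properties documented above are ordinary Spark configuration entries, so a session-level override could look like the sketch below. The keys come from the tables in this file; the application name and values are invented for illustration.

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical tuning example: the keys are the pool configs documented
// above, while the app name and values are invented for illustration.
val spark = SparkSession.builder()
  .appName("kafka-pool-tuning")
  .config("spark.kafka.consumer.cache.capacity", "128")    // default: 64
  .config("spark.kafka.consumer.cache.timeout", "10m")     // default: 5m
  .config("spark.kafka.consumer.cache.jmx.enable", "true") // default: false
  .getOrCreate()
```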
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/package.scala

Lines changed: 8 additions & 0 deletions

@@ -29,26 +29,30 @@ package object kafka010 { // scalastyle:ignore
   private[kafka010] val PRODUCER_CACHE_TIMEOUT =
     ConfigBuilder("spark.kafka.producer.cache.timeout")
       .doc("The expire time to remove the unused producers.")
+      .version("2.2.1")
       .timeConf(TimeUnit.MILLISECONDS)
       .createWithDefaultString("10m")
 
   private[kafka010] val PRODUCER_CACHE_EVICTOR_THREAD_RUN_INTERVAL =
     ConfigBuilder("spark.kafka.producer.cache.evictorThreadRunInterval")
       .doc("The interval of time between runs of the idle evictor thread for producer pool. " +
         "When non-positive, no idle evictor thread will be run.")
+      .version("3.0.0")
       .timeConf(TimeUnit.MILLISECONDS)
       .createWithDefaultString("1m")
 
   private[kafka010] val CONSUMER_CACHE_CAPACITY =
     ConfigBuilder("spark.kafka.consumer.cache.capacity")
       .doc("The maximum number of consumers cached. Please note it's a soft limit" +
         " (check Structured Streaming Kafka integration guide for further details).")
+      .version("3.0.0")
       .intConf
       .createWithDefault(64)
 
   private[kafka010] val CONSUMER_CACHE_JMX_ENABLED =
     ConfigBuilder("spark.kafka.consumer.cache.jmx.enable")
       .doc("Enable or disable JMX for pools created with this configuration instance.")
+      .version("3.0.0")
       .booleanConf
       .createWithDefault(false)
 
@@ -57,13 +61,15 @@ package object kafka010 { // scalastyle:ignore
       .doc("The minimum amount of time a consumer may sit idle in the pool before " +
         "it is eligible for eviction by the evictor. " +
         "When non-positive, no consumers will be evicted from the pool due to idle time alone.")
+      .version("3.0.0")
       .timeConf(TimeUnit.MILLISECONDS)
       .createWithDefaultString("5m")
 
   private[kafka010] val CONSUMER_CACHE_EVICTOR_THREAD_RUN_INTERVAL =
     ConfigBuilder("spark.kafka.consumer.cache.evictorThreadRunInterval")
       .doc("The interval of time between runs of the idle evictor thread for consumer pool. " +
         "When non-positive, no idle evictor thread will be run.")
+      .version("3.0.0")
       .timeConf(TimeUnit.MILLISECONDS)
       .createWithDefaultString("1m")
 
@@ -72,13 +78,15 @@ package object kafka010 { // scalastyle:ignore
       .doc("The minimum amount of time a fetched data may sit idle in the pool before " +
         "it is eligible for eviction by the evictor. " +
         "When non-positive, no fetched data will be evicted from the pool due to idle time alone.")
+      .version("3.0.0")
       .timeConf(TimeUnit.MILLISECONDS)
       .createWithDefaultString("5m")
 
   private[kafka010] val FETCHED_DATA_CACHE_EVICTOR_THREAD_RUN_INTERVAL =
     ConfigBuilder("spark.kafka.consumer.fetchedData.cache.evictorThreadRunInterval")
       .doc("The interval of time between runs of the idle evictor thread for fetched data pool. " +
         "When non-positive, no idle evictor thread will be run.")
+      .version("3.0.0")
       .timeConf(TimeUnit.MILLISECONDS)
       .createWithDefaultString("1m")
 }

external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/package.scala

Lines changed: 22 additions & 14 deletions

@@ -26,43 +26,51 @@ package object kafka010 { //scalastyle:ignore
 
   private[spark] val CONSUMER_CACHE_ENABLED =
     ConfigBuilder("spark.streaming.kafka.consumer.cache.enabled")
+      .version("2.2.1")
       .booleanConf
       .createWithDefault(true)
 
   private[spark] val CONSUMER_POLL_MS =
     ConfigBuilder("spark.streaming.kafka.consumer.poll.ms")
-    .longConf
-    .createOptional
+      .version("2.0.1")
+      .longConf
+      .createOptional
 
   private[spark] val CONSUMER_CACHE_INITIAL_CAPACITY =
     ConfigBuilder("spark.streaming.kafka.consumer.cache.initialCapacity")
-    .intConf
-    .createWithDefault(16)
+      .version("2.0.1")
+      .intConf
+      .createWithDefault(16)
 
   private[spark] val CONSUMER_CACHE_MAX_CAPACITY =
     ConfigBuilder("spark.streaming.kafka.consumer.cache.maxCapacity")
-    .intConf
-    .createWithDefault(64)
+      .version("2.0.1")
+      .intConf
+      .createWithDefault(64)
 
   private[spark] val CONSUMER_CACHE_LOAD_FACTOR =
     ConfigBuilder("spark.streaming.kafka.consumer.cache.loadFactor")
-    .doubleConf
-    .createWithDefault(0.75)
+      .version("2.0.1")
+      .doubleConf
+      .createWithDefault(0.75)
 
   private[spark] val MAX_RATE_PER_PARTITION =
     ConfigBuilder("spark.streaming.kafka.maxRatePerPartition")
-    .longConf
-    .createWithDefault(0)
+      .version("1.3.0")
+      .longConf
+      .createWithDefault(0)
 
   private[spark] val MIN_RATE_PER_PARTITION =
     ConfigBuilder("spark.streaming.kafka.minRatePerPartition")
-    .longConf
-    .createWithDefault(1)
+      .version("2.4.0")
+      .longConf
+      .createWithDefault(1)
 
   private[spark] val ALLOW_NON_CONSECUTIVE_OFFSETS =
     ConfigBuilder("spark.streaming.kafka.allowNonConsecutiveOffsets")
-    .booleanConf
-    .createWithDefault(false)
+      .version("2.3.1")
+      .booleanConf
+      .createWithDefault(false)
 
 }

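Again for orientation (not part of the diff): the `spark.streaming.kafka.*` entries above are set like any other Spark property. A minimal sketch, with an invented application name and values:

```scala
import org.apache.spark.SparkConf

// Hypothetical example: the spark.streaming.kafka.* configs above are plain
// Spark properties; the values below are invented for illustration.
val conf = new SparkConf()
  .setAppName("dstream-kafka-rate-limits")
  .set("spark.streaming.kafka.maxRatePerPartition", "1000") // default: 0
  .set("spark.streaming.kafka.minRatePerPartition", "10")   // default: 1
```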