21
21
from copy import deepcopy
22
22
from io import BytesIO
23
23
from json import loads
24
- from struct import pack , unpack
25
24
from typing import Dict , Union , Optional , Set , Callable
26
25
27
26
from fastavro import (schemaless_reader ,
30
29
validate )
31
30
from fastavro .schema import load_schema
32
31
33
- from . import (_MAGIC_BYTE ,
34
- Schema ,
32
+ from . import (Schema ,
35
33
topic_subject_name_strategy ,
36
34
RuleMode ,
37
- RuleKind , SchemaRegistryClient )
38
- from confluent_kafka . serialization import ( SerializationError ,
39
- SerializationContext )
35
+ RuleKind , SchemaRegistryClient , prefix_schema_id_serializer ,
36
+ dual_schema_id_deserializer )
37
+ from confluent_kafka . serialization import ( SerializationContext )
40
38
from .rule_registry import RuleRegistry
41
39
from .serde import BaseSerializer , BaseDeserializer , RuleContext , FieldType , \
42
- FieldTransform , RuleConditionError , ParsedSchemaCache
40
+ FieldTransform , RuleConditionError , ParsedSchemaCache , SchemaId
41
+
42
+
43
+ AVRO_TYPE = "AVRO"
43
44
44
45
45
46
AvroMessage = Union [
@@ -164,6 +165,12 @@ class AvroSerializer(BaseSerializer):
164
165
| | | |
165
166
| | | Defaults to topic_subject_name_strategy. |
166
167
+-----------------------------+----------+--------------------------------------------------+
168
+ | | | Callable(bytes, SerializationContext, schema_id) |
169
+ | | | -> bytes |
170
+ | | | |
171
+ | ``schema.id.serializer`` | callable | Defines how the schema id/guid is serialized. |
172
+ | | | Defaults to prefix_schema_id_serializer. |
173
+ +-----------------------------+----------+--------------------------------------------------+
167
174
168
175
Schemas are registered against subject names in Confluent Schema Registry that
169
176
define a scope in which the schemas can be evolved. By default, the subject name
@@ -223,7 +230,8 @@ class AvroSerializer(BaseSerializer):
223
230
'use.schema.id' : None ,
224
231
'use.latest.version' : False ,
225
232
'use.latest.with.metadata' : None ,
226
- 'subject.name.strategy' : topic_subject_name_strategy }
233
+ 'subject.name.strategy' : topic_subject_name_strategy ,
234
+ 'schema.id.serializer' : prefix_schema_id_serializer }
227
235
228
236
def __init__ (
229
237
self ,
@@ -286,6 +294,10 @@ def __init__(
286
294
if not callable (self ._subject_name_func ):
287
295
raise ValueError ("subject.name.strategy must be callable" )
288
296
297
+ self ._schema_id_serializer = conf_copy .pop ('schema.id.serializer' )
298
+ if not callable (self ._schema_id_serializer ):
299
+ raise ValueError ("schema.id.serializer must be callable" )
300
+
289
301
if len (conf_copy ) > 0 :
290
302
raise ValueError ("Unrecognized properties: {}"
291
303
.format (", " .join (conf_copy .keys ())))
@@ -345,19 +357,20 @@ def __call__(self, obj: object, ctx: Optional[SerializationContext] = None) -> O
345
357
subject = self ._subject_name_func (ctx , self ._schema_name )
346
358
latest_schema = self ._get_reader_schema (subject )
347
359
if latest_schema is not None :
348
- self ._schema_id = latest_schema .schema_id
360
+ self ._schema_id = SchemaId ( AVRO_TYPE , latest_schema .schema_id , latest_schema . guid )
349
361
elif subject not in self ._known_subjects :
350
362
# Check to ensure this schema has been registered under subject_name.
351
363
if self ._auto_register :
352
364
# The schema name will always be the same. We can't, however, register
353
365
# a schema without a subject so we set the schema_id here to handle
354
366
# the initial registration.
355
- self . _schema_id = self ._registry .register_schema (
367
+ registered_schema = self ._registry .register_schema_full_response (
356
368
subject , self ._schema , self ._normalize_schemas )
369
+ self ._schema_id = SchemaId (AVRO_TYPE , registered_schema .schema_id , registered_schema .guid )
357
370
else :
358
371
registered_schema = self ._registry .lookup_schema (
359
372
subject , self ._schema , self ._normalize_schemas )
360
- self ._schema_id = registered_schema .schema_id
373
+ self ._schema_id = SchemaId ( AVRO_TYPE , registered_schema .schema_id , registered_schema . guid )
361
374
362
375
self ._known_subjects .add (subject )
363
376
@@ -377,12 +390,9 @@ def __call__(self, obj: object, ctx: Optional[SerializationContext] = None) -> O
377
390
parsed_schema = self ._parsed_schema
378
391
379
392
with _ContextStringIO () as fo :
380
- # Write the magic byte and schema ID in network byte order (big endian)
381
- fo .write (pack ('>bI' , _MAGIC_BYTE , self ._schema_id ))
382
393
# write the record to the rest of the buffer
383
394
schemaless_writer (fo , parsed_schema , value )
384
-
385
- return fo .getvalue ()
395
+ return self ._schema_id_serializer (fo .getvalue (), ctx , self ._schema_id )
386
396
387
397
def _get_parsed_schema (self , schema : Schema ) -> AvroSchema :
388
398
parsed_schema = self ._parsed_schemas .get_parsed_schema (schema )
@@ -425,6 +435,12 @@ class AvroDeserializer(BaseDeserializer):
425
435
| | | |
426
436
| | | Defaults to topic_subject_name_strategy. |
427
437
+-----------------------------+----------+--------------------------------------------------+
438
+ | | | Callable(bytes, SerializationContext, schema_id) |
439
+ | | | -> io.BytesIO |
440
+ | | | |
441
+ | ``schema.id.deserializer`` | callable | Defines how the schema id/guid is deserialized. |
442
+ | | | Defaults to dual_schema_id_deserializer. |
443
+ +-----------------------------+----------+--------------------------------------------------+
428
444
429
445
Note:
430
446
By default, Avro complex types are returned as dicts. This behavior can
@@ -462,7 +478,8 @@ class AvroDeserializer(BaseDeserializer):
462
478
463
479
_default_conf = {'use.latest.version' : False ,
464
480
'use.latest.with.metadata' : None ,
465
- 'subject.name.strategy' : topic_subject_name_strategy }
481
+ 'subject.name.strategy' : topic_subject_name_strategy ,
482
+ 'schema.id.deserializer' : dual_schema_id_deserializer }
466
483
467
484
def __init__ (
468
485
self ,
@@ -507,6 +524,10 @@ def __init__(
507
524
if not callable (self ._subject_name_func ):
508
525
raise ValueError ("subject.name.strategy must be callable" )
509
526
527
+ self ._schema_id_deserializer = conf_copy .pop ('schema.id.deserializer' )
528
+ if not callable (self ._schema_id_deserializer ):
529
+ raise ValueError ("schema.id.deserializer must be callable" )
530
+
510
531
if len (conf_copy ) > 0 :
511
532
raise ValueError ("Unrecognized properties: {}"
512
533
.format (", " .join (conf_copy .keys ())))
@@ -551,67 +572,57 @@ def __call__(self, data: bytes, ctx: Optional[SerializationContext] = None) -> U
551
572
if data is None :
552
573
return None
553
574
554
- if len (data ) <= 5 :
555
- raise SerializationError ("Expecting data framing of length 6 bytes or "
556
- "more but total data size is {} bytes. This "
557
- "message was not produced with a Confluent "
558
- "Schema Registry serializer" .format (len (data )))
559
-
560
575
subject = self ._subject_name_func (ctx , None ) if ctx else None
561
576
latest_schema = None
562
577
if subject is not None :
563
578
latest_schema = self ._get_reader_schema (subject )
564
579
565
- with _ContextStringIO (data ) as payload :
566
- magic , schema_id = unpack ('>bI' , payload .read (5 ))
567
- if magic != _MAGIC_BYTE :
568
- raise SerializationError ("Unexpected magic byte {}. This message "
569
- "was not produced with a Confluent "
570
- "Schema Registry serializer" .format (magic ))
571
-
572
- writer_schema_raw = self ._registry .get_schema (schema_id )
573
- writer_schema = self ._get_parsed_schema (writer_schema_raw )
574
-
575
- if subject is None :
576
- subject = self ._subject_name_func (ctx , writer_schema .get ("name" )) if ctx else None
577
- if subject is not None :
578
- latest_schema = self ._get_reader_schema (subject )
579
-
580
- if latest_schema is not None :
581
- migrations = self ._get_migrations (subject , writer_schema_raw , latest_schema , None )
582
- reader_schema_raw = latest_schema .schema
583
- reader_schema = self ._get_parsed_schema (latest_schema .schema )
584
- elif self ._schema is not None :
585
- migrations = None
586
- reader_schema_raw = self ._schema
587
- reader_schema = self ._reader_schema
588
- else :
589
- migrations = None
590
- reader_schema_raw = writer_schema_raw
591
- reader_schema = writer_schema
592
-
593
- if migrations :
594
- obj_dict = schemaless_reader (payload ,
595
- writer_schema ,
596
- None ,
597
- self ._return_record_name )
598
- obj_dict = self ._execute_migrations (ctx , subject , migrations , obj_dict )
599
- else :
600
- obj_dict = schemaless_reader (payload ,
601
- writer_schema ,
602
- reader_schema ,
603
- self ._return_record_name )
580
+ schema_id = SchemaId (AVRO_TYPE )
581
+ payload = self ._schema_id_deserializer (data , ctx , schema_id )
582
+
583
+ writer_schema_raw = self ._get_writer_schema (schema_id , subject )
584
+ writer_schema = self ._get_parsed_schema (writer_schema_raw )
585
+
586
+ if subject is None :
587
+ subject = self ._subject_name_func (ctx , writer_schema .get ("name" )) if ctx else None
588
+ if subject is not None :
589
+ latest_schema = self ._get_reader_schema (subject )
590
+
591
+ if latest_schema is not None :
592
+ migrations = self ._get_migrations (subject , writer_schema_raw , latest_schema , None )
593
+ reader_schema_raw = latest_schema .schema
594
+ reader_schema = self ._get_parsed_schema (latest_schema .schema )
595
+ elif self ._schema is not None :
596
+ migrations = None
597
+ reader_schema_raw = self ._schema
598
+ reader_schema = self ._reader_schema
599
+ else :
600
+ migrations = None
601
+ reader_schema_raw = writer_schema_raw
602
+ reader_schema = writer_schema
603
+
604
+ if migrations :
605
+ obj_dict = schemaless_reader (payload ,
606
+ writer_schema ,
607
+ None ,
608
+ self ._return_record_name )
609
+ obj_dict = self ._execute_migrations (ctx , subject , migrations , obj_dict )
610
+ else :
611
+ obj_dict = schemaless_reader (payload ,
612
+ writer_schema ,
613
+ reader_schema ,
614
+ self ._return_record_name )
604
615
605
- field_transformer = lambda rule_ctx , field_transform , message : ( # noqa: E731
606
- transform (rule_ctx , reader_schema , message , field_transform ))
607
- obj_dict = self ._execute_rules (ctx , subject , RuleMode .READ , None ,
608
- reader_schema_raw , obj_dict , get_inline_tags (reader_schema ),
609
- field_transformer )
616
+ field_transformer = lambda rule_ctx , field_transform , message : ( # noqa: E731
617
+ transform (rule_ctx , reader_schema , message , field_transform ))
618
+ obj_dict = self ._execute_rules (ctx , subject , RuleMode .READ , None ,
619
+ reader_schema_raw , obj_dict , get_inline_tags (reader_schema ),
620
+ field_transformer )
610
621
611
- if self ._from_dict is not None :
612
- return self ._from_dict (obj_dict , ctx )
622
+ if self ._from_dict is not None :
623
+ return self ._from_dict (obj_dict , ctx )
613
624
614
- return obj_dict
625
+ return obj_dict
615
626
616
627
def _get_parsed_schema (self , schema : Schema ) -> AvroSchema :
617
628
parsed_schema = self ._parsed_schemas .get_parsed_schema (schema )
0 commit comments