Skip to content

Commit 995896c

Browse files
dlange-aai and AssemblyAI authored
chore: sync sdk code with DeepLearning repo (#200)
Co-authored-by: AssemblyAI <engineering.sdk@assemblyai.com>
1 parent d257f92 commit 995896c

5 files changed

Lines changed: 106 additions & 1 deletion

File tree

assemblyai/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.64.3"
1+
__version__ = "0.64.4"

assemblyai/streaming/v3/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
EventMessage,
77
LLMGatewayResponseEvent,
88
NoiseSuppressionModel,
9+
SpeakerRevisionEvent,
910
SpeechModel,
1011
SpeechStartedEvent,
1112
StreamingClientOptions,
@@ -29,6 +30,7 @@
2930
"EventMessage",
3031
"LLMGatewayResponseEvent",
3132
"NoiseSuppressionModel",
33+
"SpeakerRevisionEvent",
3234
"SpeechModel",
3335
"SpeechStartedEvent",
3436
"StreamingClient",

assemblyai/streaming/v3/_base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
ErrorEvent,
3434
EventMessage,
3535
LLMGatewayResponseEvent,
36+
SpeakerRevisionEvent,
3637
SpeechStartedEvent,
3738
StreamingClientOptions,
3839
StreamingError,
@@ -233,6 +234,8 @@ def _parse_message(cls, data: Dict[str, Any]) -> Optional[EventMessage]:
233234
return _parse_model(SpeechStartedEvent, data)
234235
elif event_type == StreamingEvents.LLMGatewayResponse:
235236
return _parse_model(LLMGatewayResponseEvent, data)
237+
elif event_type == StreamingEvents.SpeakerRevision:
238+
return _parse_model(SpeakerRevisionEvent, data)
236239
elif event_type == StreamingEvents.Error:
237240
return _parse_model(ErrorEvent, data)
238241
elif event_type == StreamingEvents.Warning:

assemblyai/streaming/v3/models.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,21 @@ class LLMGatewayResponseEvent(BaseModel):
7474
data: Any
7575

7676

77+
class SpeakerRevisionEvent(BaseModel):
    """Revised speaker labels for a Turn the server has already emitted.

    The server sends this once offline reclustering has refined the
    tentative labels that were assigned live. Locate the original Turn via
    `turn_order`, then overwrite its per-word speaker assignments and its
    turn-level `speaker_label` with the values carried here. The text and
    word timestamps are the same as in the original Turn.
    """

    # Message type tag; always the literal "SpeakerRevision".
    type: Literal["SpeakerRevision"] = "SpeakerRevision"
    # Key used to match this revision against the original Turn.
    turn_order: int
    # Corrected turn-level speaker label; defaults to None when not provided.
    speaker_label: Optional[str] = None
    # Words carrying the corrected per-word speaker assignments.
    words: List[Word] = []
90+
91+
7792
EventMessage = Union[
7893
BeginEvent,
7994
TerminationEvent,
@@ -82,6 +97,7 @@ class LLMGatewayResponseEvent(BaseModel):
8297
ErrorEvent,
8398
WarningEvent,
8499
LLMGatewayResponseEvent,
100+
SpeakerRevisionEvent,
85101
]
86102

87103

@@ -290,3 +306,4 @@ class StreamingEvents(Enum):
290306
Error = "Error"
291307
Warning = "Warning"
292308
LLMGatewayResponse = "LLMGatewayResponse"
309+
SpeakerRevision = "SpeakerRevision"

tests/unit/test_streaming.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from assemblyai.streaming.v3 import (
1515
NoiseSuppressionModel,
16+
SpeakerRevisionEvent,
1617
SpeechModel,
1718
SpeechStartedEvent,
1819
StreamingClient,
@@ -911,6 +912,88 @@ def test_turn_event_with_word_speakers():
911912
assert [w.speaker for w in event.words] == ["A", "B"]
912913

913914

915+
def test_speaker_revision_event_parses():
    """A raw SpeakerRevision payload parses into a SpeakerRevisionEvent."""
    # Given: a SpeakerRevision payload as emitted by the server. Revision
    # words follow the same Word schema as Turn
    # (start/end/confidence/text/word_is_final/speaker).
    first_word = {
        "start": 1000,
        "end": 1200,
        "confidence": 0.9,
        "text": "hello",
        "word_is_final": True,
        "speaker": "B",
    }
    second_word = {
        "start": 1210,
        "end": 1400,
        "confidence": 0.88,
        "text": "world",
        "word_is_final": True,
        "speaker": "A",
    }
    payload = {
        "type": "SpeakerRevision",
        "turn_order": 3,
        "speaker_label": "B",
        "words": [first_word, second_word],
    }

    # When: the payload is parsed into the model
    revision = SpeakerRevisionEvent.parse_obj(payload)

    # Then: the revision exposes the corrected turn-level and per-word speakers
    assert revision.type == "SpeakerRevision"
    assert revision.turn_order == 3
    assert revision.speaker_label == "B"
    word_speakers = [word.speaker for word in revision.words]
    assert word_speakers == ["B", "A"]
950+
951+
952+
def test_speaker_revision_event_dispatched_to_handler(mocker: MockFixture):
    """A SpeakerRevision frame read off the socket reaches the registered handler."""
    # Given: a SpeakerRevision frame on the wire and a handler registered
    payload = {
        "type": "SpeakerRevision",
        "turn_order": 5,
        "speaker_label": "A",
        "words": [
            {
                "start": 500,
                "end": 700,
                "confidence": 0.95,
                "text": "yes",
                "word_is_final": True,
                "speaker": "A",
            },
        ],
    }
    revision_json = json.dumps(payload)
    fake_ws = _FakeWebSocket(recv_script=[revision_json])
    mocker.patch(
        "assemblyai.streaming.v3.client.websocket_connect",
        return_value=fake_ws,
    )

    received = []

    def _collect(_client, event):
        # Record every event delivered to the handler for later inspection.
        received.append(event)

    options = StreamingClientOptions(api_key="test", api_host="api.example.com")
    client = StreamingClient(options)
    client.on(StreamingEvents.SpeakerRevision, _collect)

    # When: the client reads the frame
    client.connect(_default_params())
    # Poll for up to two seconds so the test ends as soon as the handler
    # has fired instead of always waiting for the full timeout.
    deadline = time.monotonic() + 2.0
    while True:
        if time.monotonic() >= deadline or received:
            break
        time.sleep(0.02)
    client.disconnect(terminate=False)

    # Then: the handler is invoked with a parsed SpeakerRevisionEvent
    assert len(received) == 1
    revision = received[0]
    assert isinstance(revision, SpeakerRevisionEvent)
    assert revision.turn_order == 5
    assert revision.speaker_label == "A"
    assert [word.speaker for word in revision.words] == ["A"]
995+
996+
914997
def test_speech_model_required():
915998
"""Test that omitting speech_model raises a validation error."""
916999
with pytest.raises(ValidationError):

0 commit comments

Comments
 (0)