Skip to content
This repository was archived by the owner on Jun 30, 2022. It is now read-only.

Commit 06dc6bf

Browse files
robertwb authored and silviulica committed
Implement and use WindowedValue.with_value
This allows fewer operations to care about the internal implementation details of WindowedValue (which will get more complex over time as we add details like PaneInfo and retractions). Also, we spend a significant amount of time creating WindowedValue objects; refactoring in this way will allow us to nearly eliminate that cost with a fast Cython implementation.
----Release Notes----
[]
-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=118702433
1 parent 345228a commit 06dc6bf

File tree

6 files changed

+52
-58
lines changed

6 files changed

+52
-58
lines changed

google/cloud/dataflow/runners/common.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,7 @@ def _process_outputs(self, element, results):
145145
result.value, result.timestamp,
146146
self.window_fn.assign(assign_context))
147147
else:
148-
windowed_value = WindowedValue(
149-
result, element.timestamp, element.windows)
148+
windowed_value = element.with_value(result)
150149
if tag is None:
151150
self.main_receivers.output(windowed_value)
152151
else:

google/cloud/dataflow/transforms/core.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -972,16 +972,14 @@ def process(self, context):
972972
driver = create_trigger_driver(self.windowing, True)
973973
state = InMemoryUnmergedState()
974974
# TODO(robertwb): Conditionally process in smaller chunks.
975-
for out_window, values, timestamp in (
976-
driver.process_elements(state, vs, MIN_TIMESTAMP)):
977-
yield window.WindowedValue((k, values), timestamp, [out_window])
975+
for wvalue in driver.process_elements(state, vs, MIN_TIMESTAMP):
976+
yield wvalue.with_value((k, wvalue.value))
978977
while state.timers:
979978
fired = state.get_and_clear_timers()
980979
for timer_window, (name, time_domain, fire_time) in fired:
981-
for out_window, values, timestamp in driver.process_timer(
980+
for wvalue in driver.process_timer(
982981
timer_window, name, time_domain, fire_time, state):
983-
yield window.WindowedValue(
984-
(k, values), out_window.end, [out_window])
982+
yield wvalue.with_value((k, wvalue.value))
985983

986984
def apply(self, pcoll):
987985
# This code path is only used in the local direct runner. For Dataflow

google/cloud/dataflow/transforms/trigger.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from google.cloud.dataflow.transforms.timeutil import TimeDomain
3030
from google.cloud.dataflow.transforms.window import GlobalWindow
3131
from google.cloud.dataflow.transforms.window import OutputTimeFn
32+
from google.cloud.dataflow.transforms.window import WindowedValue
3233
from google.cloud.dataflow.transforms.window import WindowFn
3334

3435

@@ -711,6 +712,7 @@ def process_timer(self, window_id, name, time_domain, timestamp, state):
711712
class DefaultGlobalBatchTriggerDriver(TriggerDriver):
712713
"""Breaks a bundles into window (pane)s according to the default triggering.
713714
"""
715+
GLOBAL_WINDOW_TUPLE = (GlobalWindow(),)
714716

715717
def __init__(self):
716718
pass
@@ -725,7 +727,7 @@ def __iter__(self):
725727
def __repr__(self):
726728
return '<UnwindowedValues of %s>' % windowed_values
727729
unwindowed = UnwindowedValues()
728-
yield GlobalWindow(), unwindowed, MIN_TIMESTAMP
730+
yield WindowedValue(unwindowed, MIN_TIMESTAMP, self.GLOBAL_WINDOW_TUPLE)
729731

730732
def process_timer(self, window_id, name, time_domain, timestamp, state):
731733
raise TypeError('Triggers never set or called for batch default windowing.')
@@ -741,14 +743,14 @@ def __init__(self, phased_combine_fn, underlying):
741743
def process_elements(self, state, windowed_values, output_watermark):
742744
uncombined = self.underlying.process_elements(state, windowed_values,
743745
output_watermark)
744-
for window, unwindowed, timestamp in uncombined:
745-
yield window, self.phased_combine_fn.apply(unwindowed), timestamp
746+
for output in uncombined:
747+
yield output.with_value(self.phased_combine_fn.apply(output.value))
746748

747749
def process_timer(self, window_id, name, time_domain, timestamp, state):
748750
uncombined = self.underlying.process_timer(window_id, name, time_domain,
749751
timestamp, state)
750-
for window, unwindowed in uncombined:
751-
yield window, self.phased_combine_fn.apply(unwindowed)
752+
for output in uncombined:
753+
yield output.with_value(self.phased_combine_fn.apply(output.value))
752754

753755

754756
class GeneralTriggerDriver(TriggerDriver):
@@ -870,7 +872,7 @@ def _output(self, window, finished, state):
870872
else:
871873
state.clear_state(window, self.WATERMARK_HOLD)
872874

873-
return window, values, timestamp
875+
return WindowedValue(values, timestamp, (window,))
874876

875877

876878
class InMemoryUnmergedState(UnmergedState):

google/cloud/dataflow/transforms/trigger_test.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -95,28 +95,30 @@ def run_trigger(self, window_fn, trigger_fn, accumulation_mode,
9595
state = InMemoryUnmergedState()
9696

9797
for bundle in bundles:
98-
output = driver.process_elements(state, bundle, MIN_TIMESTAMP)
99-
for out_window, values, unused_timestamp in output:
100-
actual_panes[out_window].append(set(values))
98+
for wvalue in driver.process_elements(state, bundle, MIN_TIMESTAMP):
99+
window, = wvalue.windows
100+
actual_panes[window].append(set(wvalue.value))
101101

102102
while state.timers:
103103
for timer_window, (name, time_domain, timestamp) in (
104104
state.get_and_clear_timers()):
105-
for out_window, values, unused_timestamp in driver.process_timer(
105+
for wvalue in driver.process_timer(
106106
timer_window, name, time_domain, timestamp, state):
107-
actual_panes[out_window].append(set(values))
107+
window, = wvalue.windows
108+
actual_panes[window].append(set(wvalue.value))
108109

109110
for bundle in late_bundles:
110-
output = driver.process_elements(state, bundle, MIN_TIMESTAMP)
111-
for out_window, values, unused_timestamp in output:
112-
actual_panes[out_window].append(set(values))
111+
for wvalue in driver.process_elements(state, bundle, MIN_TIMESTAMP):
112+
window, = wvalue.windows
113+
actual_panes[window].append(set(wvalue.value))
113114

114115
while state.timers:
115116
for timer_window, (name, time_domain, timestamp) in (
116117
state.get_and_clear_timers()):
117-
for out_window, values, unused_timestamp in driver.process_timer(
118+
for wvalue in driver.process_timer(
118119
timer_window, name, time_domain, timestamp, state):
119-
actual_panes[out_window].append(set(values))
120+
window, = wvalue.windows
121+
actual_panes[window].append(set(wvalue.value))
120122

121123
self.assertEqual(expected_panes, actual_panes)
122124

@@ -500,11 +502,12 @@ def fire_timers():
500502
to_fire = state.get_and_clear_timers(watermark)
501503
while to_fire:
502504
for timer_window, (name, time_domain, t_timestamp) in to_fire:
503-
for window, values, timestamp in driver.process_timer(
505+
for wvalue in driver.process_timer(
504506
timer_window, name, time_domain, t_timestamp, state):
507+
window, = wvalue.windows
505508
output.append({'window': [window.start, window.end - 1],
506-
'values': sorted(values),
507-
'timestamp': timestamp})
509+
'values': sorted(wvalue.value),
510+
'timestamp': wvalue.timestamp})
508511
to_fire = state.get_and_clear_timers(watermark)
509512

510513
for line in spec['transcript']:
@@ -520,10 +523,11 @@ def fire_timers():
520523
bundle = [
521524
WindowedValue(t, t, window_fn.assign(WindowFn.AssignContext(t, t)))
522525
for t in params]
523-
output = [{'window': [window.start, window.end - 1],
524-
'values': sorted(values),
525-
'timestamp': timestamp}
526-
for window, values, timestamp
526+
output = [{'window': [wvalue.windows[0].start,
527+
wvalue.windows[0].end - 1],
528+
'values': sorted(wvalue.value),
529+
'timestamp': wvalue.timestamp}
530+
for wvalue
527531
in driver.process_elements(state, bundle, watermark)]
528532
fire_timers()
529533

google/cloud/dataflow/transforms/window.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,9 @@ def __eq__(self, other):
210210
and self.timestamp == other.timestamp
211211
and self.windows == other.windows)
212212

213+
def with_value(self, new_value):
214+
return WindowedValue(new_value, self.timestamp, self.windows)
215+
213216

214217
class TimestampedValue(object):
215218
"""A timestamped value having a value and a timestamp.

google/cloud/dataflow/worker/executor.py

Lines changed: 15 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import google.cloud.dataflow.transforms as ptransform
3030
from google.cloud.dataflow.transforms import combiners
3131
from google.cloud.dataflow.transforms import trigger
32-
from google.cloud.dataflow.transforms import window
3332
from google.cloud.dataflow.transforms.combiners import curry_combine_fn
3433
from google.cloud.dataflow.transforms.combiners import PhasedCombineFnExecutor
3534
from google.cloud.dataflow.transforms.trigger import InMemoryUnmergedState
@@ -501,9 +500,8 @@ def process(self, o):
501500
logging.debug('Processing [%s] in %s', o, self)
502501
assert isinstance(o, WindowedValue)
503502
key, values = o.value
504-
windowed_value = WindowedValue(
505-
(key, self.phased_combine_fn.apply(values)), o.timestamp, o.windows)
506-
self.output(windowed_value)
503+
self.output(
504+
o.with_value((key, self.phased_combine_fn.apply(values))))
507505

508506

509507
def create_pgbk_op(spec):
@@ -633,10 +631,7 @@ def process(self, o):
633631
logging.debug('Processing [%s] in %s', o, self)
634632
assert isinstance(o, WindowedValue)
635633
k, v = o.value
636-
self.output(
637-
window.WindowedValue(
638-
(k, window.WindowedValue(v, o.timestamp, o.windows)),
639-
o.timestamp, o.windows))
634+
self.output(o.with_value((k, o.with_value(v))))
640635

641636

642637
class BatchGroupAlsoByWindowsOperation(Operation):
@@ -669,19 +664,15 @@ def process(self, o):
669664
state = InMemoryUnmergedState()
670665

671666
# TODO(robertwb): Process in smaller chunks.
672-
for out_window, values, timestamp in (
673-
driver.process_elements(state, vs, MIN_TIMESTAMP)):
674-
self.output(
675-
window.WindowedValue((k, values), timestamp, [out_window]))
667+
for wvalue in driver.process_elements(state, vs, MIN_TIMESTAMP):
668+
self.output(wvalue.with_value((k, wvalue.value)))
676669

677670
while state.timers:
678671
timers = state.get_and_clear_timers()
679672
for timer_window, (name, time_domain, timestamp) in timers:
680-
for out_window, values, timestamp in (
681-
driver.process_timer(timer_window, name, time_domain, timestamp,
682-
state)):
683-
self.output(
684-
window.WindowedValue((k, values), timestamp, [out_window]))
673+
for wvalue in driver.process_timer(
674+
timer_window, name, time_domain, timestamp, state):
675+
self.output(wvalue.with_value((k, wvalue.value)))
685676

686677

687678
class StreamingGroupAlsoByWindowsOperation(Operation):
@@ -703,19 +694,16 @@ def process(self, o):
703694
state = self.spec.context.state
704695
output_watermark = self.spec.context.output_data_watermark
705696

706-
for out_window, values, timestamp in (
707-
driver.process_elements(state, keyed_work.elements(),
708-
output_watermark)):
709-
self.output(window.WindowedValue((keyed_work.key, values), timestamp,
710-
[out_window]))
697+
key = keyed_work.key
698+
for wvalue in driver.process_elements(
699+
state, keyed_work.elements(), output_watermark):
700+
self.output(wvalue.with_value((key, wvalue.value)))
711701

712702
for timer in keyed_work.timers():
713703
timer_window = int(timer.namespace)
714-
for out_window, values, timestamp in (
715-
driver.process_timer(timer_window, timer.name, timer.time_domain,
716-
timer.timestamp, state)):
717-
self.output(window.WindowedValue((keyed_work.key, values), timestamp,
718-
[out_window]))
704+
for wvalue in driver.process_timer(
705+
timer_window, timer.name, timer.time_domain, timer.timestamp, state):
706+
self.output(wvalue.with_value((key, wvalue.value)))
719707

720708

721709
class MapTaskExecutor(object):

0 commit comments

Comments
 (0)