|
28 | 28 | as_tensor_variable, |
29 | 29 | cast, |
30 | 30 | constant, |
| 31 | + expand_dims, |
31 | 32 | get_underlying_scalar_constant_value, |
32 | 33 | moveaxis, |
33 | 34 | ones_like, |
34 | 35 | register_infer_shape, |
35 | 36 | switch, |
36 | 37 | zeros_like, |
37 | 38 | ) |
38 | | -from pytensor.tensor.blockwise import Blockwise |
39 | 39 | from pytensor.tensor.elemwise import CAReduce, DimShuffle, Elemwise |
40 | 40 | from pytensor.tensor.exceptions import NotScalarConstantError |
41 | 41 | from pytensor.tensor.extra_ops import broadcast_arrays |
|
45 | 45 | Sum, |
46 | 46 | _conj, |
47 | 47 | _dot, |
48 | | - _inner_prod, |
49 | | - _matrix_matrix_matmul, |
50 | | - _matrix_vec_prod, |
51 | | - _vec_matrix_prod, |
| 48 | + _matmul, |
52 | 49 | add, |
53 | 50 | digamma, |
54 | 51 | dot, |
@@ -197,60 +194,153 @@ def local_lift_transpose_through_dot(fgraph, node): |
197 | 194 | return ret |
198 | 195 |
|
199 | 196 |
|
200 | | -@register_stabilize |
201 | | -@register_specialize |
202 | | -@node_rewriter(tracks=[Blockwise]) |
203 | | -def local_batched_matmul_to_core_matmul(fgraph, node): |
204 | | - """Rewrite matmul where only one of the inputs has batch dimensions to a reshaped core matmul. |
| 197 | +def _batched_matmul_to_core_matmul(fgraph, node, allow_reshape: bool): |
| 198 | + """Move batch dimensions of matmul operands to core matmul |
205 | 199 |
|
206 | | - Example, if x has batch dimensions, but y not: |
| 200 | + Example: if x has batch dimensions that don't overlap with batch dimensions of y: |
207 | 201 | x @ y -> (x.reshape(-1, x.shape[-1]) @ y).reshape(*x.shape[:-1], y.shape[-1]) |
208 | 202 |
|
209 | | - It also works when y has batch dimensions, but x not. |
210 | | - """ |
| 203 | + It also works for batch dimensions of y that don't overlap with batch dimensions of x |
211 | 204 |
|
212 | | - # Check whether we have a matmul operation in this node |
213 | | - if not ( |
214 | | - isinstance(node.op.core_op, Dot) |
215 | | - and len(node.op.inputs_sig[0]) == 2 |
216 | | - and len(node.op.inputs_sig[1]) == 2 |
217 | | - ): |
218 | | - return None |
| 205 | + The rewrite only introduces a reshape when batch and core dimensions must be merged; when that is needed and `allow_reshape=False`, it does not apply. |
| 206 | + """ |
219 | 207 |
|
220 | 208 | x, y = node.inputs |
221 | 209 | batch_ndim = node.op.batch_ndim(node) |
222 | 210 |
|
223 | | - # Check if x has batch dimensions, but y not (or only broadcastable dimensions) |
224 | | - if any(not b_dim for b_dim in x.type.broadcastable[:-2]) and all( |
225 | | - y.type.broadcastable[:-2] |
226 | | - ): |
227 | | - x_stacked = x.reshape((-1, x.shape[-1])) |
228 | | - out_stacked = x_stacked @ y.squeeze(tuple(range(batch_ndim))) |
229 | | - out = out_stacked.reshape((*x.shape[:-1], y.shape[-1])) |
230 | | - return [out] |
231 | | - |
232 | | - # Otherwise, check if y has batch dimension, but x not |
233 | | - elif any(not b_dim for b_dim in y.type.broadcastable[:-2]) and all( |
234 | | - x.type.broadcastable[:-2] |
235 | | - ): |
236 | | - # For the y batch case we need to first move the batch axes and then reshape |
237 | | - # y.shape == (*b, k, n) |
238 | | - y_tr = moveaxis(y, -2, 0) # (k, *b, n) |
239 | | - y_stacked = y_tr.reshape((y.shape[-2], -1)) # (k, *b * n) |
240 | | - out_stacked = x.squeeze(tuple(range(batch_ndim))) @ y_stacked # (m, *b * n) |
241 | | - out_stacked_tr = out_stacked.reshape( |
242 | | - (x.shape[-2], *y.shape[:-2], y.shape[-1]) |
243 | | - ) # (m, *b, n) |
244 | | - out = moveaxis(out_stacked_tr, 0, -2) # (*b, m, n) |
245 | | - return [out] |
246 | | - |
247 | | - # Both x and y have batch dimensions, nothing to do here |
248 | | - return None |
| 211 | + x_axis_to_merge = [ |
| 212 | + i |
| 213 | + for i, (bcast_x, bcast_y) in enumerate( |
| 214 | + zip(x.type.broadcastable[:-2], y.type.broadcastable[:-2]) |
| 215 | + ) |
| 216 | + if bcast_y and not bcast_x |
| 217 | + ] |
| 218 | + |
| 219 | + y_axis_to_merge = [ |
| 220 | + i |
| 221 | + for i, (bcast_x, bcast_y) in enumerate( |
| 222 | + zip(x.type.broadcastable[:-2], y.type.broadcastable[:-2]) |
| 223 | + ) |
| 224 | + if bcast_x and not bcast_y |
| 225 | + ] |
| 226 | + |
| 227 | + if not (x_axis_to_merge or y_axis_to_merge): |
| 228 | + return None |
| 229 | + |
| 230 | + x_shape = tuple(x.shape) |
| 231 | + y_shape = tuple(y.shape) |
| 232 | + x_is_row = x.type.broadcastable[-2] |
| 233 | + y_is_col = y.type.broadcastable[-1] |
| 234 | + n_x_axis_to_merge = len(x_axis_to_merge) |
| 235 | + n_y_axis_to_merge = len(y_axis_to_merge) |
| 236 | + n_axis_to_merge = n_x_axis_to_merge + n_y_axis_to_merge |
| 237 | + |
| 238 | + x_stacked, y_stacked = x, y |
| 239 | + dims_were_merged = False |
| 240 | + |
| 241 | + if n_x_axis_to_merge: |
| 242 | + # ravel batch dimensions of x into the core (m) axis |
| 243 | + x_axis_destination = tuple(range(-n_x_axis_to_merge - 2, -2)) |
| 244 | + x_stacked = moveaxis(x, x_axis_to_merge, x_axis_destination) |
| 245 | + if x_is_row: |
| 246 | + # x was a row matrix, squeeze it to clean up the graph |
| 247 | + x_stacked = x_stacked.squeeze(-2) |
| 248 | + if n_x_axis_to_merge > 1 or not x_is_row: |
| 249 | + if not allow_reshape: |
| 250 | + return None |
| 251 | + |
| 252 | + # Ravel moved batch dims together with (m) if needed |
| 253 | + x_stacked_shape = tuple(x_stacked.shape) |
| 254 | + x_stacked = x_stacked.reshape( |
| 255 | + (*x_stacked_shape[: batch_ndim - n_x_axis_to_merge], -1, x_shape[-1]) |
| 256 | + ) |
| 257 | + dims_were_merged = True |
| 258 | + |
| 259 | + if n_y_axis_to_merge: |
| 260 | + # ravel batch dimensions of y on the core (n) axis |
| 261 | + y_axis_destination = tuple(range(-n_y_axis_to_merge - 1, -1)) |
| 262 | + y_stacked = moveaxis(y, y_axis_to_merge, y_axis_destination) |
| 263 | + if y_is_col: |
| 264 | + # y was a column matrix, squeeze it to clean up the graph |
| 265 | + y_stacked = y_stacked.squeeze(-1) |
| 266 | + if n_y_axis_to_merge > 1 or not y_is_col: |
| 267 | + if not allow_reshape: |
| 268 | + return None |
| 269 | + # Ravel moved batch dims together with (n) if needed |
| 270 | + y_stacked_shape = tuple(y_stacked.shape) |
| 271 | + y_stacked = y_stacked.reshape( |
| 272 | + (*y_stacked_shape[: batch_ndim - n_y_axis_to_merge], y_shape[-2], -1) |
| 273 | + ) |
| 274 | + dims_were_merged = True |
| 275 | + |
| 276 | + # Squeeze the dimensions of x that correspond to the merged batch dimensions of y |
| 277 | + x_axis_to_squeeze = np.array(y_axis_to_merge) |
| 278 | + for i in reversed(x_axis_to_merge): |
| 279 | + # The corresponding dimensions of y may have shifted when we merged dimensions of x |
| 280 | + x_axis_to_squeeze[x_axis_to_squeeze > i] -= 1 |
| 281 | + x_stacked = x_stacked.squeeze(tuple(x_axis_to_squeeze)) |
| 282 | + |
| 283 | + # Same for y |
| 284 | + y_axis_to_squeeze = np.array(x_axis_to_merge) |
| 285 | + for i in reversed(y_axis_to_merge): |
| 286 | + y_axis_to_squeeze[y_axis_to_squeeze > i] -= 1 |
| 287 | + y_stacked = y_stacked.squeeze(tuple(y_axis_to_squeeze)) |
| 288 | + |
| 289 | + out_stacked = x_stacked @ y_stacked |
| 290 | + |
| 291 | + # Split back any merged dimensions |
| 292 | + if dims_were_merged: |
| 293 | + x_merged_shapes = [x_shape[i] for i in x_axis_to_merge] |
| 294 | + if not x_is_row: |
| 295 | + # Otherwise we handle that later with expand_dims, which is cleaner |
| 296 | + x_merged_shapes.append(x_shape[-2]) |
| 297 | + y_merged_shapes = [y_shape[i] for i in y_axis_to_merge] |
| 298 | + if not y_is_col: |
| 299 | + # Otherwise we handle that later with expand_dims, which is cleaner |
| 300 | + y_merged_shapes.append(y_shape[-1]) |
| 301 | + out_stacked_shape = tuple(out_stacked.shape) |
| 302 | + out_unstacked = out_stacked.reshape( |
| 303 | + ( |
| 304 | + *out_stacked_shape[: batch_ndim - n_axis_to_merge], |
| 305 | + *x_merged_shapes, |
| 306 | + *y_merged_shapes, |
| 307 | + ) |
| 308 | + ) |
| 309 | + else: |
| 310 | + out_unstacked = out_stacked |
| 311 | + |
| 312 | + # Add back dummy row, col axis |
| 313 | + # We do this separately to avoid the reshape whenever possible |
| 314 | + if y_is_col and (n_y_axis_to_merge or dims_were_merged): |
| 315 | + out_unstacked = expand_dims(out_unstacked, -1) |
| 316 | + if x_is_row and (n_x_axis_to_merge or dims_were_merged): |
| 317 | + out_unstacked = expand_dims(out_unstacked, -n_y_axis_to_merge - 2) |
| 318 | + |
| 319 | + # Move batch axes back to their original locations |
| 320 | + source = range(-n_axis_to_merge - 2, 0) |
| 321 | + destination = (*x_axis_to_merge, -2, *y_axis_to_merge, -1) |
| 322 | + out = moveaxis(out_unstacked, source, destination) |
| 323 | + return [out] |
| 324 | + |
| 325 | + |
| 326 | +@register_canonicalize |
| 327 | +@node_rewriter(tracks=[_matmul]) |
| 328 | +def local_batched_matmul_to_core_matmul(fgraph, node): |
| 329 | + # Move batch dimensions of matmul into the core matmul when no reshape is needed (row / column matrix operands) |
| 330 | + return _batched_matmul_to_core_matmul(fgraph, node, allow_reshape=False) |
| 331 | + |
| 332 | + |
| 333 | +@register_specialize |
| 334 | +@node_rewriter(tracks=[_matmul]) |
| 335 | +def local_batched_matmul_to_core_matmul_with_reshape(fgraph, node): |
| 336 | + # Allow stacking batch dimensions of matmul with core dimensions, with a reshape operation |
| 337 | + # We only apply this in specialize, because graphs with reshape are hard to work with |
| 338 | + return _batched_matmul_to_core_matmul(fgraph, node, allow_reshape=True) |
249 | 339 |
|
250 | 340 |
|
251 | 341 | @register_canonicalize |
252 | 342 | @register_specialize |
253 | | -@node_rewriter([_inner_prod, _matrix_vec_prod, _vec_matrix_prod, _matrix_matrix_matmul]) |
| 343 | +@node_rewriter([_matmul]) |
254 | 344 | def local_blockwise_dot_to_mul(fgraph, node): |
255 | 345 | """Rewrite blockwise dots that correspond to multiplication without summation. |
256 | 346 |
|
|
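For reference, here is a minimal NumPy sketch (an illustration, not part of the diff) of the equivalences described in the `_batched_matmul_to_core_matmul` docstring: when the batch dimensions of one matmul operand don't overlap with those of the other, the batched matmul can be computed as a single core matmul, using a reshape for the x case and a `moveaxis` + reshape round trip for the y case.

```python
import numpy as np

rng = np.random.default_rng(0)
b, m, k, n = 3, 4, 5, 6

# Case 1: only x has a batch dimension -> ravel it into the core (m) axis.
x = rng.normal(size=(b, m, k))
y = rng.normal(size=(k, n))
batched = x @ y                                   # (b, m, n)
core = (x.reshape(-1, k) @ y).reshape(b, m, n)    # one 2D matmul + reshapes
assert np.allclose(batched, core)

# Case 2: only y has a batch dimension -> move the batch axis next to the
# core (n) axis, ravel the two together, and undo the move afterwards.
x2 = rng.normal(size=(m, k))
y2 = rng.normal(size=(b, k, n))
batched2 = x2 @ y2                                              # (b, m, n)
y2_stacked = np.moveaxis(y2, -2, 0).reshape(k, -1)              # (k, b * n)
core2 = np.moveaxis((x2 @ y2_stacked).reshape(m, b, n), 0, -2)  # (b, m, n)
assert np.allclose(batched2, core2)
```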