@@ -238,6 +238,46 @@ perf(128, 8, 128, 32)
238
238
# 3.166 ms (81 allocations: 9.13 MiB)
239
239
# 16.082 ms (1049 allocations: 20.58 MiB)
240
240
241
+ # # Threadripper, NNlib v0.8.12
242
+ # tullio
243
+ # 5.658 ms (77 allocations: 7.25 MiB)
244
+ # 22.373 ms (1124 allocations: 16.71 MiB)
245
+ # nalib
246
+ # 6.187 ms (89 allocations: 7.75 MiB)
247
+ # 23.723 ms (604 allocations: 14.70 MiB)
248
+ # nnlib
249
+ # 6.473 ms (87 allocations: 9.25 MiB)
250
+ # 24.966 ms (1055 allocations: 20.71 MiB)
251
+ # tullio - gpu
252
+ # 145.332 μs (520 allocations: 24.52 KiB)
253
+ # 902.020 μs (2221 allocations: 117.19 KiB)
254
+ # nalib - gpu
255
+ # 162.354 μs (410 allocations: 18.03 KiB)
256
+ # 604.111 μs (1263 allocations: 71.78 KiB)
257
+ # nnlib - gpu
258
+ # 156.383 μs (440 allocations: 20.00 KiB)
259
+ # 835.374 μs (1969 allocations: 100.58 KiB)
260
+
261
+ # # Threadripper, NNlib v0.8.13 (fast_maximum)
262
+ # tullio
263
+ # 4.599 ms (71 allocations: 7.13 MiB)
264
+ # 20.699 ms (1118 allocations: 16.59 MiB)
265
+ # nalib
266
+ # 5.049 ms (84 allocations: 7.63 MiB)
267
+ # 22.252 ms (599 allocations: 14.57 MiB)
268
+ # nnlib
269
+ # 5.378 ms (81 allocations: 9.13 MiB)
270
+ # 23.453 ms (1049 allocations: 20.58 MiB)
271
+ # tullio - gpu
272
+ # 145.824 μs (520 allocations: 24.52 KiB)
273
+ # 915.305 μs (2221 allocations: 117.19 KiB)
274
+ # nalib - gpu
275
+ # 164.789 μs (410 allocations: 18.03 KiB)
276
+ # 610.835 μs (1263 allocations: 71.78 KiB)
277
+ # nnlib - gpu
278
+ # 157.785 μs (440 allocations: 20.00 KiB)
279
+ # 852.087 μs (1969 allocations: 100.58 KiB)
280
+
241
281
242
282
# function prof()
243
283
# dim, len, batch_size, nheads = 128, 8, 128, 32;
0 commit comments