Skip to content

Commit 595f073

Browse files
I think the GPU implementation mutate is now working, but it is so slow compared with running on the host.
1 parent 42e0f0e commit 595f073

File tree

5 files changed

+172
-116
lines changed

5 files changed

+172
-116
lines changed

include/arrays.h

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1469,7 +1469,7 @@ namespace ecolab
14691469
template <class T, class A=std::allocator<T>>
14701470
class array
14711471
{
1472-
array_data<T> *dt;
1472+
array_data<T> *dt=nullptr;
14731473
A m_allocator;
14741474

14751475

@@ -1492,6 +1492,10 @@ namespace ecolab
14921492
#endif
14931493
return nullptr; // SYCL allocator returns nullptr if not initialised
14941494
}
1495+
// #ifdef __SYCL_DEVICE_ONLY__
1496+
// syclPrintf("succeeded in allocating %d bytes in array\n",sizeof(T)*n);
1497+
//#endif
1498+
14951499
#ifdef __ICC
14961500
// we need to align data onto 16 byte boundaries
14971501
size_t d = (size_t)(reinterpret_cast<array_data<T>*>(p)->dt);
@@ -1557,15 +1561,37 @@ namespace ecolab
15571561

15581562
protected:
15591563

1564+
// implements copying data from x to this in a GPU friendly way
1565+
template <class E>
1566+
void asgV(const A& alloc, size_t size, const E& x)
1567+
{
1568+
#ifdef __SYCL_DEVICE_ONLY__
1569+
GroupLocal<array> tmp(size,alloc);
1570+
array_ns::asg_v(tmp->dt->dt,size,x);
1571+
groupBarrier();
1572+
if (syclGroup().leader()) swap(*tmp);
1573+
#else
1574+
array tmp(size,alloc);
1575+
asg_v(tmp.data(),size,x);
1576+
swap(tmp);
1577+
#endif
1578+
}
1579+
15601580
void copy() //any nonconst method needs to call this
15611581
{ // to implement copy-on-write semantics
15621582
if (dt && ref()>1)
15631583
{
1584+
#ifdef __SYCL_DEVICE_ONLY__
1585+
syclPrintf("b4 asgV in copy\n");
1586+
asgV(m_allocator, size(), dt->dt);
1587+
syclPrintf("after asgV in copy\n");
1588+
#else
15641589
array_data<T>* oldData=dt;
15651590
bool freeMem = ref()-- == 0;
1566-
dt=alloc(size());
1591+
dt=alloc(size());
15671592
memcpy(dt->dt,oldData->dt,size()*sizeof(T));
15681593
if (freeMem) free(oldData);
1594+
#endif
15691595
}
15701596
}
15711597

@@ -1574,7 +1600,7 @@ namespace ecolab
15741600
typedef size_t size_type;
15751601
using Allocator=A;
15761602

1577-
array(const Allocator& alloc={}): m_allocator(alloc) {set_size(0);}
1603+
array(const Allocator& alloc={}): m_allocator(alloc) {}
15781604
explicit array(size_t s, const Allocator& alloc=Allocator()): m_allocator(alloc)
15791605
{
15801606
set_size(s);
@@ -1605,11 +1631,12 @@ namespace ecolab
16051631
const Allocator& allocator() const {return m_allocator;}
16061632
const Allocator& allocator(const Allocator& alloc) {
16071633
if (alloc==m_allocator) return m_allocator;
1608-
array tmp(size(),alloc);
1609-
asg_v(tmp.data(),size(),data());
1610-
swap(tmp);
1634+
asgV(alloc, size(), data());
16111635
return m_allocator;
16121636
}
1637+
1638+
/// current value of the reference counter
1639+
unsigned refCnt() const {return dt->cnt;}
16131640

16141641
/// resize array to \a s elements
16151642
void resize(size_t s) {
@@ -1645,33 +1672,18 @@ namespace ecolab
16451672

16461673
array& operator=(const array& x) {
16471674
if (x.dt==dt) return *this;
1648-
if (m_allocator==x.m_allocator) { // shared data optimisation
1649-
release();
1650-
dt=x.dt;
1651-
if (dt) ref()++;
1652-
return *this;
1653-
}
1654-
array tmp(x.size(),m_allocator);
1655-
array_ns::asg_v(tmp.data(),tmp.size(),x);
1656-
swap(tmp);
1675+
release();
1676+
m_allocator=x.m_allocator;
1677+
dt=x.dt;
1678+
if (dt) ref()++;
16571679
return *this;
16581680
}
16591681

16601682
template <class expr> typename
16611683
enable_if<is_expression<expr>, array&>::T
16621684
operator=(const expr& x) {
16631685
if ((void*)(&x)==(void*)(this)) return *this;
1664-
// since expression x may contain a reference to this, assign to a temporary
1665-
#ifdef __SYCL_DEVICE_ONLY__
1666-
GroupLocal<array> tmp(x.size(),m_allocator);
1667-
array_ns::asg_v(tmp.ref().data(),x.size(),x);
1668-
groupBarrier();
1669-
if (syclGroup().leader()) swap(tmp.ref());
1670-
#else
1671-
array tmp(x.size(),m_allocator);
1672-
array_ns::asg_v(tmp.data(),tmp.size(),x);
1673-
swap(tmp);
1674-
#endif
1686+
asgV(m_allocator, x.size(), x);
16751687
return *this;
16761688
}
16771689
template <class expr> typename

include/ecolab.h

Lines changed: 47 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -205,57 +205,52 @@ namespace ecolab
205205

206206
struct CellBase
207207
{
208+
size_t m_idx=0; // stash the position within the local node vector here
209+
size_t idx() const {return m_idx;}
208210
#ifdef SYCL_LANGUAGE_VERSION
209-
Ouro::SyclDesc<1>* desc=nullptr;
210211
using MemAllocator=Ouro::MultiOuroPQ;
211212
MemAllocator* memAlloc=nullptr;
212-
const sycl::stream* out=nullptr;
213-
size_t idx() const {
214-
if (desc) return desc->item.get_global_linear_id();
215-
else return 0;
216-
}
217213
template <class T> class CellAllocator
218214
{
219215
public:
220-
Ouro::SyclDesc<1>*const* desc=nullptr;
221-
MemAllocator*const* memAlloc=nullptr;
216+
MemAllocator* memAlloc=nullptr;
222217
template <class U> friend class Allocator;
223218
CellAllocator()=default;
224-
CellAllocator(Ouro::SyclDesc<1>* const& desc, MemAllocator*const& memAlloc):
225-
desc(&desc), memAlloc(&memAlloc) {
226-
}
227-
template <class U> CellAllocator(const CellAllocator<U>& x):
228-
desc(x.desc), memAlloc(x.memAlloc) {}
219+
CellAllocator(MemAllocator* memAlloc): memAlloc(memAlloc) {}
220+
template <class U> CellAllocator(const CellAllocator<U>& x): memAlloc(x.memAlloc) {}
229221
T* allocate(size_t sz) {
230222
#ifdef __SYCL_DEVICE_ONLY__
231-
if (memAlloc && *memAlloc && desc && *desc) {
232-
auto r=reinterpret_cast<T*>((*memAlloc)->malloc(**desc,sz*sizeof(T)));
223+
if (memAlloc) {
224+
auto r=reinterpret_cast<T*>(memAlloc->malloc(Ouro::SyclDesc<>(syclItem(),{}),sz*sizeof(T)));
233225
if (!r) syclPrintf("Mem allocation failed\n");
234226
return r;
235227
}
228+
syclPrintf("Missing allocator memAlloc=%x\n",memAlloc);
236229
return nullptr; // TODO raise an error??
237230
#else
238231
return sycl::malloc_shared<T>(sz,syclQ());
239232
#endif
240233
}
241-
void deallocate(T* p,size_t) {
234+
void deallocate(T* p,size_t n) {
235+
if (!p) return;
242236
#ifdef __SYCL_DEVICE_ONLY__
243-
if (memAlloc && *memAlloc && desc && *desc)
244-
(*memAlloc)->free(**desc,p);
237+
if (memAlloc && reinterpret_cast<Ouro::memory_t*>(p)>=memAlloc->memory.d_data &&
238+
reinterpret_cast<Ouro::memory_t*>(p)<memAlloc->memory.d_data_end)
239+
memAlloc->free(Ouro::SyclDesc<>(syclItem(),{}),p);
240+
else
241+
syclPrintf("leaked %d bytes\n",n*sizeof(T));
245242
#else
246243
sycl::free(p,syclQ());
247244
#endif
248245
}
249-
bool operator==(const CellAllocator& x) const {return desc==x.desc && memAlloc==x.memAlloc;}
246+
bool operator==(const CellAllocator& x) const {return memAlloc==x.memAlloc;}
250247
};
251248
template <class T> CellAllocator<T> allocator() const {
252-
return CellAllocator<T>(desc,memAlloc);
249+
return CellAllocator<T>(memAlloc);
253250
}
254-
#else
251+
#else //!SYCL
255252
template <class T> using CellAllocator=std::allocator<T>;
256253
template <class T> CellAllocator<T> allocator() const {return CellAllocator<T>();}
257-
size_t m_idx=0; // stash the position within the local node vector here
258-
size_t idx() const {return m_idx;}
259254
#endif
260255
};
261256

@@ -304,6 +299,21 @@ namespace ecolab
304299
graphcode::Allocator<graphcode::ObjectPtr<Cell>>(syclQ(),sycl::usm::alloc::shared)) {}
305300
~EcolabGraph() {syncThreads();}
306301
#endif
302+
/// apply a functional to all local cells of this processor using the host processor
303+
/// @param f
304+
template <class F>
305+
void hostForAll(F f) {
306+
auto sz=this->size();
307+
#ifdef _OPENMP
308+
#pragma omp parallel for
309+
#endif
310+
for (size_t idx=0; idx<sz; ++idx) {
311+
auto& cell=*(*this)[idx]->template as<Cell>();
312+
cell.m_idx=idx;
313+
f(cell);
314+
}
315+
}
316+
307317
/// apply a functional to all local cells of this processor in parallel
308318
/// @param f
309319
template <class F>
@@ -313,34 +323,17 @@ namespace ecolab
313323
size_t range=this->size()/workGroupSize;
314324
if (range*workGroupSize < this->size()) ++range;
315325
syclQ().submit([&](auto& h) {
316-
#ifndef NDEBUG
317-
sycl::stream out(1000000,1000,h);
318-
#endif
319326
h.parallel_for(sycl::nd_range<1>(range*workGroupSize, workGroupSize), [=,this](auto i) {
320327
auto idx=i.get_global_linear_id();
321328
if (idx<this->size()) {
322329
auto& cell=*(*this)[idx]->template as<Cell>();
323-
Ouro::SyclDesc<> desc(i,{});
324-
cell.desc=&desc;
325-
#ifndef NDEBUG
326-
cell.out=&out;
327-
#endif
330+
cell.m_idx=idx;
328331
f(cell);
329-
cell.desc=nullptr;
330-
cell.out=nullptr;
331332
}
332333
});
333334
});
334335
#else
335-
auto sz=this->size();
336-
#ifdef _OPENMP
337-
#pragma omp parallel for
338-
#endif
339-
for (size_t idx=0; idx<sz; ++idx) {
340-
auto& cell=*(*this)[idx]->template as<Cell>();
341-
cell.m_idx=idx;
342-
f(cell);
343-
}
336+
hostForAll(f);
344337
#endif
345338
}
346339

@@ -351,28 +344,20 @@ namespace ecolab
351344
template <class F>
352345
void groupedForAll(F f) {
353346
#ifdef SYCL_LANGUAGE_VERSION
347+
// TODO - pass in workGroupSize as an optional parameter??
354348
static size_t workGroupSize=32;//syclQ().get_device().get_info<sycl::info::device::max_work_group_size>();
355349
syclQ().submit([&](auto& h) {
356-
#ifndef NDEBUG
357-
sycl::stream out(1000000,1000,h);
358-
#endif
359350
h.parallel_for(sycl::nd_range<1>(this->size()*workGroupSize, workGroupSize), [=,this](auto i) {
360351
auto idx=i.get_group_linear_id();
361352
if (idx<this->size()) {
362353
auto& cell=*(*this)[idx]->template as<Cell>();
363-
Ouro::SyclDesc<> desc(i,{});
364-
cell.desc=&desc;
365-
#ifndef NDEBUG
366-
cell.out=&out;
367-
#endif
354+
cell.m_idx=idx;
368355
f(cell);
369-
cell.desc=nullptr;
370-
cell.out=nullptr;
371356
}
372357
});
373358
});
374359
#else
375-
forAll(f);
360+
hostForAll(f);
376361
#endif
377362
}
378363

@@ -423,10 +408,17 @@ namespace ecolab
423408
~GroupLocal() {
424409
sycl::group_barrier(syclGroup());
425410
if (syclGroup().leader())
426-
ref().~T();
411+
(**this).~T();
427412
}
428-
T& ref() {return reinterpret_cast<T&>(**buffer);}
413+
T& operator*() {return reinterpret_cast<T&>(**buffer);}
414+
#else
415+
T buffer;
416+
public:
417+
template <class... Args>
418+
GroupLocal(Args&&... args): buffer(std::forward<Args>(args)...) {}
419+
T& operator*() {return buffer;}
429420
#endif
421+
T* operator->() {return &**this;}
430422
};
431423

432424
#ifdef __SYCL_DEVICE_ONLY__

0 commit comments

Comments
 (0)