Skip to content

Commit 2edb549

Browse files
committed
Armadillo 15.0.2
1 parent eeb5697 commit 2edb549

File tree

6 files changed

+284
-7
lines changed

6 files changed

+284
-7
lines changed

ChangeLog

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
2025-09-08 Dirk Eddelbuettel <[email protected]>
2+
3+
* inst/include/current/: Armadillo 15.0.2
4+
15
2025-09-05 Dirk Eddelbuettel <[email protected]>
26

37
* inst/include/armadillo: Add full copyright header
@@ -7,7 +11,6 @@
711
* inst/include/armadillo: Moved back up from legacy/ to restore path
812
* inst/include/armadillo_bits/: Idem
913

10-
1114
2025-09-01 Dirk Eddelbuettel <[email protected]>
1215

1316
* DESCRIPTION (Version, Date): RcppArmadillo 15.0.1-1

inst/NEWS.Rd

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,16 @@
33
\newcommand{\ghpr}{\href{https://github.com/RcppCore/RcppArmadillo/pull/#1}{##1}}
44
\newcommand{\ghit}{\href{https://github.com/RcppCore/RcppArmadillo/issues/#1}{##1}}
55

6+
\section{Changes in RcppArmadillo version 15.0.2-1 (2025-09-08)}{
7+
\itemize{
8+
\item Upgraded to Armadillo release 15.0.2-1 (Medium Roast)
9+
\itemize{
10+
\item Optionally use OpenMP parallelisation for fp16 matrix multiplication
11+
\item Faster vectorisation of cube tubes
12+
}
13+
}
14+
}
15+
616
\section{Changes in RcppArmadillo version 15.0.1-1 (2025-09-01)}{
717
\itemize{
818
\item Upgraded to Armadillo release 15.0.1-1 (Medium Roast)

inst/include/current/armadillo_bits/arma_version.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323

2424
#define ARMA_VERSION_MAJOR 15
2525
#define ARMA_VERSION_MINOR 0
26-
#define ARMA_VERSION_PATCH 1
26+
#define ARMA_VERSION_PATCH 2
2727
#define ARMA_VERSION_NAME "Medium Roast"
2828

2929

inst/include/current/armadillo_bits/mul_gemm.hpp

Lines changed: 166 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,36 @@ struct gemm_emul_tinysq
5959

6060

6161

62+
struct gemm_emul_large_mp_helper
63+
{
64+
template<typename eT>
65+
arma_hot
66+
inline
67+
static
68+
void
69+
copy_row(eT* out_mem, const Mat<eT>& in, const uword row)
70+
{
71+
const uword n_rows = in.n_rows;
72+
const uword n_cols = in.n_cols;
73+
74+
const eT* in_mem_row = in.memptr() + row;
75+
76+
for(uword i=0; i < n_cols; ++i)
77+
{
78+
out_mem[i] = (*in_mem_row);
79+
80+
in_mem_row += n_rows;
81+
}
82+
}
83+
};
84+
85+
86+
87+
#if defined(ARMA_USE_OPENMP)
//! emulation of gemm(), for non-complex matrices only, as it assumes only simple transposes (ie. doesn't do hermitian transposes)
//! parallelised version
template<const bool do_trans_A=false, const bool do_trans_B=false, const bool use_alpha=false, const bool use_beta=false>
struct gemm_emul_large_mp
  {
  //! compute C = alpha*op(A)*op(B) + beta*C, with op() selected by the
  //! do_trans_A / do_trans_B template flags;
  //! C is assumed to already have the correct size
  template<typename eT, typename TA, typename TB>
  arma_hot
  inline
  static
  void
  apply
    (
          Mat<eT>& C,
    const TA&      A,
    const TB&      B,
    const eT       alpha = eT(1),
    const eT       beta  = eT(0)
    )
    {
    arma_debug_sigprint();
    
    const uword A_n_rows = A.n_rows;
    const uword A_n_cols = A.n_cols;
    
    const uword B_n_rows = B.n_rows;
    const uword B_n_cols = B.n_cols;
    
    if( (do_trans_A == false) && (do_trans_B == false) )
      {
      const uword n_threads = uword(mp_thread_limit::get());
      
      // one scratch row per thread: each thread gathers rows of A
      // into its own contiguous buffer (A is stored column-major)
      podarray<eT> tmp(A_n_cols * n_threads, arma_nozeros_indicator());
      
      eT* tmp_mem = tmp.memptr();
      
      #pragma omp parallel for schedule(static) num_threads(int(n_threads))
      for(uword row_A=0; row_A < A_n_rows; ++row_A)
        {
        const uword thread_id = uword(omp_get_thread_num());
        
        eT* A_rowdata = tmp_mem + (A_n_cols * thread_id);
        
        gemm_emul_large_mp_helper::copy_row(A_rowdata, A, row_A);
        
        for(uword col_B=0; col_B < B_n_cols; ++col_B)
          {
          const eT acc = op_dot::direct_dot(B_n_rows, A_rowdata, B.colptr(col_B));
          
          if( (use_alpha == false) && (use_beta == false) ) { C.at(row_A,col_B) = acc; }
          else if( (use_alpha == true ) && (use_beta == false) ) { C.at(row_A,col_B) = alpha*acc; }
          else if( (use_alpha == false) && (use_beta == true ) ) { C.at(row_A,col_B) = acc + beta*C.at(row_A,col_B); }
          else if( (use_alpha == true ) && (use_beta == true ) ) { C.at(row_A,col_B) = alpha*acc + beta*C.at(row_A,col_B); }
          }
        }
      }
    else
    if( (do_trans_A == true) && (do_trans_B == false) )
      {
      // trans(A)*B: columns of A are rows of trans(A), so no gather buffer is needed
      
      // NOTE: thread count handling unified with the other parallel branches
      const uword n_threads = uword(mp_thread_limit::get());
      
      #pragma omp parallel for schedule(static) num_threads(int(n_threads))
      for(uword col_A=0; col_A < A_n_cols; ++col_A)
        {
        // col_A is interpreted as row_A when storing the results in matrix C
        
        const eT* A_coldata = A.colptr(col_A);
        
        for(uword col_B=0; col_B < B_n_cols; ++col_B)
          {
          const eT acc = op_dot::direct_dot(B_n_rows, A_coldata, B.colptr(col_B));
          
          if( (use_alpha == false) && (use_beta == false) ) { C.at(col_A,col_B) = acc; }
          else if( (use_alpha == true ) && (use_beta == false) ) { C.at(col_A,col_B) = alpha*acc; }
          else if( (use_alpha == false) && (use_beta == true ) ) { C.at(col_A,col_B) = acc + beta*C.at(col_A,col_B); }
          else if( (use_alpha == true ) && (use_beta == true ) ) { C.at(col_A,col_B) = alpha*acc + beta*C.at(col_A,col_B); }
          }
        }
      }
    else
    if( (do_trans_A == false) && (do_trans_B == true) )
      {
      // materialise trans(B) and delegate to the no-transpose case
      Mat<eT> BB;
      op_strans::apply_mat_noalias(BB, B);
      
      gemm_emul_large_mp<false, false, use_alpha, use_beta>::apply(C, A, BB, alpha, beta);
      }
    else
    if( (do_trans_A == true) && (do_trans_B == true) )
      {
      // using trans(A)*trans(B) = trans(B*A) equivalency; assuming no hermitian transpose
      
      const uword n_threads = uword(mp_thread_limit::get());
      
      // per-thread scratch row for gathering rows of B
      podarray<eT> tmp(B_n_cols * n_threads, arma_nozeros_indicator());
      
      eT* tmp_mem = tmp.memptr();
      
      #pragma omp parallel for schedule(static) num_threads(int(n_threads))
      for(uword row_B=0; row_B < B_n_rows; ++row_B)
        {
        const uword thread_id = uword(omp_get_thread_num());
        
        eT* B_rowdata = tmp_mem + (B_n_cols * thread_id);
        
        gemm_emul_large_mp_helper::copy_row(B_rowdata, B, row_B);
        
        for(uword col_A=0; col_A < A_n_cols; ++col_A)
          {
          const eT acc = op_dot::direct_dot(A_n_rows, B_rowdata, A.colptr(col_A));
          
          // transposed store: (col_A, row_B) instead of (row_B, col_A)
          if( (use_alpha == false) && (use_beta == false) ) { C.at(col_A,row_B) = acc; }
          else if( (use_alpha == true ) && (use_beta == false) ) { C.at(col_A,row_B) = alpha*acc; }
          else if( (use_alpha == false) && (use_beta == true ) ) { C.at(col_A,row_B) = acc + beta*C.at(col_A,row_B); }
          else if( (use_alpha == true ) && (use_beta == true ) ) { C.at(col_A,row_B) = alpha*acc + beta*C.at(col_A,row_B); }
          }
        }
      }
    }
  };
#endif
209+
210+
81211

212+
//! emulation of gemm(), for non-complex matrices only, as it assumes only simple transposes (ie. doesn't do hermitian transposes)
213+
template<const bool do_trans_A=false, const bool do_trans_B=false, const bool use_alpha=false, const bool use_beta=false>
214+
struct gemm_emul_large
215+
{
216+
template<typename eT, typename TA, typename TB>
217+
arma_hot
218+
inline
219+
static
220+
void
221+
apply
222+
(
223+
Mat<eT>& C,
224+
const TA& A,
225+
const TB& B,
226+
const eT alpha = eT(1),
227+
const eT beta = eT(0)
228+
)
229+
{
230+
arma_debug_sigprint();
231+
82232
const uword A_n_rows = A.n_rows;
83233
const uword A_n_cols = A.n_cols;
84234

85235
const uword B_n_rows = B.n_rows;
86236
const uword B_n_cols = B.n_cols;
87237

238+
#if defined(ARMA_USE_OPENMP)
239+
{
240+
// TODO: replace with more sophisticated threshold mechanism
241+
242+
constexpr uword threshold = uword(30);
243+
244+
if( (A_n_rows >= threshold) && (A_n_cols >= threshold) && (B_n_rows >= threshold) && (B_n_cols >= threshold) && (mp_thread_limit::in_parallel() == false) )
245+
{
246+
gemm_emul_large_mp<do_trans_A, do_trans_B, use_alpha, use_beta>::apply(C,A,B,alpha,beta);
247+
248+
return;
249+
}
250+
}
251+
#endif
252+
88253
if( (do_trans_A == false) && (do_trans_B == false) )
89254
{
90255
arma_aligned podarray<eT> tmp(A_n_cols);

inst/include/current/armadillo_bits/mul_gemv.hpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,74 @@ struct gemv_emul_helper
203203

204204

205205

206+
#if defined(ARMA_USE_OPENMP)
//! Partial emulation of BLAS gemv().
//! 'y' is assumed to have been set to the correct size (ie. taking into account the transpose)
//! parallelised version
template<const bool do_trans_A=false, const bool use_alpha=false, const bool use_beta=false>
struct gemv_emul_mp
  {
  template<typename eT, typename TA>
  arma_hot
  inline
  static
  void
  apply( eT* y, const TA& A, const eT* x, const eT alpha = eT(1), const eT beta = eT(0) )
    {
    arma_debug_sigprint();
    
    const int n_threads = mp_thread_limit::get();
    
    const uword A_n_rows = A.n_rows;
    const uword A_n_cols = A.n_cols;
    
    if(do_trans_A == false)
      {
      // y = A*x: each thread handles a subset of the rows of A
      #pragma omp parallel for schedule(static) num_threads(n_threads)
      for(uword i=0; i < A_n_rows; ++i)
        {
        const eT acc = gemv_emul_helper::dot_row_col(A, x, i, A_n_cols);
        
        // flags are compile-time constants, so the selection below is resolved at compile time
        const eT val = (use_alpha) ? eT(alpha*acc) : acc;
        
        y[i] = (use_beta) ? eT(val + beta*y[i]) : val;
        }
      }
    else
      {
      // y = trans(A)*x
      if(is_cx<eT>::no)
        {
        // non-complex elements: each element of 'y' is the dot product
        // of 'x' with the corresponding column of A
        #pragma omp parallel for schedule(static) num_threads(n_threads)
        for(uword i=0; i < A_n_cols; ++i)
          {
          // column index i of A maps to element index i of 'y'
          const eT acc = op_dot::direct_dot(A_n_rows, A.colptr(i), x);
          
          const eT val = (use_alpha) ? eT(alpha*acc) : acc;
          
          y[i] = (use_beta) ? eT(val + beta*y[i]) : val;
          }
        }
      else
        {
        // complex elements: explicitly form the hermitian transpose,
        // then delegate to the no-transpose variant
        Mat<eT> AA;
        
        op_htrans::apply_mat_noalias(AA, A);
        
        gemv_emul_mp<false, use_alpha, use_beta>::apply(y, AA, x, alpha, beta);
        }
      }
    }
  };
#endif
271+
272+
273+
206274
//! \brief
207275
//! Partial emulation of BLAS gemv().
208276
//! 'y' is assumed to have been set to the correct size (ie. taking into account the transpose)
@@ -222,6 +290,21 @@ struct gemv_emul
222290
const uword A_n_rows = A.n_rows;
223291
const uword A_n_cols = A.n_cols;
224292

293+
#if defined(ARMA_USE_OPENMP)
294+
{
295+
// TODO: replace with more sophisticated threshold mechanism
296+
297+
constexpr uword threshold = uword(200);
298+
299+
if( (A_n_rows >= threshold) && (A_n_cols >= threshold) && (mp_thread_limit::in_parallel() == false) )
300+
{
301+
gemv_emul_mp<do_trans_A, use_alpha, use_beta>::apply(y, A, x, alpha, beta);
302+
303+
return;
304+
}
305+
}
306+
#endif
307+
225308
if(do_trans_A == false)
226309
{
227310
if(A_n_rows == 1)

inst/include/current/armadillo_bits/op_vectorise_meat.hpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -365,14 +365,30 @@ op_vectorise_cube_col::apply_subview(Mat<eT>& out, const subview_cube<eT>& sv)
365365

366366
out.set_size(sv.n_elem, 1);
367367

368+
if(sv.n_elem == 0) { return; }
369+
368370
eT* out_mem = out.memptr();
369371

370-
for(uword s=0; s < sv_ns; ++s)
371-
for(uword c=0; c < sv_nc; ++c)
372+
if( (sv_nr == 1) && (sv_nc == 1) && (sv.aux_slice1 == 0) )
372373
{
373-
arrayops::copy(out_mem, sv.slice_colptr(s,c), sv_nr);
374+
const uword sv_m_n_elem_slice = sv.m.n_elem_slice;
375+
376+
const eT* sv_m_ptr = &( sv.m.at(sv.aux_row1, sv.aux_col1, 0) );
374377

375-
out_mem += sv_nr;
378+
for(uword s=0; s < sv_ns; ++s)
379+
{
380+
out_mem[s] = (*sv_m_ptr); sv_m_ptr += sv_m_n_elem_slice;
381+
}
382+
}
383+
else
384+
{
385+
for(uword s=0; s < sv_ns; ++s)
386+
for(uword c=0; c < sv_nc; ++c)
387+
{
388+
arrayops::copy(out_mem, sv.slice_colptr(s,c), sv_nr);
389+
390+
out_mem += sv_nr;
391+
}
376392
}
377393
}
378394

0 commit comments

Comments
 (0)