diff --git a/src/stats/CMakeLists.txt b/src/stats/CMakeLists.txt index 3e5727565..3a9553c70 100644 --- a/src/stats/CMakeLists.txt +++ b/src/stats/CMakeLists.txt @@ -1,5 +1,9 @@ set(stats_cppFiles ../stdlib_linalg_constants.fypp + ../stdlib_sorting.fypp + ../stdlib_sorting_ord_sort.fypp + ../stdlib_sorting_sort_adjoint.fypp + ../stdlib_sorting_sort.fypp ) set(stats_fppFiles @@ -10,12 +14,15 @@ set(stats_fppFiles ../stdlib_error.fypp ../stdlib_linalg.fypp ../stdlib_linalg_diag.fypp + ../stdlib_linalg_eigenvalues.fypp ../stdlib_linalg_state.fypp + ../stdlib_linalg_svd.fypp ../stdlib_random.fypp ../stdlib_selection.fypp ../stdlib_string_type.fypp stdlib_stats_corr.fypp stdlib_stats_cov.fypp + stdlib_stats_pca.fypp stdlib_stats_distribution_exponential.fypp stdlib_stats_distribution_normal.fypp stdlib_stats_distribution_uniform.fypp @@ -29,4 +36,9 @@ set(stats_fppFiles stdlib_stats_var.fypp ) +set(f90Files + ../stdlib_sorting_radix_sort.f90 + ) + configure_stdlib_target(stats "" stats_fppFiles stats_cppFiles) +target_link_libraries(stats PUBLIC blas lapack $<$>:bitsets>) diff --git a/src/stats/stdlib_stats.fypp b/src/stats/stdlib_stats.fypp index adf373f0a..79774c163 100644 --- a/src/stats/stdlib_stats.fypp +++ b/src/stats/stdlib_stats.fypp @@ -9,10 +9,12 @@ module stdlib_stats !! ([Specification](../page/specs/stdlib_stats.html)) use stdlib_kinds, only: sp, dp, xdp, qp, & int8, int16, int32, int64 + use stdlib_linalg_state, only: linalg_state_type implicit none private ! Public API public :: corr, cov, mean, median, moment, var + public :: pca, pca_transform, pca_inverse_transform interface corr @@ -639,4 +641,62 @@ module stdlib_stats #:endfor end interface moment + + #! Note: PCA is limited to single (sp) and double (dp) precision because external + #! optimized BLAS/LAPACK libraries (OpenBLAS, MKL) only support these precisions. + #! Extended (xdp) and quadruple (qp) precision are not supported for PCA. + #:set PCA_KINDS_TYPES = [("sp", "real(sp)"), ("dp", "real(dp)")] + + interface pca + !! version: experimental + !! + !! Principal Component Analysis (PCA) + !! ([Specification](../page/specs/stdlib_stats.html#pca)) + #:for k1, t1 in PCA_KINDS_TYPES + module subroutine pca_${k1}$(x, components, singular_values, x_mean, & + method, overwrite_x, err) + ${t1}$, intent(inout), target :: x(:,:) + ${t1}$, intent(out) :: components(:,:) + real(${k1}$), intent(out) :: singular_values(:) + ${t1}$, intent(out), optional :: x_mean(:) + character(*), intent(in), optional :: method + logical, intent(in), optional :: overwrite_x + type(linalg_state_type), intent(out), optional :: err + end subroutine pca_${k1}$ + #:endfor + end interface pca + + + interface pca_transform + !! version: experimental + !! + !! Projects data into the reduced dimensional space + !! ([Specification](../page/specs/stdlib_stats.html#pca_transform)) + #:for k1, t1 in PCA_KINDS_TYPES + module subroutine pca_transform_${k1}$(x, components, x_mean, x_transformed) + ${t1}$, intent(in) :: x(:,:) + ${t1}$, intent(in) :: components(:,:) + ${t1}$, intent(in), optional :: x_mean(:) + ${t1}$, intent(out) :: x_transformed(:,:) + end subroutine pca_transform_${k1}$ + #:endfor + end interface pca_transform + + + interface pca_inverse_transform + !! version: experimental + !! + !! Reconstructs original data from the reduced space + !! ([Specification](../page/specs/stdlib_stats.html#pca_inverse_transform)) + #:for k1, t1 in PCA_KINDS_TYPES + module subroutine pca_inverse_transform_${k1}$(x_reduced, components, x_mean, x_reconstructed) + ${t1}$, intent(in) :: x_reduced(:,:) + ${t1}$, intent(in) :: components(:,:) + ${t1}$, intent(in), optional :: x_mean(:) + ${t1}$, intent(out) :: x_reconstructed(:,:) + end subroutine pca_inverse_transform_${k1}$ + #:endfor + end interface pca_inverse_transform + + end module stdlib_stats diff --git a/src/stats/stdlib_stats_pca.fypp b/src/stats/stdlib_stats_pca.fypp new file mode 100644 index 000000000..057e2a34d --- /dev/null +++ b/src/stats/stdlib_stats_pca.fypp @@ -0,0 +1,191 @@ +#:include "common.fypp" +#! Note: PCA is limited to single (sp) and double (dp) precision because external +#! optimized BLAS/LAPACK libraries (OpenBLAS, MKL) only support these precisions. +#:set PCA_KINDS_TYPES = [("sp", "real(sp)"), ("dp", "real(dp)")] +submodule (stdlib_stats) stdlib_stats_pca + use stdlib_kinds, only: sp, dp + use stdlib_error, only: error_stop + use stdlib_optval, only: optval + use stdlib_linalg, only: svd, eigh + use stdlib_linalg_constants, only: ilp + use stdlib_linalg_blas, only: gemm + use stdlib_linalg_state, only: linalg_state_type, LINALG_ERROR + use stdlib_sorting, only: sort_index + implicit none + +contains + + #:for k1, t1 in PCA_KINDS_TYPES + module subroutine pca_${k1}$(x, components, singular_values, x_mean, & + method, overwrite_x, err) + ${t1}$, intent(inout), target :: x(:,:) + ${t1}$, intent(out) :: components(:,:) + real(${k1}$), intent(out) :: singular_values(:) + ${t1}$, intent(out), optional :: x_mean(:) + character(*), intent(in), optional :: method + logical, intent(in), optional :: overwrite_x + type(linalg_state_type), intent(out), optional :: err + + type(linalg_state_type) :: err0 + integer(ilp) :: n, p, i, j, k, m, n_s + ${t1}$, allocatable :: mu(:) + character(16) :: method_ + + n = size(x, 1, kind=ilp) + p = size(x, 2, kind=ilp) + k = size(components, 1, kind=ilp) + + method_ = optval(method, "svd") + + ! 1. Calculate mean using intrinsic sum (avoids submodule dependency issues) + allocate(mu(p)) + do j = 1, p + mu(j) = sum(x(:, j)) / real(n, ${k1}$) + end do + if (present(x_mean)) x_mean = mu + + if (method_ == "svd") then + ! 2. Center data and call SVD with temporaries for robustness + block + ${t1}$, allocatable :: s_tmp(:), vt_tmp(:,:) + n_s = min(n, p) + allocate(s_tmp(n_s), vt_tmp(n_s, p)) + + if (optval(overwrite_x, .false.)) then + do i = 1, n + x(i, :) = x(i, :) - mu + end do + call svd(x, s_tmp, vt=vt_tmp, overwrite_a=.true., full_matrices=.false., err=err0) + else + block + ${t1}$, allocatable :: x_centered(:,:) + allocate(x_centered(n, p)) + do i = 1, n + x_centered(i, :) = x(i, :) - mu + end do + call svd(x_centered, s_tmp, vt=vt_tmp, overwrite_a=.true., full_matrices=.false., err=err0) + end block + end if + + if (err0%ok()) then + m = min(size(components, 1, kind=ilp), n_s) + components(:m, :) = vt_tmp(:m, :) + m = min(size(singular_values, 1, kind=ilp), n_s) + singular_values(:m) = s_tmp(:m) + end if + end block + else if (method_ == "eig" .or. method_ == "cov") then + ! 3. Eigendecomposition of covariance matrix (computed inline) + block + ${t1}$, allocatable :: c(:,:), vectors(:,:), x_centered(:,:) + real(${k1}$), allocatable :: lambda(:), lambda_copy(:) + integer(ilp), allocatable :: idx(:) + real(${k1}$) :: scale_factor + + allocate(c(p, p), lambda(p), lambda_copy(p), idx(p), vectors(p, p)) + allocate(x_centered(n, p)) + + ! Center data + do i = 1, n + x_centered(i, :) = x(i, :) - mu + end do + + ! Compute covariance matrix: C = X^T * X / (n-1) + scale_factor = 1.0_${k1}$ / real(max(n-1, 1), ${k1}$) + do i = 1, p + do j = 1, p + c(i, j) = dot_product(x_centered(:, i), x_centered(:, j)) * scale_factor + end do + end do + + call eigh(c, lambda, vectors=vectors, err=err0) + + if (err0%ok()) then + ! Sort eigenvalues in descending order using stdlib_sorting + ! sort_index sorts in ascending order, so we negate values + lambda_copy = -lambda + call sort_index(lambda_copy, idx) + + ! Assign sorted results + m = min(size(components, 1, kind=ilp), p) + do i = 1, m + components(i, :) = vectors(:, idx(i)) + if (lambda(idx(i)) > 0.0_${k1}$) then + singular_values(i) = sqrt(lambda(idx(i)) * real(n-1, ${k1}$)) + else + singular_values(i) = 0.0_${k1}$ + end if + end do + end if + end block + else + err0 = linalg_state_type("pca", LINALG_ERROR, "Unknown method: "//method_) + end if + + ! Handle error state: return error or stop if err not present + call err0%handle(err) + + end subroutine pca_${k1}$ + #:endfor + + + #:for k1, t1 in PCA_KINDS_TYPES + module subroutine pca_transform_${k1}$(x, components, x_mean, x_transformed) + ${t1}$, intent(in) :: x(:,:) + ${t1}$, intent(in) :: components(:,:) + ${t1}$, intent(in), optional :: x_mean(:) + ${t1}$, intent(out) :: x_transformed(:,:) + + integer(ilp) :: i, n, p, nc + ${t1}$, allocatable :: x_centered(:,:) + ${t1}$, parameter :: alpha = 1.0_${k1}$, beta = 0.0_${k1}$ + + n = size(x, 1, kind=ilp) + p = size(x, 2, kind=ilp) + nc = size(components, 1, kind=ilp) + + allocate(x_centered(n, p)) + if (present(x_mean)) then + do i = 1, n + x_centered(i, :) = x(i, :) - x_mean + end do + else + x_centered = x + end if + + ! x_transformed = x_centered * components^T using GEMM + ! GEMM: C = alpha * op(A) * op(B) + beta * C + ! x_transformed(n, nc) = x_centered(n, p) * components(nc, p)^T + call gemm('N', 'T', n, nc, p, alpha, x_centered, n, components, nc, beta, x_transformed, n) + end subroutine pca_transform_${k1}$ + #:endfor + + + #:for k1, t1 in PCA_KINDS_TYPES + module subroutine pca_inverse_transform_${k1}$(x_reduced, components, x_mean, x_reconstructed) + ${t1}$, intent(in) :: x_reduced(:,:) + ${t1}$, intent(in) :: components(:,:) + ${t1}$, intent(in), optional :: x_mean(:) + ${t1}$, intent(out) :: x_reconstructed(:,:) + + integer(ilp) :: i, n, nc, p + ${t1}$, parameter :: alpha = 1.0_${k1}$, beta = 0.0_${k1}$ + + n = size(x_reduced, 1, kind=ilp) + nc = size(x_reduced, 2, kind=ilp) + p = size(components, 2, kind=ilp) + + ! x_reconstructed = x_reduced * components using GEMM + ! GEMM: C = alpha * op(A) * op(B) + beta * C + ! x_reconstructed(n, p) = x_reduced(n, nc) * components(nc, p) + call gemm('N', 'N', n, p, nc, alpha, x_reduced, n, components, nc, beta, x_reconstructed, n) + + if (present(x_mean)) then + do i = 1, n + x_reconstructed(i, :) = x_reconstructed(i, :) + x_mean + end do + end if + end subroutine pca_inverse_transform_${k1}$ + #:endfor + +end submodule stdlib_stats_pca diff --git a/test/stats/CMakeLists.txt b/test/stats/CMakeLists.txt index ff9d45063..3627508a0 100644 --- a/test/stats/CMakeLists.txt +++ b/test/stats/CMakeLists.txt @@ -8,12 +8,14 @@ set(fppFiles test_distribution_uniform.fypp test_distribution_normal.fypp test_distribution_exponential.fypp + test_pca.fypp ) fypp_f90("${fyppFlags}" "${fppFiles}" outFiles) ADDTEST(corr) ADDTEST(cov) +ADDTEST(pca) ADDTEST(mean) ADDTEST(median) ADDTEST(moment) diff --git a/test/stats/test_pca.fypp b/test/stats/test_pca.fypp new file mode 100644 index 000000000..2f0c8d05e --- /dev/null +++ b/test/stats/test_pca.fypp @@ -0,0 +1,55 @@ +#:include "common.fypp" +#:set PCA_KINDS = ["sp", "dp"] +program test_pca + use stdlib_error, only: check + use stdlib_kinds, only: sp, dp + use stdlib_stats, only: pca, pca_transform, pca_inverse_transform + use stdlib_linalg_state, only: linalg_state_type + implicit none + + #:for k1 in PCA_KINDS + real(${k1}$), parameter :: ${k1}$tol = 1000 * epsilon(1._${k1}$) + #:endfor + + #:for k1 in PCA_KINDS + call test_pca_${k1}$() + #:endfor + +contains + + #:for k1 in PCA_KINDS + subroutine test_pca_${k1}$() + real(${k1}$) :: x(3, 2), components(2, 2), s(2), mu(2) + real(${k1}$) :: x_trans(3, 2), x_inv(3, 2) + type(linalg_state_type) :: err + + ! Data: [1, 2], [3, 4], [5, 6] + x = reshape([1.0_${k1}$, 3.0_${k1}$, 5.0_${k1}$, 2.0_${k1}$, 4.0_${k1}$, 6.0_${k1}$], [3, 2]) + + ! Test SVD method + call pca(x, components, s, x_mean=mu, method="svd", err=err) + call check(err%ok(), "pca_${k1}$ svd err") + call check(all(abs(mu - [3.0_${k1}$, 4.0_${k1}$]) < ${k1}$tol), "pca_${k1}$ svd mean") + ! First component should be approx [0.707, 0.707] (or negative) + call check(abs(abs(components(1,1)) - 1.0_${k1}$/sqrt(2.0_${k1}$)) < ${k1}$tol, "pca_${k1}$ svd comp1") + call check(abs(s(1) - 4.0_${k1}$) < ${k1}$tol, "pca_${k1}$ svd s1") + call check(abs(s(2)) < ${k1}$tol, "pca_${k1}$ svd s2") + + ! Test Transform + call pca_transform(x, components, mu, x_trans) + ! Second dimension should be zero + call check(all(abs(x_trans(:, 2)) < ${k1}$tol), "pca_${k1}$ transform") + + ! Test Inverse Transform + call pca_inverse_transform(x_trans, components, mu, x_inv) + call check(all(abs(x_inv - x) < ${k1}$tol), "pca_${k1}$ inverse") + + ! Test EIG method + call pca(x, components, s, method="eig", err=err) + call check(err%ok(), "pca_${k1}$ eig err") + call check(abs(s(1) - 4.0_${k1}$) < ${k1}$tol, "pca_${k1}$ eig s1") + + end subroutine test_pca_${k1}$ + #:endfor + +end program test_pca