Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/stats/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
set(stats_cppFiles
../stdlib_linalg_constants.fypp
../stdlib_sorting.fypp
../stdlib_sorting_ord_sort.fypp
../stdlib_sorting_sort_adjoint.fypp
../stdlib_sorting_sort.fypp
)

set(stats_fppFiles
Expand All @@ -10,12 +14,15 @@ set(stats_fppFiles
../stdlib_error.fypp
../stdlib_linalg.fypp
../stdlib_linalg_diag.fypp
../stdlib_linalg_eigenvalues.fypp
../stdlib_linalg_state.fypp
../stdlib_linalg_svd.fypp
../stdlib_random.fypp
../stdlib_selection.fypp
../stdlib_string_type.fypp
stdlib_stats_corr.fypp
stdlib_stats_cov.fypp
stdlib_stats_pca.fypp
stdlib_stats_distribution_exponential.fypp
stdlib_stats_distribution_normal.fypp
stdlib_stats_distribution_uniform.fypp
Expand All @@ -29,4 +36,9 @@ set(stats_fppFiles
stdlib_stats_var.fypp
)

set(f90Files
../stdlib_sorting_radix_sort.f90
)

configure_stdlib_target(stats "" stats_fppFiles stats_cppFiles)
target_link_libraries(stats PUBLIC blas lapack $<$<NOT:$<BOOL:${STDLIB_NO_BITSET}>>:bitsets>)
60 changes: 60 additions & 0 deletions src/stats/stdlib_stats.fypp
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ module stdlib_stats
!! ([Specification](../page/specs/stdlib_stats.html))
use stdlib_kinds, only: sp, dp, xdp, qp, &
int8, int16, int32, int64
use stdlib_linalg_state, only: linalg_state_type
implicit none
private
! Public API
public :: corr, cov, mean, median, moment, var
public :: pca, pca_transform, pca_inverse_transform


interface corr
Expand Down Expand Up @@ -639,4 +641,62 @@ module stdlib_stats
#:endfor
end interface moment


#! Note: PCA is limited to single (sp) and double (dp) precision because external
#! optimized BLAS/LAPACK libraries (OpenBLAS, MKL) only support these precisions.
#! Extended (xdp) and quadruple (qp) precision are not supported for PCA.
#:set PCA_KINDS_TYPES = [("sp", "real(sp)"), ("dp", "real(dp)")]

interface pca
!! version: experimental
!!
!! Principal Component Analysis (PCA)
!! ([Specification](../page/specs/stdlib_stats.html#pca))
#:for k1, t1 in PCA_KINDS_TYPES
module subroutine pca_${k1}$(x, components, singular_values, x_mean, &
method, overwrite_x, err)
${t1}$, intent(inout), target :: x(:,:)
${t1}$, intent(out) :: components(:,:)
real(${k1}$), intent(out) :: singular_values(:)
${t1}$, intent(out), optional :: x_mean(:)
character(*), intent(in), optional :: method
logical, intent(in), optional :: overwrite_x
type(linalg_state_type), intent(out), optional :: err
end subroutine pca_${k1}$
#:endfor
end interface pca


interface pca_transform
!! version: experimental
!!
!! Projects data into the reduced dimensional space
!! ([Specification](../page/specs/stdlib_stats.html#pca_transform))
#:for k1, t1 in PCA_KINDS_TYPES
module subroutine pca_transform_${k1}$(x, components, x_mean, x_transformed)
${t1}$, intent(in) :: x(:,:)
${t1}$, intent(in) :: components(:,:)
${t1}$, intent(in), optional :: x_mean(:)
${t1}$, intent(out) :: x_transformed(:,:)
end subroutine pca_transform_${k1}$
#:endfor
end interface pca_transform


interface pca_inverse_transform
!! version: experimental
!!
!! Reconstructs original data from the reduced space
!! ([Specification](../page/specs/stdlib_stats.html#pca_inverse_transform))
#:for k1, t1 in PCA_KINDS_TYPES
module subroutine pca_inverse_transform_${k1}$(x_reduced, components, x_mean, x_reconstructed)
${t1}$, intent(in) :: x_reduced(:,:)
${t1}$, intent(in) :: components(:,:)
${t1}$, intent(in), optional :: x_mean(:)
${t1}$, intent(out) :: x_reconstructed(:,:)
end subroutine pca_inverse_transform_${k1}$
#:endfor
end interface pca_inverse_transform


end module stdlib_stats
191 changes: 191 additions & 0 deletions src/stats/stdlib_stats_pca.fypp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#:include "common.fypp"
#! Note: PCA is limited to single (sp) and double (dp) precision because external
#! optimized BLAS/LAPACK libraries (OpenBLAS, MKL) only support these precisions.
#:set PCA_KINDS_TYPES = [("sp", "real(sp)"), ("dp", "real(dp)")]
submodule (stdlib_stats) stdlib_stats_pca
use stdlib_kinds, only: sp, dp
use stdlib_error, only: error_stop
use stdlib_optval, only: optval
use stdlib_linalg, only: svd, eigh
use stdlib_linalg_constants, only: ilp
use stdlib_linalg_blas, only: gemm
use stdlib_linalg_state, only: linalg_state_type, LINALG_ERROR
use stdlib_sorting, only: sort_index
implicit none

contains

#:for k1, t1 in PCA_KINDS_TYPES
module subroutine pca_${k1}$(x, components, singular_values, x_mean, &
method, overwrite_x, err)
${t1}$, intent(inout), target :: x(:,:)
${t1}$, intent(out) :: components(:,:)
real(${k1}$), intent(out) :: singular_values(:)
${t1}$, intent(out), optional :: x_mean(:)
character(*), intent(in), optional :: method
logical, intent(in), optional :: overwrite_x
type(linalg_state_type), intent(out), optional :: err

type(linalg_state_type) :: err0
integer(ilp) :: n, p, i, j, k, m, n_s
${t1}$, allocatable :: mu(:)
character(16) :: method_

n = size(x, 1, kind=ilp)
p = size(x, 2, kind=ilp)
k = size(components, 1, kind=ilp)

method_ = optval(method, "svd")

! 1. Calculate mean using intrinsic sum (avoids submodule dependency issues)
allocate(mu(p))
do j = 1, p
mu(j) = sum(x(:, j)) / real(n, ${k1}$)
end do
if (present(x_mean)) x_mean = mu

if (method_ == "svd") then
! 2. Center data and call SVD with temporaries for robustness
block
${t1}$, allocatable :: s_tmp(:), vt_tmp(:,:)
n_s = min(n, p)
allocate(s_tmp(n_s), vt_tmp(n_s, p))

if (optval(overwrite_x, .false.)) then
do i = 1, n
x(i, :) = x(i, :) - mu
end do
call svd(x, s_tmp, vt=vt_tmp, overwrite_a=.true., full_matrices=.false., err=err0)
else
block
${t1}$, allocatable :: x_centered(:,:)
allocate(x_centered(n, p))
do i = 1, n
x_centered(i, :) = x(i, :) - mu
end do
call svd(x_centered, s_tmp, vt=vt_tmp, overwrite_a=.true., full_matrices=.false., err=err0)
end block
end if

if (err0%ok()) then
m = min(size(components, 1, kind=ilp), n_s)
components(:m, :) = vt_tmp(:m, :)
m = min(size(singular_values, 1, kind=ilp), n_s)
singular_values(:m) = s_tmp(:m)
end if
end block
else if (method_ == "eig" .or. method_ == "cov") then
! 3. Eigendecomposition of covariance matrix (computed inline)
block
${t1}$, allocatable :: c(:,:), vectors(:,:), x_centered(:,:)
real(${k1}$), allocatable :: lambda(:), lambda_copy(:)
integer(ilp), allocatable :: idx(:)
real(${k1}$) :: scale_factor

allocate(c(p, p), lambda(p), lambda_copy(p), idx(p), vectors(p, p))
allocate(x_centered(n, p))

! Center data
do i = 1, n
x_centered(i, :) = x(i, :) - mu
end do

! Compute covariance matrix: C = X^T * X / (n-1)
scale_factor = 1.0_${k1}$ / real(max(n-1, 1), ${k1}$)
do i = 1, p
do j = 1, p
c(i, j) = dot_product(x_centered(:, i), x_centered(:, j)) * scale_factor
end do
end do

call eigh(c, lambda, vectors=vectors, err=err0)

if (err0%ok()) then
! Sort eigenvalues in descending order using stdlib_sorting
! sort_index sorts in ascending order, so we negate values
lambda_copy = -lambda
call sort_index(lambda_copy, idx)

! Assign sorted results
m = min(size(components, 1, kind=ilp), p)
do i = 1, m
components(i, :) = vectors(:, idx(i))
if (lambda(idx(i)) > 0.0_${k1}$) then
singular_values(i) = sqrt(lambda(idx(i)) * real(n-1, ${k1}$))
else
singular_values(i) = 0.0_${k1}$
end if
end do
end if
end block
else
err0 = linalg_state_type("pca", LINALG_ERROR, "Unknown method: "//method_)
end if

! Handle error state: return error or stop if err not present
call err0%handle(err)

end subroutine pca_${k1}$
#:endfor


#:for k1, t1 in PCA_KINDS_TYPES
module subroutine pca_transform_${k1}$(x, components, x_mean, x_transformed)
${t1}$, intent(in) :: x(:,:)
${t1}$, intent(in) :: components(:,:)
${t1}$, intent(in), optional :: x_mean(:)
${t1}$, intent(out) :: x_transformed(:,:)

integer(ilp) :: i, n, p, nc
${t1}$, allocatable :: x_centered(:,:)
${t1}$, parameter :: alpha = 1.0_${k1}$, beta = 0.0_${k1}$

n = size(x, 1, kind=ilp)
p = size(x, 2, kind=ilp)
nc = size(components, 1, kind=ilp)

allocate(x_centered(n, p))
if (present(x_mean)) then
do i = 1, n
x_centered(i, :) = x(i, :) - x_mean
end do
else
x_centered = x
end if

! x_transformed = x_centered * components^T using GEMM
! GEMM: C = alpha * op(A) * op(B) + beta * C
! x_transformed(n, nc) = x_centered(n, p) * components(nc, p)^T
call gemm('N', 'T', n, nc, p, alpha, x_centered, n, components, nc, beta, x_transformed, n)
end subroutine pca_transform_${k1}$
#:endfor


#:for k1, t1 in PCA_KINDS_TYPES
module subroutine pca_inverse_transform_${k1}$(x_reduced, components, x_mean, x_reconstructed)
${t1}$, intent(in) :: x_reduced(:,:)
${t1}$, intent(in) :: components(:,:)
${t1}$, intent(in), optional :: x_mean(:)
${t1}$, intent(out) :: x_reconstructed(:,:)

integer(ilp) :: i, n, nc, p
${t1}$, parameter :: alpha = 1.0_${k1}$, beta = 0.0_${k1}$

n = size(x_reduced, 1, kind=ilp)
nc = size(x_reduced, 2, kind=ilp)
p = size(components, 2, kind=ilp)

! x_reconstructed = x_reduced * components using GEMM
! GEMM: C = alpha * op(A) * op(B) + beta * C
! x_reconstructed(n, p) = x_reduced(n, nc) * components(nc, p)
call gemm('N', 'N', n, p, nc, alpha, x_reduced, n, components, nc, beta, x_reconstructed, n)

if (present(x_mean)) then
do i = 1, n
x_reconstructed(i, :) = x_reconstructed(i, :) + x_mean
end do
end if
end subroutine pca_inverse_transform_${k1}$
#:endfor

end submodule stdlib_stats_pca
2 changes: 2 additions & 0 deletions test/stats/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@ set(fppFiles
test_distribution_uniform.fypp
test_distribution_normal.fypp
test_distribution_exponential.fypp
test_pca.fypp
)

fypp_f90("${fyppFlags}" "${fppFiles}" outFiles)

ADDTEST(corr)
ADDTEST(cov)
ADDTEST(pca)
ADDTEST(mean)
ADDTEST(median)
ADDTEST(moment)
Expand Down
55 changes: 55 additions & 0 deletions test/stats/test_pca.fypp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#:include "common.fypp"
#:set PCA_KINDS = ["sp", "dp"]
program test_pca
use stdlib_error, only: check
use stdlib_kinds, only: sp, dp
use stdlib_stats, only: pca, pca_transform, pca_inverse_transform
use stdlib_linalg_state, only: linalg_state_type
implicit none

#:for k1 in PCA_KINDS
real(${k1}$), parameter :: ${k1}$tol = 1000 * epsilon(1._${k1}$)
#:endfor

#:for k1 in PCA_KINDS
call test_pca_${k1}$()
#:endfor

contains

#:for k1 in PCA_KINDS
subroutine test_pca_${k1}$()
real(${k1}$) :: x(3, 2), components(2, 2), s(2), mu(2)
real(${k1}$) :: x_trans(3, 2), x_inv(3, 2)
type(linalg_state_type) :: err

! Data: [1, 2], [3, 4], [5, 6]
x = reshape([1.0_${k1}$, 3.0_${k1}$, 5.0_${k1}$, 2.0_${k1}$, 4.0_${k1}$, 6.0_${k1}$], [3, 2])

! Test SVD method
call pca(x, components, s, x_mean=mu, method="svd", err=err)
call check(err%ok(), "pca_${k1}$ svd err")
call check(all(abs(mu - [3.0_${k1}$, 4.0_${k1}$]) < ${k1}$tol), "pca_${k1}$ svd mean")
! First component should be approx [0.707, 0.707] (or negative)
call check(abs(abs(components(1,1)) - 1.0_${k1}$/sqrt(2.0_${k1}$)) < ${k1}$tol, "pca_${k1}$ svd comp1")
call check(abs(s(1) - 4.0_${k1}$) < ${k1}$tol, "pca_${k1}$ svd s1")
call check(abs(s(2)) < ${k1}$tol, "pca_${k1}$ svd s2")

! Test Transform
call pca_transform(x, components, mu, x_trans)
! Second dimension should be zero
call check(all(abs(x_trans(:, 2)) < ${k1}$tol), "pca_${k1}$ transform")

! Test Inverse Transform
call pca_inverse_transform(x_trans, components, mu, x_inv)
call check(all(abs(x_inv - x) < ${k1}$tol), "pca_${k1}$ inverse")

! Test EIG method
call pca(x, components, s, method="eig", err=err)
call check(err%ok(), "pca_${k1}$ eig err")
call check(abs(s(1) - 4.0_${k1}$) < ${k1}$tol, "pca_${k1}$ eig s1")

end subroutine test_pca_${k1}$
#:endfor

end program test_pca
Loading