
tseries class #294

Open

kostrzewa wants to merge 33 commits into master from tseries

Commits (33)
18b1b36
have explicit source file for generic functions
kostrzewa Jul 28, 2020
2540be1
use .Rdata as extension in analysis_online
kostrzewa Jul 28, 2020
9475263
begin work on 'tseries' class
kostrzewa Jul 28, 2020
3d95596
vignette in which the new 'tseries' class is applied to study the gra…
kostrzewa Jul 28, 2020
eeaa93a
add 'tseries' vignette
kostrzewa Jul 28, 2020
4c88db7
In the analysis of online measurements, indicate M_ps extracted from …
kostrzewa Apr 7, 2021
02afcc0
store WIP of timeseries toolset and new implementation of gradient fl…
kostrzewa Jul 21, 2021
5bdf9d9
two digits for error on scale in gradient flow evolution plots
kostrzewa Jul 21, 2021
c87a928
print statement which shouldn't exist in readcmifiles
kostrzewa Dec 15, 2021
aa4cdab
add to parameters of as a way to have different starting points bet…
kostrzewa Jan 13, 2022
85f11a8
speed up readcmifiles by a factor of more than 100
kostrzewa Mar 5, 2022
ce93f04
Merge remote-tracking branch 'origin/master' into tseries
kostrzewa Mar 7, 2022
5aa3c81
resolve documentation warnings due to missing names in R/hadron-packa…
kostrzewa Mar 7, 2022
8550080
speed up gradient flow I/O by about a factor of 200
kostrzewa Mar 7, 2022
c26cbc2
remove 'skip' from analysis_online, replace this functionality for 'o…
kostrzewa Apr 20, 2022
73b407c
be more helpful when omeas.offset is used and no trajectories are found
kostrzewa Apr 20, 2022
597efe4
note dependency on reshape2
kostrzewa Jul 23, 2022
7d7d38f
account for dimesions of gradient flow reference data
kostrzewa Jul 23, 2022
9e8d133
add `trajectory_length` parameter for `analysis_online` which can be …
kostrzewa Jul 23, 2022
accc42c
in analysis_tmlqcd_gradient_flow, plot and summarize the topological …
kostrzewa Jul 26, 2022
b8690b9
draw error rectangles for mpcac plateau only if the error is not too …
kostrzewa Aug 20, 2022
e642956
introduce some restrictions on the plotting of error polygons in the …
kostrzewa Nov 11, 2022
7288fbc
add more documentation to 'analysis_tmlqcd_gradient_flow'
kostrzewa Dec 16, 2022
ade423b
typo in evals.stepsize
kostrzewa Jan 19, 2023
34272fc
Allow the definition point for gluonic scales (scale_definition) to b…
kostrzewa Jan 20, 2023
ed17427
parenthesis problem in analysis_online
kostrzewa Jan 21, 2023
cafec48
do not use the bootstrap samples to estimate the mean of mpi_ov_fpi
kostrzewa Nov 18, 2023
306fa22
in boostrap.nlsfit, make sure that x divides y
kostrzewa Feb 4, 2024
4a4809e
generalise the generator functions for CVC / meson_2pt keys to suppor…
kostrzewa Feb 4, 2024
3018fd4
nyom HDF5 files differ from CVC HDF5 files in that they use a compoun…
kostrzewa Feb 4, 2024
065c2b2
do not use 'sample', use 'istoch' instead
kostrzewa Feb 5, 2024
94922b3
remove unused dbg parameter
kostrzewa Feb 9, 2024
8824f0f
ensure that timeseries dat contains 'y' and 't'
kostrzewa Feb 9, 2024
R/readutils.R (127 changes: 68 additions & 59 deletions)
@@ -211,6 +211,22 @@ getorderedconfignumbers <- function(path="./", basename="onlinemeas", last.digit
#' @export readcmifiles
readcmifiles <- function(files, excludelist=c(""), skip, verbose=FALSE,
colClasses, obs=NULL, obs.index, avg=1, stride=1) {

# use lapply as our default and switch to pbmclapply if available below
my_lapply <- lapply

pbmcapply_avail <- requireNamespace('pbmcapply')
if(pbmcapply_avail){
# when verbose output is desired, we disable the progress bar and run with a single thread
if(!verbose) {
my_lapply <- function(...){ pbmcapply::pbmclapply(ignore.interactive=TRUE, mc.preschedule=TRUE, ...) }
} else {
message("readcmifiles called with `verbose=TRUE`, I/O will be single-threaded!\n")
}
} else {
warning("pbmcapply package not found, I/O will be single-threaded!\n")
}

if(missing(files)) {
stop("readcmifiles: filelist missing, aborting...\n")
}
@@ -227,72 +243,65 @@ readcmifiles <- function(files, excludelist=c(""), skip, verbose=FALSE,
# when stride is > 1, we only read a subset of files
nFilesToLoad <- as.integer(nFiles/stride)
nCols <- length(tmpdata)
## we generate the full size data.frame first
tmpdata[,] <- NA
ldata <- tmpdata
ldata[((nFilesToLoad-1)*fLength+1):(nFilesToLoad*fLength),] <- tmpdata
# create a progress bar

pb <- NULL
if(!verbose) {
pb <- txtProgressBar(min = 0, max = nFiles, style = 3)
}
for(i in c(1:nFiles)) {
# update progress bar
if(!verbose) {
setTxtProgressBar(pb, i)
}
if( !(files[i] %in% excludelist) && file.exists(files[i]) && (i-1) %% stride == 0) {

if(verbose) {
message("Reading from file ", files[i], "\n")
}
## read the data
tmpdata <- read.table(files[i], colClasses=colClasses, skip=skip)
if(reduce) {
tmpdata <- tmpdata[tmpdata[,obs.index] %in% obs,]
}
## sanity checks
if(fLength < length(tmpdata$V1)) {
warning(paste("readcmifiles: file ", files[i], " does not have the same length as the other files. We will cut and hope...\n"))
}
else if(fLength > length(tmpdata$V1)) {
stop(paste("readcmifiles: file", files[i], "is too short. Aborting...\n"))
dat <- my_lapply(
kostrzewa (Member, Author) commented:

@urbach if you're curious, this pattern of iterating in parallel using mclapply (or in this case pbmclapply) over the files and binding the results together with do.call(rbind, dat) in line 281 below is about a factor of 100 faster than building the data frame ahead of time and replacing it bit by bit with the read data.

kostrzewa (Member, Author) commented:

Even single-threaded, it is many many times faster.

X=1:nFiles,
FUN=function(i){
if( !(files[i] %in% excludelist) && file.exists(files[i]) && (i-1) %% stride == 0) {

if(verbose) {
message("Reading from file ", files[i], "\n")
}
## read the data
tmpdata <- read.table(files[i], colClasses=colClasses, skip=skip)
if(reduce) {
tmpdata <- tmpdata[tmpdata[,obs.index] %in% obs,]
}
## sanity checks
if(fLength < length(tmpdata$V1)) {
warning(paste("readcmifiles: file ", files[i], " does not have the same length as the other files. We will cut and hope...\n"))
}
else if(fLength > length(tmpdata$V1)) {
stop(paste("readcmifiles: file", files[i], "is too short. Aborting...\n"))
}
if(nCols != length(tmpdata)) {
stop(paste("readcmifiles: file", files[i], "does not have the same number of columns as the other files. Aborting...\n"))
}

dat_idx_start <- ((i-1)/stride*fLength) + 1
dat_idx_stop <- dat_idx_start+fLength-1
return(tmpdata)
}
if(nCols != length(tmpdata)) {
stop(paste("readcmifiles: file", files[i], "does not have the same number of columns as the other files. Aborting...\n"))
else if(!file.exists(files[i])) {
stop(paste("readcmifiles: dropped file", files[i], "because it's missing\n"))
}

dat_idx_start <- ((i-1)/stride*fLength) + 1
dat_idx_stop <- dat_idx_start+fLength-1
ldata[dat_idx_start:dat_idx_stop,] <- tmpdata
rm(tmpdata)
}
else if(!file.exists(files[i])) {
warning(paste("readcmifiles: dropped file", files[i], "because it's missing\n"))
}
}
if(!verbose) {
close(pb)
}
})

# bind the data together
ldata <- do.call(rbind, dat)

# if we want to average over successive samples, we do this here
if(avg > 1){
for(i in seq(1,nFilesToLoad,by=avg)){
out_idx_start <- (i-1)*fLength + 1
out_idx_stop <- i*fLength
for(j in seq(i+1,i+avg-1)){
avg_idx_start <- (j-1)*fLength + 1
avg_idx_stop <- j*fLength
# add the next sample to the sample that we use as an output base
ldata[out_idx_start:out_idx_stop,] <- ldata[out_idx_start:out_idx_stop,] +
ldata[avg_idx_start:avg_idx_stop,]
# invalidate the sample that we just added
ldata[avg_idx_start:avg_idx_stop,] <- NA
dat <- my_lapply(
X=seq(1,nFilesToLoad,by=avg),
FUN=function(i){
out_idx_start <- (i-1)*fLength + 1
out_idx_stop <- i*fLength
tmpdata <- ldata[out_idx_start:out_idx_stop,]
for(j in seq(i+1,i+avg-1)){
avg_idx_start <- (j-1)*fLength + 1
avg_idx_stop <- j*fLength
# add the next sample to the sample that we use as an output base
tmpdata <- tmpdata +
ldata[avg_idx_start:avg_idx_stop,]
}
# take the average over the samples
tmpdata <- tmpdata/avg
return(tmpdata)
}
# take the average over the samples
ldata[out_idx_start:out_idx_stop,] <- ldata[out_idx_start:out_idx_stop,]/avg
}
)
# bind the data together
ldata <- do.call(rbind,dat)
}
## remove NAs from missing files
ldata <- na.omit(ldata)
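
For readers of this diff, the pattern the review comments above describe, namely reading each file independently inside a (possibly parallel) lapply and binding the chunks once at the end, can be distilled into a short standalone sketch. This is not the PR's code: read_all is an invented helper and the file handling is simplified, but the pbmcapply fallback mirrors the diff.

# Minimal sketch of the read-then-rbind pattern (not the PR's exact code);
# read_all is an invented name, files are plain whitespace-separated tables.
read_all <- function(files, skip = 0) {
  # serial lapply by default; use the parallel, progress-bar variant from
  # pbmcapply when the package is installed
  my_lapply <- lapply
  if (requireNamespace("pbmcapply", quietly = TRUE)) {
    my_lapply <- function(...) {
      pbmcapply::pbmclapply(ignore.interactive = TRUE, mc.preschedule = TRUE, ...)
    }
  }
  dat <- my_lapply(
    X = seq_along(files),
    FUN = function(i) {
      if (!file.exists(files[i])) return(NULL)  # missing files yield NULL
      read.table(files[i], skip = skip)
    }
  )
  # bind once at the end; do.call(rbind, ...) silently drops the NULLs
  do.call(rbind, dat)
}

The speedup the author quotes is plausible because the old loop's sub-assignment ldata[dat_idx_start:dat_idx_stop,] <- tmpdata copies the large preallocated data.frame for every file read, whereas collecting the chunks in a list and binding once touches each row only once.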
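The reworked avg > 1 branch applies the same list-then-rbind idea to block averaging. A self-contained sketch of just that arithmetic, with invented values for fLength, avg and the data:

# Average each group of `avg` consecutive file-sized blocks of rows
# (fLength rows per file; the numbers here are invented for illustration).
fLength <- 4
avg     <- 2
ldata   <- data.frame(V1 = rnorm(8 * fLength))  # 8 file-sized blocks

dat <- lapply(
  X = seq(1, nrow(ldata) / fLength, by = avg),
  FUN = function(i) {
    out <- ldata[((i - 1) * fLength + 1):(i * fLength), , drop = FALSE]
    # accumulate the remaining avg-1 blocks of the group
    for (j in seq(i + 1, i + avg - 1)) {
      out <- out + ldata[((j - 1) * fLength + 1):(j * fLength), , drop = FALSE]
    }
    out / avg  # elementwise mean over the avg samples
  }
)
averaged <- do.call(rbind, dat)  # 4 averaged blocks of fLength rows each

Each list element is the elementwise mean of avg consecutive file-sized blocks, matching what the previous in-place version computed before invalidating the consumed blocks with NA.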