17 changes: 17 additions & 0 deletions src/llama-mmap.cpp
@@ -448,6 +448,22 @@ struct llama_mmap::impl {
mapped_fragments.emplace_back(0, file->size());
}

void prefetch(size_t offset, size_t len) const {
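// advise the kernel that the page-aligned part of [offset, offset + len) within the mapping will be needed soon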

int page_size = sysconf(_SC_PAGESIZE);
size_t last = offset + len;
align_range(&offset, &last, page_size);
size_t aligned_len = last - offset;

int err = posix_madvise((void*)((uint8_t *)addr + offset),
aligned_len,
POSIX_MADV_WILLNEED);
if (err != 0) {
LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
strerror(err));
}
}

static void align_range(size_t * first, size_t * last, size_t page_size) {
size_t offset_in_page = *first & (page_size - 1);
size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
@@ -587,6 +603,7 @@ size_t llama_mmap::size() const { return pimpl->size; }
void * llama_mmap::addr() const { return pimpl->addr; }

void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }
void llama_mmap::prefetch(size_t offset, size_t len) const { pimpl->prefetch(offset, len); }

#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
const bool llama_mmap::SUPPORTED = true;
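For context, a minimal standalone sketch (not part of this patch) of the technique the new prefetch() applies: clamp the requested byte range to page boundaries with the same arithmetic as the existing align_range helper, then ask the kernel to populate the page cache ahead of use via posix_madvise. The guard for ranges that collapse after alignment is an addition of this sketch, and the helper names are illustrative.

#include <sys/mman.h>
#include <unistd.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Same arithmetic as llama_mmap::impl::align_range: round *first up and *last
// down so the range only covers whole pages inside [first, last).
static void align_range(size_t * first, size_t * last, size_t page_size) {
    size_t offset_in_page = *first & (page_size - 1);
    size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
    *first += offset_to_page;
    *last = *last & ~(page_size - 1);
}

// Hint the kernel that a sub-range of the mapping starting at `base` will be
// read soon, so the pages can be faulted in before the first access.
static void prefetch_range(void * base, size_t offset, size_t len) {
    size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
    size_t last = offset + len;
    align_range(&offset, &last, page_size);
    if (last <= offset) {
        return; // nothing page-aligned left to advise
    }
    int err = posix_madvise((uint8_t *) base + offset, last - offset, POSIX_MADV_WILLNEED);
    if (err != 0) {
        fprintf(stderr, "posix_madvise(POSIX_MADV_WILLNEED) failed: %s\n", strerror(err));
    }
}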
1 change: 1 addition & 0 deletions src/llama-mmap.h
@@ -48,6 +48,7 @@ struct llama_mmap {
void * addr() const;

void unmap_fragment(size_t first, size_t last);
void prefetch(size_t offset, size_t len) const;

static const bool SUPPORTED;

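A hedged usage sketch of the new declaration: before copying a tensor's bytes out of the mapping, the caller hints the kernel at the range it is about to touch. The function and variable names below are illustrative, not the actual loader code.

#include <cstdint>
#include <cstring>
#include "llama-mmap.h"

// Illustrative only: prefetch a tensor's byte range, then copy it out of the
// mapping. `offs` and `nbytes` locate the tensor's data within the file.
static void copy_tensor_from_mapping(llama_mmap & mapping, size_t offs, size_t nbytes, void * dst) {
    mapping.prefetch(offs, nbytes);   // kernel can start faulting pages in now
    const uint8_t * src = (const uint8_t *) mapping.addr() + offs;
    memcpy(dst, src, nbytes);         // the copy then hits warm page cache
}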
26 changes: 11 additions & 15 deletions src/llama-model-loader.cpp
@@ -536,21 +536,15 @@ llama_model_loader::llama_model_loader(
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));

files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
contexts.emplace_back(ctx);

if (use_mmap && use_direct_io) {
if (files.back()->has_direct_io()) {
// Disable mmap, as DirectIO is available
use_mmap = false;
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
} else {
// Disable DirectIO and reopen file using std::fopen for mmap
use_direct_io = false;
files.pop_back();
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
}
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));

if (use_direct_io && !files.back()->has_direct_io()) {
use_direct_io = false;
LLAMA_LOG_WARN("%s: direct I/O is not available, disabling\n", __func__);
files.pop_back();
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
}

// Save tensors data offset of the main file.
@@ -997,7 +991,7 @@ bool llama_model_loader::load_all_data(
std::vector<void *> host_ptrs;
size_t buffer_idx = 0; // buffer to use for async loads
ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
if (use_mmap || check_tensors) {
if (!use_direct_io || check_tensors) {
return nullptr;
}
// When not using mmaped io use async uploads from pinned memory to GPU memory.
@@ -1092,7 +1086,7 @@

size_t n_size = ggml_nbytes(cur);

if (use_mmap) {
if (use_mmap && (!use_direct_io || !upload_backend || ggml_backend_buffer_is_host(cur->buffer))) {
const auto & mapping = mappings.at(weight->idx);
ggml_backend_buffer_t buf_mmap = nullptr;
if (bufs.count(weight->idx)) {
@@ -1108,6 +1102,8 @@

GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
if (buf_mmap && cur->data == nullptr) {
mapping->prefetch(weight->offs, n_size);

ggml_backend_tensor_alloc(buf_mmap, cur, data);
if (lmlocks) {
const auto & lmlock = lmlocks->at(weight->idx);
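To make the new gating condition in load_all_data easier to follow, here is a paraphrase of the per-tensor routing decision (names simplified; this is not the actual loader code): a tensor is served through the mmap path unless direct I/O is active, an async upload backend is available, and the destination buffer is not in host memory.

enum class load_path {
    MMAP_COPY,   // take the data pointer from the mapping (now preceded by prefetch)
    FILE_READ,   // read from the file, possibly via direct I/O plus async GPU upload
};

// Paraphrase of `if (use_mmap && (!use_direct_io || !upload_backend ||
// ggml_backend_buffer_is_host(cur->buffer)))` from the hunk above.
static load_path pick_load_path(bool use_mmap, bool use_direct_io,
                                bool has_upload_backend, bool dst_is_host_buffer) {
    if (use_mmap && (!use_direct_io || !has_upload_backend || dst_is_host_buffer)) {
        return load_path::MMAP_COPY;
    }
    return load_path::FILE_READ;
}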
2 changes: 1 addition & 1 deletion src/llama-model.cpp
@@ -6985,7 +6985,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

ml.done_getting_tensors();

ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
ml.init_mappings(false, use_mlock ? &pimpl->mlock_mmaps : nullptr);
pimpl->mappings.reserve(ml.mappings.size());

// create the backend buffers
Expand Down