diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 0261e4c72c9..4b464ead2cb 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -448,6 +448,22 @@ struct llama_mmap::impl {
         mapped_fragments.emplace_back(0, file->size());
     }
 
+    void prefetch(size_t offset, size_t len) const {
+
+        int page_size = sysconf(_SC_PAGESIZE);
+        size_t last = offset + len;
+        align_range(&offset, &last, page_size);
+        size_t aligned_len = last - offset;
+
+        int err = posix_madvise((void*)((uint8_t *)addr + offset),
+                                aligned_len,
+                                POSIX_MADV_WILLNEED);
+        if (err != 0) {
+            LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
+                    strerror(err));
+        }
+    }
+
     static void align_range(size_t * first, size_t * last, size_t page_size) {
         size_t offset_in_page = *first & (page_size - 1);
         size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
@@ -587,6 +603,7 @@ size_t llama_mmap::size() const { return pimpl->size; }
 void * llama_mmap::addr() const { return pimpl->addr; }
 
 void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }
+void llama_mmap::prefetch(size_t offset, size_t len) const { pimpl->prefetch(offset, len); }
 
 #if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
 const bool llama_mmap::SUPPORTED = true;
diff --git a/src/llama-mmap.h b/src/llama-mmap.h
index 29ce4d24685..c5fbc70b11d 100644
--- a/src/llama-mmap.h
+++ b/src/llama-mmap.h
@@ -48,6 +48,7 @@ struct llama_mmap {
     void * addr() const;
 
     void unmap_fragment(size_t first, size_t last);
+    void prefetch(size_t offset, size_t len) const;
 
     static const bool SUPPORTED;
 
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 383b8dc7618..4139673ef1a 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -536,21 +536,15 @@ llama_model_loader::llama_model_loader(
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
     llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
     contexts.emplace_back(ctx);
 
-    if (use_mmap && use_direct_io) {
-        if (files.back()->has_direct_io()) {
-            // Disable mmap, as DirectIO is available
-            use_mmap = false;
-            LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
-        } else {
-            // Disable DirectIO and reopen file using std::fopen for mmap
-            use_direct_io = false;
-            files.pop_back();
-            files.emplace_back(new llama_file(fname.c_str(), "rb", false));
-            LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
-        }
+    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
+
+    if (use_direct_io && !files.back()->has_direct_io()) {
+        use_direct_io = false;
+        LLAMA_LOG_WARN("%s: direct I/O is not available, disabling\n", __func__);
+        files.pop_back();
+        files.emplace_back(new llama_file(fname.c_str(), "rb", false));
     }
 
     // Save tensors data offset of the main file.
@@ -997,7 +991,7 @@ bool llama_model_loader::load_all_data(
     std::vector<void *> host_ptrs;
     size_t buffer_idx = 0; // buffer to use for async loads
     ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
-        if (use_mmap || check_tensors) {
+        if (!use_direct_io || check_tensors) {
             return nullptr;
         }
         // When not using mmaped io use async uploads from pinned memory to GPU memory.
@@ -1092,7 +1086,7 @@ bool llama_model_loader::load_all_data(
 
             size_t n_size = ggml_nbytes(cur);
 
-            if (use_mmap) {
+            if (use_mmap && (!use_direct_io || !upload_backend || ggml_backend_buffer_is_host(cur->buffer))) {
                 const auto & mapping = mappings.at(weight->idx);
                 ggml_backend_buffer_t buf_mmap = nullptr;
                 if (bufs.count(weight->idx)) {
@@ -1108,6 +1102,8 @@ bool llama_model_loader::load_all_data(
 
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
+                    mapping->prefetch(weight->offs, n_size);
+
                     ggml_backend_tensor_alloc(buf_mmap, cur, data);
                     if (lmlocks) {
                         const auto & lmlock = lmlocks->at(weight->idx);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b58b35a4268..dfdfd8101e3 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -6985,7 +6985,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     ml.done_getting_tensors();
 
-    ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
+    ml.init_mappings(false, use_mlock ? &pimpl->mlock_mmaps : nullptr);
     pimpl->mappings.reserve(ml.mappings.size());
 
     // create the backend buffers
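For reference, below is a minimal standalone sketch of the pattern the new prefetch() helper is built on: page-align a byte range inside an mmapped file, then hint the kernel with posix_madvise(POSIX_MADV_WILLNEED). It is not part of the patch; the file name "model.gguf", the offset, and the 1 MiB window are placeholder values, and the alignment here simply rounds the range outward to page boundaries instead of reusing the existing align_range helper.

// sketch: prefetch a window of an mmapped file with POSIX_MADV_WILLNEED
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main() {
    int fd = open("model.gguf", O_RDONLY);   // placeholder file name
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }

    void * addr = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (addr == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

    // ask the kernel to read ahead a 1 MiB window starting at byte 4096
    size_t offset = 4096;
    size_t last   = offset + (1u << 20);
    size_t page   = (size_t) sysconf(_SC_PAGESIZE);
    offset &= ~(page - 1);                      // round the start down to a page boundary
    last    = (last + page - 1) & ~(page - 1);  // round the end up to a page boundary

    int err = posix_madvise((char *) addr + offset, last - offset, POSIX_MADV_WILLNEED);
    if (err != 0) {
        // posix_madvise returns the error number directly rather than setting errno
        fprintf(stderr, "posix_madvise failed: %s\n", strerror(err));
    }

    munmap(addr, st.st_size);
    close(fd);
    return 0;
}

The advice is only a hint: the call returns immediately and the kernel is free to read the pages ahead asynchronously, which is what lets the loader overlap page-ins with tensor allocation.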