From 592387fecbc6aa4fb1e8d1b9155e725d6ae336cf Mon Sep 17 00:00:00 2001 From: ak2k <19240940+ak2k@users.noreply.github.com> Date: Fri, 27 Jun 2025 12:11:24 -0400 Subject: [PATCH 1/5] Add ClientContext parameter to SQLiteDB::Open for future extensibility Also adds yum support for sqlite3 installation in CI. --- Makefile | 2 +- src/include/sqlite_db.hpp | 2 ++ src/sqlite_db.cpp | 5 +++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e85d93b..fc85aba 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ include extension-ci-tools/makefiles/duckdb_extension.Makefile # Setup the sqlite3 tpch database data/db/tpch.db: release - command -v sqlite3 || (command -v brew && brew install sqlite) || (command -v choco && choco install sqlite -y) || (command -v apt-get && apt-get install -y sqlite3) || (command -v apk && apk add sqlite) || echo "no sqlite3" + command -v sqlite3 || (command -v brew && brew install sqlite) || (command -v choco && choco install sqlite -y) || (command -v apt-get && apt-get install -y sqlite3) || (command -v yum && yum install -y sqlite) || (command -v apk && apk add sqlite) || echo "no sqlite3" ./build/release/$(DUCKDB_PATH) < data/sql/tpch-export.duckdb || tree ./build/release || echo "neither tree not duck" sqlite3 data/db/tpch.db < data/sql/tpch-create.sqlite diff --git a/src/include/sqlite_db.hpp b/src/include/sqlite_db.hpp index 8c156b2..0312475 100644 --- a/src/include/sqlite_db.hpp +++ b/src/include/sqlite_db.hpp @@ -14,6 +14,7 @@ namespace duckdb { class SQLiteStatement; struct IndexInfo; +class ClientContext; class SQLiteDB { public: @@ -31,6 +32,7 @@ class SQLiteDB { public: static SQLiteDB Open(const string &path, const SQLiteOpenOptions &options, bool is_shared = false); + static SQLiteDB Open(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared = false); bool TryPrepare(const string &query, SQLiteStatement &result); SQLiteStatement Prepare(const string &query); void Execute(const string &query); diff --git a/src/sqlite_db.cpp b/src/sqlite_db.cpp index 4a1f697..fa0061b 100644 --- a/src/sqlite_db.cpp +++ b/src/sqlite_db.cpp @@ -4,6 +4,7 @@ #include "duckdb/storage/table_storage_info.hpp" #include "duckdb/parser/column_list.hpp" #include "duckdb/parser/parser.hpp" +#include "duckdb/main/client_context.hpp" #include "sqlite_db.hpp" #include "sqlite_stmt.hpp" @@ -65,6 +66,10 @@ SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, bo return result; } +SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared) { + return Open(path, options, is_shared); +} + bool SQLiteDB::TryPrepare(const string &query, SQLiteStatement &stmt) { stmt.db = db; if (debug_sqlite_print_queries) { From 4865677020554f64537080d84b50e6b55d7eed9b Mon Sep 17 00:00:00 2001 From: ak2k <19240940+ak2k@users.noreply.github.com> Date: Fri, 27 Jun 2025 14:41:11 -0400 Subject: [PATCH 2/5] Fix potential deadlock when opening remote SQLite databases Opening SQLite connections for remote files can trigger HTTP requests while the MetaTransaction lock is held, potentially causing deadlocks. This change defers connection initialization until first use, with thread-safe lazy initialization using atomic flags and per-database mutexes. --- src/include/storage/sqlite_catalog.hpp | 4 +- src/include/storage/sqlite_transaction.hpp | 6 ++ src/storage/sqlite_catalog.cpp | 18 +++- src/storage/sqlite_transaction.cpp | 97 ++++++++++++++++++---- 4 files changed, 103 insertions(+), 22 deletions(-) diff --git a/src/include/storage/sqlite_catalog.hpp b/src/include/storage/sqlite_catalog.hpp index 0bf1ca2..bbc83cf 100644 --- a/src/include/storage/sqlite_catalog.hpp +++ b/src/include/storage/sqlite_catalog.hpp @@ -59,7 +59,7 @@ class SQLiteCatalog : public Catalog { string GetDBPath() override; //! Returns a reference to the in-memory database (if any) - SQLiteDB *GetInMemoryDatabase(); + SQLiteDB *GetInMemoryDatabase(ClientContext &context); //! Release the in-memory database (if there is any) void ReleaseInMemoryDatabase(); @@ -76,6 +76,8 @@ class SQLiteCatalog : public Catalog { mutex in_memory_lock; //! Whether or not there is any active transaction on the in-memory database bool active_in_memory; + //! Whether the in-memory database has been initialized + bool in_memory_db_initialized; }; } // namespace duckdb diff --git a/src/include/storage/sqlite_transaction.hpp b/src/include/storage/sqlite_transaction.hpp index dd7095d..8892688 100644 --- a/src/include/storage/sqlite_transaction.hpp +++ b/src/include/storage/sqlite_transaction.hpp @@ -10,6 +10,8 @@ #include "duckdb/transaction/transaction.hpp" #include "duckdb/common/case_insensitive_map.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/common/atomic.hpp" #include "sqlite_db.hpp" namespace duckdb { @@ -37,6 +39,10 @@ class SQLiteTransaction : public Transaction { SQLiteDB *db; SQLiteDB owned_db; case_insensitive_map_t> catalog_entries; + + // Atomic flags for thread-safe initialization + atomic started{false}; + atomic db_initialized{false}; }; } // namespace duckdb diff --git a/src/storage/sqlite_catalog.cpp b/src/storage/sqlite_catalog.cpp index a228510..5f53d6c 100644 --- a/src/storage/sqlite_catalog.cpp +++ b/src/storage/sqlite_catalog.cpp @@ -5,14 +5,17 @@ #include "storage/sqlite_schema_entry.hpp" #include "storage/sqlite_transaction.hpp" #include "duckdb/common/exception/transaction_exception.hpp" +#include "duckdb/common/limits.hpp" namespace duckdb { SQLiteCatalog::SQLiteCatalog(AttachedDatabase &db_p, const string &path, SQLiteOpenOptions options_p) - : Catalog(db_p), path(path), options(std::move(options_p)), in_memory(path == ":memory:"), active_in_memory(false) { - if (InMemory()) { - in_memory_db = SQLiteDB::Open(path, options, true); + : Catalog(db_p), path(path), options(std::move(options_p)), in_memory(path == ":memory:"), active_in_memory(false), in_memory_db_initialized(false) { + if (options.busy_timeout > 0 && options.busy_timeout > NumericLimits::Maximum()) { + throw std::runtime_error("busy_timeout out of range - must be within " + "valid range for type int"); } + // In-memory database is now opened lazily in GetInMemoryDatabase to support deferred initialization } SQLiteCatalog::~SQLiteCatalog() { @@ -52,11 +55,18 @@ string SQLiteCatalog::GetDBPath() { return path; } -SQLiteDB *SQLiteCatalog::GetInMemoryDatabase() { +SQLiteDB *SQLiteCatalog::GetInMemoryDatabase(ClientContext &context) { if (!InMemory()) { throw InternalException("GetInMemoryDatabase() called on a non-in-memory database"); } lock_guard l(in_memory_lock); + + // Initialize the in-memory database on first access + if (!in_memory_db_initialized) { + in_memory_db = SQLiteDB::Open(path, options, context, true); + in_memory_db_initialized = true; + } + if (active_in_memory) { throw TransactionException("Only a single transaction can be active on an " "in-memory SQLite database at a time"); diff --git a/src/storage/sqlite_transaction.cpp b/src/storage/sqlite_transaction.cpp index 4dc0296..a6c17a3 100644 --- a/src/storage/sqlite_transaction.cpp +++ b/src/storage/sqlite_transaction.cpp @@ -15,16 +15,42 @@ namespace duckdb { -SQLiteTransaction::SQLiteTransaction(SQLiteCatalog &sqlite_catalog, TransactionManager &manager, ClientContext &context) - : Transaction(manager, context), sqlite_catalog(sqlite_catalog) { - if (sqlite_catalog.InMemory()) { - // in-memory database - get a reference to the in-memory connection - db = sqlite_catalog.GetInMemoryDatabase(); - } else { - // on-disk database - open a new database connection - owned_db = SQLiteDB::Open(sqlite_catalog.path, sqlite_catalog.options, true); - db = &owned_db; +//===--------------------------------------------------------------------===// +// Per-Database Mutex Registry +//===--------------------------------------------------------------------===// +// Instead of a global mutex, we use per-database mutexes to reduce contention. +// This allows concurrent access to different databases while maintaining +// thread safety for each individual database. +//===--------------------------------------------------------------------===// + +struct DatabaseMutexRegistry { + mutex registry_mutex; // Protects the registry itself + unordered_map> database_mutexes; + + mutex& GetDatabaseMutex(const string &path) { + lock_guard lock(registry_mutex); + auto it = database_mutexes.find(path); + if (it == database_mutexes.end()) { + database_mutexes[path] = make_uniq(); + return *database_mutexes[path]; + } + return *it->second; } +}; + +static DatabaseMutexRegistry& GetMutexRegistry() { + // Function-local static ensures thread-safe initialization + static DatabaseMutexRegistry registry; + return registry; +} + +SQLiteTransaction::SQLiteTransaction(SQLiteCatalog &sqlite_catalog, TransactionManager &manager, ClientContext &context) + : Transaction(manager, context), sqlite_catalog(sqlite_catalog), db(nullptr) { + + // Database connection and transaction start are deferred to prevent potential deadlocks. + // Opening SQLite connections for remote files can trigger HTTP requests and caching + // operations while the MetaTransaction lock is held, which could cause deadlocks. + // Both connection and transaction are initialized lazily in GetDB() when first needed. } SQLiteTransaction::~SQLiteTransaction() { @@ -32,16 +58,53 @@ SQLiteTransaction::~SQLiteTransaction() { } void SQLiteTransaction::Start() { - db->Execute("BEGIN TRANSACTION"); + + if (!started.load(std::memory_order_acquire)) { + GetDB(); // This will handle both connection and transaction start + } } void SQLiteTransaction::Commit() { - db->Execute("COMMIT"); + GetDB().Execute("COMMIT"); } void SQLiteTransaction::Rollback() { - db->Execute("ROLLBACK"); + GetDB().Execute("ROLLBACK"); } SQLiteDB &SQLiteTransaction::GetDB() { + // Fast path: check if already initialized (with memory ordering) + if (db_initialized.load(std::memory_order_acquire)) { + return *db; + } + + // Slow path: need to initialize + // Get per-database mutex to reduce contention + auto &database_mutex = GetMutexRegistry().GetDatabaseMutex(sqlite_catalog.path); + lock_guard lock(database_mutex); + + // Check again after acquiring lock (double-checked locking with proper atomics) + if (!db_initialized.load(std::memory_order_relaxed)) { + // Initialize database connection + if (!db) { + if (sqlite_catalog.InMemory()) { + // in-memory database - get a reference to the in-memory connection + db = sqlite_catalog.GetInMemoryDatabase(*context.lock()); + } else { + // on-disk/remote database - open a new database connection + owned_db = SQLiteDB::Open(sqlite_catalog.path, sqlite_catalog.options, *context.lock(), true); + db = &owned_db; + } + } + + // Start transaction if not already started + if (!started.load(std::memory_order_relaxed)) { + db->Execute("BEGIN TRANSACTION"); + started.store(true, std::memory_order_relaxed); + } + + // Mark as initialized with release semantics to ensure all writes are visible + db_initialized.store(true, std::memory_order_release); + } + return *db; } @@ -111,7 +174,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr return entry->second.get(); } // catalog entry not found - look up table in main SQLite database - auto type = db->GetEntryType(entry_name); + auto type = GetDB().GetEntryType(entry_name); if (type == CatalogType::INVALID) { // no table or view found return nullptr; @@ -125,7 +188,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr if (context.lock()->TryGetCurrentSetting("sqlite_all_varchar", sqlite_all_varchar)) { all_varchar = BooleanValue::Get(sqlite_all_varchar); } - db->GetTableInfo(entry_name, info.columns, info.constraints, all_varchar); + GetDB().GetTableInfo(entry_name, info.columns, info.constraints, all_varchar); D_ASSERT(!info.columns.empty()); result = make_uniq(sqlite_catalog, sqlite_catalog.GetMainSchema(), info, all_varchar); @@ -133,7 +196,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr } case CatalogType::VIEW_ENTRY: { string sql; - db->GetViewInfo(entry_name, sql); + GetDB().GetViewInfo(entry_name, sql); unique_ptr view_info; try { @@ -153,7 +216,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr case CatalogType::INDEX_ENTRY: { string table_name; string sql; - db->GetIndexInfo(entry_name, sql, table_name); + GetDB().GetIndexInfo(entry_name, sql, table_name); if (sql.empty()) { throw InternalException("SQL is empty"); } @@ -200,7 +263,7 @@ string GetDropSQL(CatalogType type, const string &table_name, bool cascade) { void SQLiteTransaction::DropEntry(CatalogType type, const string &table_name, bool cascade) { catalog_entries.erase(table_name); - db->Execute(GetDropSQL(type, table_name, cascade)); + GetDB().Execute(GetDropSQL(type, table_name, cascade)); } } // namespace duckdb From 818139d58d49531f25b97c18b133b75025a48b35 Mon Sep 17 00:00:00 2001 From: ak2k <19240940+ak2k@users.noreply.github.com> Date: Fri, 27 Jun 2025 14:41:30 -0400 Subject: [PATCH 3/5] Add SQLite VFS implementation for remote file support --- src/CMakeLists.txt | 2 +- src/include/sqlite_duckdb_vfs_cache.hpp | 114 +++ src/sqlite_duckdb_vfs_cache.cpp | 886 ++++++++++++++++++ .../http_sqlite_00_vfs_registration.test | 52 + 4 files changed, 1053 insertions(+), 1 deletion(-) create mode 100644 src/include/sqlite_duckdb_vfs_cache.hpp create mode 100644 src/sqlite_duckdb_vfs_cache.cpp create mode 100644 test/sql/scanner/http_sqlite_00_vfs_registration.test diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f514eca..3ce9f4f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(storage) add_library( sqlite_ext_library OBJECT - sqlite_db.cpp sqlite_extension.cpp sqlite_scanner.cpp sqlite_stmt.cpp + sqlite_db.cpp sqlite_duckdb_vfs_cache.cpp sqlite_extension.cpp sqlite_scanner.cpp sqlite_stmt.cpp sqlite_storage.cpp sqlite_utils.cpp) set(ALL_OBJECT_FILES ${ALL_OBJECT_FILES} $ diff --git a/src/include/sqlite_duckdb_vfs_cache.hpp b/src/include/sqlite_duckdb_vfs_cache.hpp new file mode 100644 index 0000000..71cf603 --- /dev/null +++ b/src/include/sqlite_duckdb_vfs_cache.hpp @@ -0,0 +1,114 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// sqlite_duckdb_vfs_cache.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb.hpp" +#include "duckdb/common/file_system.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/storage/buffer/buffer_handle.hpp" +#include "duckdb/storage/buffer_manager.hpp" +#include "duckdb/storage/caching_file_system.hpp" + +#include "sqlite3.h" + +namespace duckdb { + +class ClientContext; + +// Wrapper around DuckDB's CachingFileSystem for remote SQLite file access. +// Uses DuckDB's caching infrastructure to efficiently handle remote file I/O. +class DuckDBCachedFile { +public: + DuckDBCachedFile(ClientContext &context, const string &path); + ~DuckDBCachedFile(); + + // Read data from the file at the specified offset + int Read(void *buffer, int amount, sqlite3_int64 offset); + // Get the cached file size + sqlite3_int64 get_file_size(); + +private: + // Lazy initialization - defer DuckDB operations until first use + void ensure_initialized(); + + // Adaptive read-ahead constants + static constexpr idx_t MIN_READAHEAD_SIZE = static_cast(1024) * 1024; // 1MB + static constexpr idx_t MAX_READAHEAD_SIZE = static_cast(128) * 1024 * 1024; // 128MB + static constexpr idx_t SEQUENTIAL_THRESHOLD = static_cast(64) * 1024; // 64KB gap tolerance + + ClientContext &context; + const string path; + unique_ptr caching_handle; + bool initialized = false; + + // Adaptive read-ahead state + sqlite3_int64 last_read_offset = -1; // Track last read position + sqlite3_int64 last_read_end = -1; // End of last read (offset + amount) + idx_t current_readahead_size = MIN_READAHEAD_SIZE; // Current read-ahead block size + + // Helper methods for adaptive read-ahead + idx_t calculate_read_ahead_size(sqlite3_int64 offset, int amount) const; + bool is_sequential_read(sqlite3_int64 offset) const; + void update_read_ahead_state(sqlite3_int64 offset, int amount); +}; + +// SQLite Virtual File System (VFS) implementation that uses DuckDB's +// CachingFileSystem for efficient remote SQLite database access. +class SQLiteDuckDBCacheVFS { +public: + // Register the VFS with SQLite (thread-safe, idempotent) + static void Register(ClientContext &context); + // Unregister the VFS when context is destroyed + static void Unregister(ClientContext &context); + // Check if this path should be handled by our VFS (i.e., is it remote?) + static bool CanHandlePath(ClientContext &context, const string &path); + // Get the VFS registration name for a context + static const char *GetVFSNameForContext(ClientContext &context); + // Get the default VFS registration name (for compatibility) + static const char *GetVFSName() { return "duckdb_cache_fs"; } + + // SQLite VFS interface methods (must be public for C callback registration) + static int Open(sqlite3_vfs *vfs, const char *filename, sqlite3_file *file, int flags, int *out_flags); + static int Delete(sqlite3_vfs *vfs, const char *filename, int sync_dir); + static int Access(sqlite3_vfs *vfs, const char *filename, int flags, int *result); + static int FullPathname(sqlite3_vfs *vfs, const char *filename, int out_size, char *out_buf); + static void *DlOpen(sqlite3_vfs *vfs, const char *filename); + static void DlError(sqlite3_vfs *vfs, int bytes, char *err_msg); + static void (*DlSym(sqlite3_vfs *vfs, void *handle, const char *symbol))(void); + static void DlClose(sqlite3_vfs *vfs, void *handle); + static int Randomness(sqlite3_vfs *vfs, int bytes, char *out); + static int Sleep(sqlite3_vfs *vfs, int microseconds); + static int CurrentTime(sqlite3_vfs *vfs, double *time); + static int GetLastError(sqlite3_vfs *vfs, int bytes, char *err_msg); + + // SQLite file I/O methods (must be public for C callback registration) + static int Close(sqlite3_file *file); + static int Read(sqlite3_file *file, void *buffer, int amount, sqlite3_int64 offset); + static int Write(sqlite3_file *file, const void *buffer, int amount, sqlite3_int64 offset); + static int Truncate(sqlite3_file *file, sqlite3_int64 size); + static int Sync(sqlite3_file *file, int flags); + static int FileSize(sqlite3_file *file, sqlite3_int64 *size); + static int Lock(sqlite3_file *file, int level); + static int Unlock(sqlite3_file *file, int level); + static int CheckReservedLock(sqlite3_file *file, int *result); + static int FileControl(sqlite3_file *file, int op, void *arg); + static int SectorSize(sqlite3_file *file); + static int DeviceCharacteristics(sqlite3_file *file); +}; + +// SQLite file handle structure that wraps our DuckDBCachedFile. +// Memory layout must be compatible with SQLite's expectations. +// This structure is allocated by SQLite and may cross module boundaries. +// We use raw pointers with explicit ownership rules to avoid DLL issues. +struct SQLiteDuckDBCachedFile { + sqlite3_file base; // Must be first member for C compatibility + DuckDBCachedFile *duckdb_file; // Raw pointer - explicitly deleted in Close() +}; + +} // namespace duckdb \ No newline at end of file diff --git a/src/sqlite_duckdb_vfs_cache.cpp b/src/sqlite_duckdb_vfs_cache.cpp new file mode 100644 index 0000000..eb185b0 --- /dev/null +++ b/src/sqlite_duckdb_vfs_cache.cpp @@ -0,0 +1,886 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// sqlite_duckdb_vfs_cache.cpp +// +// +//===----------------------------------------------------------------------===// + +#include "sqlite_duckdb_vfs_cache.hpp" + +#include "duckdb/common/exception.hpp" +#include "duckdb/common/exception/http_exception.hpp" +#include "duckdb/common/file_system.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/common/string_util.hpp" +#include "duckdb/common/unordered_map.hpp" +#include "duckdb/common/operator/cast_operators.hpp" +#include "duckdb/common/re2_regex.hpp" +#include "duckdb/main/client_context.hpp" +#include "duckdb/main/database.hpp" +#include "duckdb/storage/buffer_manager.hpp" +#include "duckdb/common/atomic.hpp" + +#include + +namespace duckdb { + +//===--------------------------------------------------------------------===// +// Concurrency Design +//===--------------------------------------------------------------------===// +// This VFS implementation is designed for safe concurrent access: +// +// VFS Registry (mutex-protected): +// - Each ClientContext gets its own VFS instance with unique name +// - Registration/unregistration operations are thread-safe +// - Mutex protects registry map operations, not file I/O +// +// File Operations (lock-free): +// - Each VFS instance is independent with no shared mutable state +// - File handles contain their own ClientContext pointer +// - DuckDB's CachingFileSystem provides internal synchronization +// +// Lifetime Management: +// - ClientContext MUST outlive all SQLite connections using its VFS +// - VFS automatically unregistered when ClientContext is destroyed +// - Multiple VFS instances share the same ExternalFileCache at DatabaseInstance level +// - Cache sharing is thread-safe through DuckDB's internal locking mechanisms +//===--------------------------------------------------------------------===// + +// Per-context VFS instances for thread safety. +// vfs_name uses sqlite3_malloc for cross-DLL safety. +struct DuckDBVFSWrapper { + sqlite3_vfs base; // Must be first - SQLite VFS structure + ClientContext *context; // The DuckDB context for this VFS + char *vfs_name; // Unique name for this VFS instance (allocated via sqlite3_malloc) + sqlite3_io_methods io_methods; // IO methods for this VFS instance + + // Error context storage for better diagnostics + mutable mutex error_mutex; + string last_error_message; + + ~DuckDBVFSWrapper() noexcept { + // Clean up using SQLite's allocator to match sqlite3_malloc + if (vfs_name) { + sqlite3_free(vfs_name); + vfs_name = nullptr; + } + } + + void set_last_error(const string &error) { + lock_guard lock(error_mutex); + last_error_message = error; + } + + string get_last_error() const { + lock_guard lock(error_mutex); + return last_error_message; + } +}; + +//===--------------------------------------------------------------------===// +// C/C++ Boundary Safety +//===--------------------------------------------------------------------===// +// Template to safely execute C++ code from SQLite's C callbacks. +// Catches all exceptions and converts them to appropriate SQLite error codes. +// This is critical because SQLite is written in C and cannot handle C++ exceptions. +// +// Usage: +// return SafeVFSCall(SQLITE_IOERR, [&]() { +// // C++ code that might throw +// return SQLITE_OK; +// }); +//===--------------------------------------------------------------------===// + +// Forward declaration +struct DuckDBVFSWrapper; + +//===--------------------------------------------------------------------===// +// HTTP Error Mapping +//===--------------------------------------------------------------------===// + +// HTTP error patterns for detection - compile-time constants +static constexpr const char* HTTP_ERROR_PATTERNS[] = { + "\"exception_type\":\"HTTP\"", + "\"exception_type\":\"IO\"", + "404 (Not Found)", + "403 (Forbidden)", + "401 (Unauthorized)", + "500 (Internal Server Error)", + "502 (Bad Gateway)", + "503 (Service Unavailable)", + "Unable to connect to URL", + "Could not establish connection", + "HTTP HEAD to", + "HTTP GET to" +}; + +// Regex-based HTTP status code extraction +static int extract_http_status(const string &error_msg) { + // Comprehensive regex pattern for all HTTP status code formats: + // Group 1: "status_code":"XXX" (JSON) + // Group 2: XXX (Description) (httpfs format) + // Group 3: (HTTP XXX) or HTTP code XXX or HTTP XXX + static duckdb_re2::Regex status_regex( + "\"status_code\":\"(\\d{3})\"|" // JSON format + "(\\d{3})\\s*\\([^)]+\\)|" // "404 (Not Found)" + "\\(?HTTP\\s+(?:code\\s+)?(\\d{3})\\)?" // "(HTTP 404)", "HTTP code 403", "HTTP 500" + ); + + duckdb_re2::Match match; + if (duckdb_re2::RegexSearch(error_msg, match, status_regex)) { + // Check which group captured the status code (groups are 1-indexed) + for (idx_t i = 1; i < match.groups.size(); i++) { + if (!match.groups[i].text.empty()) { + int32_t result; + if (TryCast::Operation(string_t(match.groups[i].text), result)) { + return result; + } + } + } + } + + return 0; // No status code found +} + +// HTTP error detection +static bool is_http_error(const string &error_msg) { + // Use std::any_of with StringUtil::Contains for cleaner, more efficient checking + return std::any_of(std::begin(HTTP_ERROR_PATTERNS), std::end(HTTP_ERROR_PATTERNS), + [&error_msg](const char* pattern) { + return StringUtil::Contains(error_msg, pattern); + }); +} + +// Map HTTP status code to SQLite error code +static int http_status_to_sqlite_error(int http_status) { + switch (http_status) { + case 404: + return SQLITE_CANTOPEN; // SQLite will interpret as "unable to open database file" + case 401: // Unauthorized (auth required) + case 403: + return SQLITE_PERM; // "permission denied" + case 408: + return SQLITE_IOERR_ACCESS; // Request timeout + case 429: + return SQLITE_BUSY; // Too many requests + default: + if (http_status >= 500 && http_status < 600) { + return SQLITE_IOERR; // Server errors + } + return 0; // Unknown/unmapped status + } +} + +template +static T SafeVFSCall(T error_value, Func&& func, DuckDBVFSWrapper *wrapper = nullptr, const char *path = nullptr, const char *method = nullptr) { + try { + return func(); + } catch (const std::exception &e) { + string error_msg = e.what(); + // fprintf(stderr, "DEBUG VFS: Exception caught in %s: %.200s\n", method ? method : "unknown", error_msg.c_str()); + // fprintf(stderr, "DEBUG VFS: IsHTTPError result: %s\n", IsHTTPError(error_msg) ? "true" : "false"); + + // Check if this is an HTTP error + if (is_http_error(error_msg)) { + // Store HTTP error context + if (wrapper) { + string full_error = "HTTP Error: "; + full_error += error_msg; + if (path) { + full_error += " (URL: "; + full_error += path; + full_error += ")"; + } + wrapper->set_last_error(full_error); + } + + // Try to map HTTP status to specific SQLite error + int http_status = extract_http_status(error_msg); + int sqlite_error = http_status_to_sqlite_error(http_status); + // Debug (uncomment for troubleshooting) + // fprintf(stderr, "DEBUG: Error message (first 200 chars): %.200s\n", error_msg.c_str()); + // fprintf(stderr, "DEBUG: HTTP Status: %d, SQLite Error: %d\n", http_status, sqlite_error); + if (sqlite_error != 0) { + return sqlite_error; + } + + // Special case: Network connection failures should be treated as "unable to open" + if (error_msg.find("Unable to connect to URL") != string::npos || + error_msg.find("Could not establish connection") != string::npos) { + return error_value == SQLITE_OK ? SQLITE_CANTOPEN : error_value; + } + + // Default for unmapped HTTP errors (server errors, etc.) + return error_value == SQLITE_OK ? SQLITE_IOERR : error_value; + } + + // Check for specific DuckDB exception types in the message + if (error_msg.find("Permission") != string::npos) { + if (wrapper) { + string full_error = "Permission denied: "; + full_error += error_msg; + if (path) { + full_error += " (Path: "; + full_error += path; + full_error += ")"; + } + wrapper->set_last_error(full_error); + } + return error_value == SQLITE_OK ? SQLITE_PERM : error_value; + } + + // Store generic error context + if (wrapper) { + string full_error = "Error: "; + full_error += error_msg; + if (path) { + full_error += " (Path: "; + full_error += path; + full_error += ")"; + } + wrapper->set_last_error(full_error); + } + + // Default error handling + return error_value; + } catch (...) { + // Unknown exception + if (wrapper) { + wrapper->set_last_error("Unknown error occurred"); + } + return error_value; + } +} + +// Global registry of VFS wrappers to manage their lifetime +// Use function-local statics to ensure proper initialization order on Windows +struct VFSRegistryData { + mutex registry_mutex; + unordered_map> registry; +}; + +static VFSRegistryData& GetVFSRegistryData() { + // Function-local static ensures thread-safe initialization + static VFSRegistryData data; + return data; +} + +// Sector size: minimum atomic write unit for the storage device +static constexpr int SQLITE_SECTOR_SIZE = 4096; + +// Initialize IO methods for a VFS wrapper +static void InitializeIOMethods(sqlite3_io_methods &io_methods) { + memset(&io_methods, 0, sizeof(io_methods)); + + io_methods.iVersion = 1; + io_methods.xClose = SQLiteDuckDBCacheVFS::Close; + io_methods.xRead = SQLiteDuckDBCacheVFS::Read; + io_methods.xWrite = SQLiteDuckDBCacheVFS::Write; + io_methods.xTruncate = SQLiteDuckDBCacheVFS::Truncate; + io_methods.xSync = SQLiteDuckDBCacheVFS::Sync; + io_methods.xFileSize = SQLiteDuckDBCacheVFS::FileSize; + io_methods.xLock = SQLiteDuckDBCacheVFS::Lock; + io_methods.xUnlock = SQLiteDuckDBCacheVFS::Unlock; + io_methods.xCheckReservedLock = SQLiteDuckDBCacheVFS::CheckReservedLock; + io_methods.xFileControl = SQLiteDuckDBCacheVFS::FileControl; + io_methods.xSectorSize = SQLiteDuckDBCacheVFS::SectorSize; + io_methods.xDeviceCharacteristics = SQLiteDuckDBCacheVFS::DeviceCharacteristics; + // Shared memory methods not needed for read-only remote files + io_methods.xShmMap = nullptr; + io_methods.xShmLock = nullptr; + io_methods.xShmBarrier = nullptr; + io_methods.xShmUnmap = nullptr; + io_methods.xFetch = nullptr; + io_methods.xUnfetch = nullptr; +} + +// Get the unique VFS name for a ClientContext +static string get_unique_vfs_name(const ClientContext *context) { + static atomic vfs_counter{0}; + return "duckdb_cache_vfs_" + to_string(vfs_counter.fetch_add(1)); +} + +//===--------------------------------------------------------------------===// +// DuckDBCachedFile Implementation +//===--------------------------------------------------------------------===// + +DuckDBCachedFile::DuckDBCachedFile(ClientContext &context, const string &path) + : context(context), path(path) { + // Defer actual file opening until first use to avoid doing DuckDB operations + // during SQLite VFS callbacks, which might be in a different serialization context +} + +DuckDBCachedFile::~DuckDBCachedFile() { + // Ensure proper cleanup of the caching handle + // The unique_ptr will automatically release the handle, + // but we add this explicit destructor for clarity and + // to enable future debugging/validation if needed + if (caching_handle) { + // Reset explicitly to ensure deterministic cleanup order + caching_handle.reset(); + } +} + +void DuckDBCachedFile::ensure_initialized() { + if (initialized) { + return; + } + + // Configure file flags for optimal caching behavior. + // Remote files use DIRECT_IO to bypass OS caching since DuckDB's + // CachingFileSystem provides its own intelligent block caching. + auto flags = FileFlags::FILE_FLAGS_READ; + if (FileSystem::IsRemoteFile(path)) { + flags |= FileFlags::FILE_FLAGS_DIRECT_IO; + } + + // Open the file through DuckDB's CachingFileSystem. + // The CachingFileSystem provides efficient caching of remote file data, + // though the actual read patterns are determined by our Read implementation. + auto caching_fs = CachingFileSystem::Get(context); + OpenFileInfo file_info(path); + // fprintf(stderr, "DEBUG: About to open file: %s\n", path.c_str()); + caching_handle = caching_fs.OpenFile(file_info, flags); + + + initialized = true; +} + + +int DuckDBCachedFile::Read(void *buffer, int amount, sqlite3_int64 offset) { + // Validate inputs to prevent integer overflow attacks + if (offset < 0 || amount < 0) { + return SQLITE_IOERR_READ; + } + + // Early return for empty reads (SQLite sometimes requests 0 bytes) + if (!buffer || amount == 0) { + return SQLITE_OK; + } + + // Ensure we're initialized before first read + // Let exceptions propagate to SafeVFSCall for unified error handling + ensure_initialized(); + + // Safety check - should never happen in normal operation + if (!caching_handle) { + return SQLITE_IOERR_READ; + } + + // Get current file size from DuckDB (handles validation/caching) + const sqlite3_int64 file_size = static_cast(caching_handle->GetFileSize()); + + // Check if we're reading past EOF + if (offset >= file_size) { + // Reading completely past EOF - zero-fill entire buffer + memset(buffer, 0, amount); + return SQLITE_IOERR_SHORT_READ; + } + + // Calculate how many bytes we can actually read + const sqlite3_int64 available_bytes = file_size - offset; + const int bytes_to_read = (available_bytes < amount) ? static_cast(available_bytes) : amount; + + // Calculate optimal read-ahead size based on access pattern + const idx_t readahead_size = calculate_read_ahead_size(offset, bytes_to_read); + + // Ensure we read at least the requested amount (up to EOF) + idx_t actual_read_size = MaxValue(static_cast(bytes_to_read), readahead_size); + + // Don't read beyond file end (use safe arithmetic to prevent overflow) + // We already know offset < file_size from the check above + const sqlite3_int64 remaining_bytes = file_size - offset; + if (static_cast(actual_read_size) > remaining_bytes) { + actual_read_size = static_cast(remaining_bytes); + } + + // Use DuckDB's CachingFileSystem with adaptive read-ahead + data_ptr_t read_buffer = nullptr; + auto buffer_handle = caching_handle->Read(read_buffer, actual_read_size, offset); + + // Validate read buffer before copying + if (!read_buffer) { + return SQLITE_IOERR_READ; + } + + // Copy the data we read + memcpy(buffer, read_buffer, bytes_to_read); + + // Note: SQLite validates the database header itself when opening the database, + // so we don't need to duplicate that validation here. + + // If we read less than requested, zero-fill the remainder + if (bytes_to_read < amount) { + memset(static_cast(buffer) + bytes_to_read, 0, amount - bytes_to_read); + } + + // Update read-ahead state after successful read + update_read_ahead_state(offset, bytes_to_read); + + // Return appropriate code based on whether we satisfied the full request + return (bytes_to_read < amount) ? SQLITE_IOERR_SHORT_READ : SQLITE_OK; +} + +sqlite3_int64 DuckDBCachedFile::get_file_size() { + try { + ensure_initialized(); + return static_cast(caching_handle->GetFileSize()); + } catch (...) { + return -1; + } +} + +idx_t DuckDBCachedFile::calculate_read_ahead_size(sqlite3_int64 offset, int amount) const { + // First read or non-sequential access - use minimum size + if (last_read_offset == -1 || !is_sequential_read(offset)) { + return MIN_READAHEAD_SIZE; + } + + // Sequential read - double the current size up to maximum + const idx_t next_size = current_readahead_size * 2; + return MinValue(next_size, MAX_READAHEAD_SIZE); +} + +bool DuckDBCachedFile::is_sequential_read(sqlite3_int64 offset) const { + // Consider sequential if: + // 1. Reading from exactly where last read ended, OR + // 2. Reading within SEQUENTIAL_THRESHOLD of where last read ended + return (offset >= last_read_end) && + (offset <= last_read_end + static_cast(SEQUENTIAL_THRESHOLD)); +} + +void DuckDBCachedFile::update_read_ahead_state(sqlite3_int64 offset, int amount) { + // Update read-ahead size based on access pattern + if (is_sequential_read(offset)) { + // Sequential read - grow read-ahead size + current_readahead_size = MinValue(current_readahead_size * 2, MAX_READAHEAD_SIZE); + } else { + // Non-sequential read - reset to minimum + current_readahead_size = MIN_READAHEAD_SIZE; + } + + // Update position tracking + last_read_offset = offset; + last_read_end = offset + amount; +} + + +//===--------------------------------------------------------------------===// +// SQLiteDuckDBCacheVFS Implementation +//===--------------------------------------------------------------------===// + +bool SQLiteDuckDBCacheVFS::CanHandlePath(ClientContext &context, const string &path) { + return FileSystem::IsRemoteFile(path); +} + +void SQLiteDuckDBCacheVFS::Register(ClientContext &context) { + auto& registry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + // Check if this context already has a VFS registered + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end()) { + // Already registered for this context + return; + } + + // Find SQLite's default VFS to delegate some operations + sqlite3_vfs *default_vfs = sqlite3_vfs_find(nullptr); + if (!default_vfs) { + throw InternalException("Failed to find default SQLite VFS - SQLite may not be properly initialized"); + } + + // Create a new VFS wrapper for this context + auto wrapper = make_uniq(); + wrapper->context = &context; + + // Allocate VFS name using SQLite's allocator for DLL safety + const string temp_name = get_unique_vfs_name(&context); + wrapper->vfs_name = static_cast(sqlite3_malloc64(temp_name.length() + 1)); + if (!wrapper->vfs_name) { + throw InternalException("Failed to allocate memory for VFS name"); + } + memcpy(wrapper->vfs_name, temp_name.c_str(), temp_name.length() + 1); + + // Initialize the IO methods for this VFS instance + InitializeIOMethods(wrapper->io_methods); + + // Initialize the VFS structure + memset(&wrapper->base, 0, sizeof(wrapper->base)); + wrapper->base.iVersion = 1; + wrapper->base.szOsFile = sizeof(SQLiteDuckDBCachedFile); + wrapper->base.mxPathname = default_vfs->mxPathname; + wrapper->base.zName = wrapper->vfs_name; // Now using C-style string + wrapper->base.pAppData = wrapper.get(); // Store pointer to wrapper + + // Configure VFS methods + wrapper->base.xOpen = Open; + wrapper->base.xDelete = Delete; + wrapper->base.xAccess = Access; + wrapper->base.xFullPathname = FullPathname; + wrapper->base.xDlOpen = DlOpen; + wrapper->base.xDlError = DlError; + wrapper->base.xDlSym = DlSym; + wrapper->base.xDlClose = DlClose; + wrapper->base.xRandomness = Randomness; + wrapper->base.xSleep = Sleep; + wrapper->base.xCurrentTime = CurrentTime; + wrapper->base.xGetLastError = GetLastError; + + // Register this VFS with SQLite + int rc = sqlite3_vfs_register(&wrapper->base, 0); + if (rc != SQLITE_OK) { + throw InternalException("Failed to register DuckDB Cache VFS: %s", sqlite3_errstr(rc)); + } + + // Store in registry - wrapper ownership transfers to registry + registry_data.registry[&context] = std::move(wrapper); +} + +// Unregister the VFS associated with a ClientContext when it's being destroyed. +// This ensures proper cleanup of VFS resources. +void SQLiteDuckDBCacheVFS::Unregister(ClientContext &context) { + auto& registry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end()) { + // Unregister from SQLite + sqlite3_vfs_unregister(&it->second->base); + // Remove from registry + registry_data.registry.erase(it); + } +} + +// Get the unique VFS name associated with a specific ClientContext. +// Returns the default VFS name if no specific VFS is registered for this context. +const char *SQLiteDuckDBCacheVFS::GetVFSNameForContext(ClientContext &context) { + auto& registry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end() && it->second->vfs_name) { + return it->second->vfs_name; // Return the C-style string directly + } + + // Fallback to default name if not found (shouldn't happen) + return GetVFSName(); +} + +//===--------------------------------------------------------------------===// +// VFS Methods +//===--------------------------------------------------------------------===// + +// Macro to delegate VFS operations to SQLite's default VFS implementation. +// Used for system-level operations (randomness, sleep, time) that don't involve +// file I/O and thus don't need special handling for remote files. +#define DELEGATE_TO_DEFAULT_VFS(method_name, ...) \ + sqlite3_vfs *default_vfs = sqlite3_vfs_find(nullptr); \ + if (default_vfs && default_vfs->method_name) { \ + return default_vfs->method_name(default_vfs, __VA_ARGS__); \ + } \ + return SQLITE_OK; + +int SQLiteDuckDBCacheVFS::Open(sqlite3_vfs *vfs, const char *filename, sqlite3_file *file, int flags, int *out_flags) { + // Get wrapper for error context + auto *wrapper = vfs && vfs->pAppData ? static_cast(vfs->pAppData) : nullptr; + + return SafeVFSCall(SQLITE_CANTOPEN, [&]() { + // Validate parameters and ensure read-only access + if (!vfs || !filename || !file || (flags & SQLITE_OPEN_READONLY) == 0) { + return SQLITE_CANTOPEN; + } + + // Ensure SQLite allocated enough space for our file structure + if (vfs->szOsFile < static_cast(sizeof(SQLiteDuckDBCachedFile))) { + return SQLITE_CANTOPEN; + } + + auto *duckdb_file = reinterpret_cast(file); + + // Retrieve the VFS wrapper and context from pAppData + if (!vfs->pAppData) { + return SQLITE_CANTOPEN; + } + + // wrapper is already declared in outer scope + ClientContext *context = wrapper->context; + + // Defensive check: Validate context is still valid + if (!context || !context->db) { + return SQLITE_CANTOPEN; + } + + // Initialize the structure members properly + duckdb_file->base.pMethods = &wrapper->io_methods; + duckdb_file->duckdb_file = nullptr; + + // Create the DuckDB file handle with proper exception handling + try { + duckdb_file->duckdb_file = new DuckDBCachedFile(*context, filename); + } catch (...) { + // Clean up on failure + duckdb_file->base.pMethods = nullptr; + duckdb_file->duckdb_file = nullptr; + return SQLITE_CANTOPEN; + } + + // Don't validate SQLite header here - defer until first read + // to avoid triggering DuckDB operations in VFS callbacks + + if (out_flags) { + *out_flags = flags; + } + + return SQLITE_OK; + }, wrapper, filename, "xOpen"); +} + +int SQLiteDuckDBCacheVFS::Delete(sqlite3_vfs *vfs, const char *filename, int sync_dir) { + // Remote files cannot be deleted through this VFS + return SQLITE_IOERR_DELETE; +} + +int SQLiteDuckDBCacheVFS::Access(sqlite3_vfs *vfs, const char *filename, int flags, int *result) { + auto *wrapper = vfs && vfs->pAppData ? static_cast(vfs->pAppData) : nullptr; + + return SafeVFSCall(SQLITE_IOERR, [&]() { + if (!filename || !result) { + return SQLITE_IOERR; + } + + // Initialize result to safe default + *result = 0; + + // For remote files, we need to handle journal/WAL file checks properly. + // SQLite uses Access() to check for the existence of journal and WAL files + // to determine if a database might have uncommitted transactions. + + if (flags == SQLITE_ACCESS_EXISTS) { + // Check if this is a journal or WAL file by examining the suffix + const string file_path(filename); + bool is_journal = false; + bool is_wal = false; + + // Check for journal file suffixes + if (file_path.size() > 8) { + string suffix = file_path.substr(file_path.size() - 8); + if (suffix == "-journal" || suffix == "-wal") { + is_journal = (suffix == "-journal"); + is_wal = (suffix == "-wal"); + } + } + + if (is_journal || is_wal) { + // For journal/WAL files, we need to check if they actually exist + // This is critical for SQLite to properly detect hot journals + if (!vfs->pAppData) { + *result = 0; + return SQLITE_OK; + } + + auto *wrapper = static_cast(vfs->pAppData); + ClientContext *context = wrapper->context; + + if (!context || !context->db) { + *result = 0; + return SQLITE_OK; + } + + // Try to check file existence through DuckDB's filesystem + try { + auto &fs = context->db->GetFileSystem(); + *result = fs.FileExists(file_path) ? 1 : 0; + } catch (...) { + // If we can't check, assume it doesn't exist + *result = 0; + } + } else { + // For the main database file, return 0 to let SQLite try to open it + // This avoids triggering DuckDB operations in the wrong context + *result = 0; + } + } else if (flags == SQLITE_ACCESS_READWRITE) { + // Remote files are always read-only + *result = 0; + } else if (flags == SQLITE_ACCESS_READ) { + // We can read remote files, but defer actual check to open + *result = 0; + } else { + // Unknown access type + *result = 0; + } + + return SQLITE_OK; + }, wrapper, filename); +} + +int SQLiteDuckDBCacheVFS::FullPathname(sqlite3_vfs *vfs, const char *filename, int out_size, char *out_buf) { + return SafeVFSCall(SQLITE_IOERR, [&]() { + if (!filename || !out_buf || out_size <= 0) { + return SQLITE_IOERR; + } + + // Remote paths are already absolute URLs - return as-is + strncpy(out_buf, filename, out_size - 1); + out_buf[out_size - 1] = '\0'; + return SQLITE_OK; + }); +} + +// These methods don't need special handling - delegate to default VFS +int SQLiteDuckDBCacheVFS::Randomness(sqlite3_vfs *vfs, int bytes, char *out) { + DELEGATE_TO_DEFAULT_VFS(xRandomness, bytes, out); +} + +int SQLiteDuckDBCacheVFS::Sleep(sqlite3_vfs *vfs, int microseconds) { + DELEGATE_TO_DEFAULT_VFS(xSleep, microseconds); +} + +int SQLiteDuckDBCacheVFS::CurrentTime(sqlite3_vfs *vfs, double *time) { + DELEGATE_TO_DEFAULT_VFS(xCurrentTime, time); +} + +// Dynamic library operations are not supported for remote files +void *SQLiteDuckDBCacheVFS::DlOpen(sqlite3_vfs *vfs, const char *filename) { + return nullptr; +} + +void SQLiteDuckDBCacheVFS::DlError(sqlite3_vfs *vfs, int bytes, char *err_msg) { + try { + if (err_msg && bytes > 0) { + strncpy(err_msg, "Dynamic loading not supported for remote files", bytes - 1); + err_msg[bytes - 1] = '\0'; + } + } catch (...) { + // Best effort - if we can't even set the error message, just return + if (err_msg && bytes > 0) { + err_msg[0] = '\0'; + } + } +} + +void (*SQLiteDuckDBCacheVFS::DlSym(sqlite3_vfs *vfs, void *handle, const char *symbol))(void) { + return nullptr; +} + +void SQLiteDuckDBCacheVFS::DlClose(sqlite3_vfs *vfs, void *handle) { + // No-op - dynamic libraries not supported +} + +int SQLiteDuckDBCacheVFS::GetLastError(sqlite3_vfs *vfs, int bytes, char *err_msg) { + return SafeVFSCall(0, [&]() { + if (!vfs || !vfs->pAppData || !err_msg || bytes <= 0) { + return 0; + } + + auto *wrapper = static_cast(vfs->pAppData); + const string error = wrapper->get_last_error(); + + if (error.empty()) { + err_msg[0] = '\0'; + return 0; + } + + // Copy error message to buffer + strncpy(err_msg, error.c_str(), bytes - 1); + err_msg[bytes - 1] = '\0'; + + return static_cast(error.length()); + }); +} + +//===--------------------------------------------------------------------===// +// File Methods +//===--------------------------------------------------------------------===// + +int SQLiteDuckDBCacheVFS::Close(sqlite3_file *file) { + return SafeVFSCall(SQLITE_OK, [&]() { + if (file) { + auto *duckdb_file = reinterpret_cast(file); + // Explicitly delete the raw pointer + delete duckdb_file->duckdb_file; + duckdb_file->duckdb_file = nullptr; + } + return SQLITE_OK; + }); +} + +int SQLiteDuckDBCacheVFS::Read(sqlite3_file *file, void *buffer, int amount, sqlite3_int64 offset) { + return SafeVFSCall(SQLITE_IOERR_READ, [&]() { + if (!file || !buffer) { + return SQLITE_IOERR_READ; + } + + auto *duckdb_file = reinterpret_cast(file); + if (!duckdb_file->duckdb_file) { + return SQLITE_IOERR_READ; + } + + return duckdb_file->duckdb_file->Read(buffer, amount, offset); + }); +} + +int SQLiteDuckDBCacheVFS::FileSize(sqlite3_file *file, sqlite3_int64 *size) { + return SafeVFSCall(SQLITE_IOERR, [&]() { + if (!file || !size) { + return SQLITE_IOERR; + } + + auto *duckdb_file = reinterpret_cast(file); + if (!duckdb_file->duckdb_file) { + return SQLITE_IOERR; + } + + *size = duckdb_file->duckdb_file->get_file_size(); + return SQLITE_OK; + }); +} + +// Write operations return SQLITE_READONLY since remote files are read-only +int SQLiteDuckDBCacheVFS::Write(sqlite3_file *file, const void *buffer, int amount, sqlite3_int64 offset) { + return SQLITE_READONLY; +} + +int SQLiteDuckDBCacheVFS::Truncate(sqlite3_file *file, sqlite3_int64 size) { + return SQLITE_READONLY; +} + +int SQLiteDuckDBCacheVFS::Sync(sqlite3_file *file, int flags) { + // No-op for read-only files + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::Lock(sqlite3_file *file, int level) { + // Remote files don't need locking - they're read-only and immutable + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::Unlock(sqlite3_file *file, int level) { + // Remote files don't need locking - they're read-only and immutable + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::CheckReservedLock(sqlite3_file *file, int *result) { + if (result) { + *result = 0; + } + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::FileControl(sqlite3_file *file, int op, void *arg) { + // No special file control operations are implemented + return SQLITE_NOTFOUND; +} + +int SQLiteDuckDBCacheVFS::SectorSize(sqlite3_file *file) { + return SQLITE_SECTOR_SIZE; +} + +int SQLiteDuckDBCacheVFS::DeviceCharacteristics(sqlite3_file *file) { + // Remote files are immutable - they cannot be modified + return SQLITE_IOCAP_IMMUTABLE; +} + +} // namespace duckdb \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_00_vfs_registration.test b/test/sql/scanner/http_sqlite_00_vfs_registration.test new file mode 100644 index 0000000..fd85a46 --- /dev/null +++ b/test/sql/scanner/http_sqlite_00_vfs_registration.test @@ -0,0 +1,52 @@ +# name: test/sql/scanner/http_sqlite_00_vfs_registration.test +# description: Test HTTP VFS registration and URL detection for SQLite scanner +# group: [sqlite_scanner] + +require sqlite_scanner + +# Test 1: Verify SQLite scanner extension is available +statement ok +SELECT 'SQLite scanner extension loaded'; + +# Test 2: HTTP URLs should fail appropriately without httpfs loaded +statement error +SELECT * FROM sqlite_scan('http://example.com/test.db', 'test'); +---- +unable to open database file + +# Test 3: HTTPS URLs should fail appropriately without httpfs loaded +statement error +SELECT * FROM sqlite_scan('https://example.com/test.db', 'test'); +---- +unable to open database file + +# Test 4: S3 URLs should fail appropriately without httpfs loaded +statement error +SELECT * FROM sqlite_scan('s3://bucket/test.db', 'test'); +---- +unable to open database file + +# Test 5: Load httpfs to register HTTP VFS +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 6: After httpfs loads, HTTP URLs should be recognized but fail with network error (not VFS error) +statement error +SELECT * FROM sqlite_scan('http://nonexistent-domain-12345.com/test.db', 'test'); +---- +unable to open database file + +# Test 7: HTTPS URLs should also be recognized after httpfs loads +statement error +SELECT * FROM sqlite_scan('https://nonexistent-domain-12345.com/test.db', 'test'); +---- +unable to open database file + +# Test 8: Regular file paths should still work as before +statement error +SELECT * FROM sqlite_scan('nonexistent.db', 'test'); +---- +unable to open database file \ No newline at end of file From 7a1913dfd108431a191130a61d51efb2e10301e3 Mon Sep 17 00:00:00 2001 From: ak2k <19240940+ak2k@users.noreply.github.com> Date: Fri, 27 Jun 2025 15:56:43 -0400 Subject: [PATCH 4/5] Enable HTTP/HTTPS SQLite database access via sqlite_scan This commit adds support for querying remote SQLite databases over HTTP/HTTPS using the sqlite_scan() function. Remote files are accessed through DuckDB's custom VFS with caching support. Implementation details: - Add OpenWithVFS() to handle remote SQLite databases via custom VFS - Implement helper methods for clean separation of local vs remote handling - Register VFS cleanup callback to properly clean up on connection close - Add comprehensive error handling with HTTP status code mapping - Pass ClientContext through scanner to enable VFS functionality The implementation uses DuckDB's CachingFileSystem for efficient block-level caching of remote file data, minimizing network requests when accessing SQLite databases over HTTP. --- src/include/sqlite_db.hpp | 14 +++ src/sqlite_db.cpp | 119 ++++++++++++++++-- src/sqlite_extension.cpp | 44 +++++-- src/sqlite_scanner.cpp | 6 +- .../scanner/http_sqlite_01_basic_scan.test | 28 +++++ .../http_sqlite_02_concurrent_scans.test | 20 +++ test/sql/scanner/http_sqlite_03_joins.test | 25 ++++ test/sql/scanner/http_sqlite_06_cte.test | 27 ++++ .../scanner/http_sqlite_07_consistency.test | 29 +++++ .../http_sqlite_08_error_handling.test | 29 +++++ .../scanner/http_sqlite_09_http_errors.test | 47 +++++++ 11 files changed, 367 insertions(+), 21 deletions(-) create mode 100644 test/sql/scanner/http_sqlite_01_basic_scan.test create mode 100644 test/sql/scanner/http_sqlite_02_concurrent_scans.test create mode 100644 test/sql/scanner/http_sqlite_03_joins.test create mode 100644 test/sql/scanner/http_sqlite_06_cte.test create mode 100644 test/sql/scanner/http_sqlite_07_consistency.test create mode 100644 test/sql/scanner/http_sqlite_08_error_handling.test create mode 100644 test/sql/scanner/http_sqlite_09_http_errors.test diff --git a/src/include/sqlite_db.hpp b/src/include/sqlite_db.hpp index 0312475..1515ce3 100644 --- a/src/include/sqlite_db.hpp +++ b/src/include/sqlite_db.hpp @@ -31,7 +31,10 @@ class SQLiteDB { sqlite3 *db; public: + //! Open a SQLite database (local files only) static SQLiteDB Open(const string &path, const SQLiteOpenOptions &options, bool is_shared = false); + //! Open a SQLite database with support for both local and remote files (HTTP/HTTPS) + //! @param context Required for remote file access via DuckDB's VFS static SQLiteDB Open(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared = false); bool TryPrepare(const string &query, SQLiteStatement &result); SQLiteStatement Prepare(const string &query); @@ -55,6 +58,17 @@ class SQLiteDB { bool IsOpen(); void Close(); + +private: + //! Internal implementation methods for opening SQLite databases + static int GetOpenFlags(const SQLiteOpenOptions &options, bool is_shared, bool is_remote = false); + static void ApplyBusyTimeout(sqlite3 *db, const SQLiteOpenOptions &options); + static void HandleOpenError(const string &path, int rc, ClientContext *context = nullptr); + static SQLiteDB OpenWithVFS(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared); + //! Open a local SQLite database file (no remote support) + static SQLiteDB OpenLocal(const string &path, const SQLiteOpenOptions &options, bool is_shared = false); + + static void CheckDBValid(sqlite3 *db); }; } // namespace duckdb diff --git a/src/sqlite_db.cpp b/src/sqlite_db.cpp index fa0061b..23f3656 100644 --- a/src/sqlite_db.cpp +++ b/src/sqlite_db.cpp @@ -4,9 +4,11 @@ #include "duckdb/storage/table_storage_info.hpp" #include "duckdb/parser/column_list.hpp" #include "duckdb/parser/parser.hpp" +#include "duckdb/common/file_open_flags.hpp" #include "duckdb/main/client_context.hpp" #include "sqlite_db.hpp" #include "sqlite_stmt.hpp" +#include "sqlite_duckdb_vfs_cache.hpp" namespace duckdb { @@ -31,51 +33,148 @@ SQLiteDB &SQLiteDB::operator=(SQLiteDB &&other) noexcept { return *this; } -SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, bool is_shared) { - SQLiteDB result; +int SQLiteDB::GetOpenFlags(const SQLiteOpenOptions &options, bool is_shared, bool is_remote) { int flags = SQLITE_OPEN_PRIVATECACHE; - if (options.access_mode == AccessMode::READ_ONLY) { + + if (is_remote || options.access_mode == AccessMode::READ_ONLY) { + // Remote databases are always read-only flags |= SQLITE_OPEN_READONLY; } else { flags |= SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE; } + if (!is_shared) { // FIXME: we should just make sure we are not re-using the same `sqlite3` // object across threads flags |= SQLITE_OPEN_NOMUTEX; } + flags |= SQLITE_OPEN_EXRESCODE; - auto rc = sqlite3_open_v2(path.c_str(), &result.db, flags, nullptr); - if (rc != SQLITE_OK) { - throw std::runtime_error("Unable to open database \"" + path + "\": " + string(sqlite3_errstr(rc))); - } - // default busy time-out of 5 seconds + return flags; +} + +void SQLiteDB::ApplyBusyTimeout(sqlite3 *db, const SQLiteOpenOptions &options) { if (options.busy_timeout > 0) { if (options.busy_timeout > NumericLimits::Maximum()) { throw std::runtime_error("busy_timeout out of range - must be within " "valid range for type int"); } - rc = sqlite3_busy_timeout(result.db, int(options.busy_timeout)); + auto rc = sqlite3_busy_timeout(db, int(options.busy_timeout)); if (rc != SQLITE_OK) { throw std::runtime_error("Failed to set busy timeout"); } } +} + +void SQLiteDB::HandleOpenError(const string &path, int rc, ClientContext *context) { + // Map SQLite error codes to user-friendly messages + string error_msg; + int primary_error = rc & 0xFF; // Extract primary error code from extended error code + + // Check if this is a remote file handled by our custom VFS + bool is_remote_file = context && SQLiteDuckDBCacheVFS::CanHandlePath(*context, path); + + switch (primary_error) { + case SQLITE_CANTOPEN: + error_msg = "unable to open database file"; + break; + case SQLITE_PERM: + error_msg = "access permission denied"; + break; + case SQLITE_IOERR: + // For remote files, I/O errors are likely network/connection issues + if (is_remote_file) { + error_msg = "unable to open database file"; + } else { + error_msg = "disk I/O error"; + } + break; + case SQLITE_BUSY: + error_msg = "database is locked"; + break; + case SQLITE_NOMEM: + error_msg = "out of memory"; + break; + case SQLITE_READONLY: + error_msg = "attempt to write a readonly database"; + break; + case SQLITE_CORRUPT: + error_msg = "file is not a database"; + break; + default: + // Fall back to SQLite's error message for other codes + error_msg = sqlite3_errstr(rc); + break; + } + throw std::runtime_error("Unable to open database \"" + path + "\": " + error_msg); +} + +// Opens a local SQLite database file using standard SQLite file handling +SQLiteDB SQLiteDB::OpenLocal(const string &path, const SQLiteOpenOptions &options, bool is_shared) { + SQLiteDB result; + int flags = GetOpenFlags(options, is_shared, false); + + auto rc = sqlite3_open_v2(path.c_str(), &result.db, flags, nullptr); + if (rc != SQLITE_OK) { + HandleOpenError(path, rc); + } + + ApplyBusyTimeout(result.db, options); + if (!options.journal_mode.empty()) { result.Execute("PRAGMA journal_mode=" + KeywordHelper::EscapeQuotes(options.journal_mode, '\'')); } return result; } +// Opens a remote SQLite database using DuckDB's custom VFS for HTTP/HTTPS support +SQLiteDB SQLiteDB::OpenWithVFS(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared) { + // Register our VFS to handle this remote file + SQLiteDuckDBCacheVFS::Register(context); + + SQLiteDB result; + int flags = GetOpenFlags(options, is_shared, true); + + + auto rc = sqlite3_open_v2(path.c_str(), &result.db, flags, SQLiteDuckDBCacheVFS::GetVFSNameForContext(context)); + if (rc != SQLITE_OK) { + HandleOpenError(path, rc, &context); + } + + ApplyBusyTimeout(result.db, options); + + return result; +} + +// Original entry point - local files only (preserved for backward compatibility) +SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, bool is_shared) { + return OpenLocal(path, options, is_shared); +} + +// Main entry point for opening SQLite databases - handles both local and remote files +// Remote files (HTTP/HTTPS) use DuckDB's VFS with caching, local files use standard SQLite SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared) { - return Open(path, options, is_shared); + if (FileSystem::IsRemoteFile(path)) { + if (SQLiteDuckDBCacheVFS::CanHandlePath(context, path)) { + return OpenWithVFS(path, options, context, is_shared); + } else { + // Path not supported by our VFS - use standard SQLite + return OpenLocal(path, options, is_shared); + } + } else { + // Local files use standard SQLite file handling + return OpenLocal(path, options, is_shared); + } } + bool SQLiteDB::TryPrepare(const string &query, SQLiteStatement &stmt) { stmt.db = db; if (debug_sqlite_print_queries) { Printer::Print(query + "\n"); } auto rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt.stmt, nullptr); + if (rc != SQLITE_OK) { return false; } diff --git a/src/sqlite_extension.cpp b/src/sqlite_extension.cpp index f85328f..92b9310 100644 --- a/src/sqlite_extension.cpp +++ b/src/sqlite_extension.cpp @@ -4,6 +4,7 @@ #include "duckdb.hpp" #include "sqlite_db.hpp" +#include "sqlite_duckdb_vfs_cache.hpp" #include "sqlite_scanner.hpp" #include "sqlite_storage.hpp" #include "sqlite_scanner_extension.hpp" @@ -11,6 +12,7 @@ #include "duckdb/catalog/catalog.hpp" #include "duckdb/main/extension_util.hpp" #include "duckdb/parser/parsed_data/create_table_function_info.hpp" +#include "duckdb/planner/extension_callback.hpp" using namespace duckdb; @@ -20,15 +22,32 @@ static void SetSqliteDebugQueryPrint(ClientContext &context, SetScope scope, Val SQLiteDB::DebugSetPrintQueries(BooleanValue::Get(parameter)); } +// Cleanup callback for VFS when connection is closed +class SQLiteVFSCleanupCallback : public ExtensionCallback { +public: + void OnConnectionClosed(ClientContext &context) override { + // Unregister the VFS for this context if it was registered + SQLiteDuckDBCacheVFS::Unregister(context); + } +}; + static void LoadInternal(DatabaseInstance &db) { - SqliteScanFunction sqlite_fun; - ExtensionUtil::RegisterFunction(db, sqlite_fun); + // Create function instances inline like built-in functions do + // This avoids any static storage issues + { + SqliteScanFunction sqlite_scan; + ExtensionUtil::RegisterFunction(db, sqlite_scan); + } - SqliteAttachFunction attach_func; - ExtensionUtil::RegisterFunction(db, attach_func); + { + SqliteAttachFunction sqlite_attach; + ExtensionUtil::RegisterFunction(db, sqlite_attach); + } - SQLiteQueryFunction query_func; - ExtensionUtil::RegisterFunction(db, query_func); + { + SQLiteQueryFunction sqlite_query; + ExtensionUtil::RegisterFunction(db, sqlite_query); + } auto &config = DBConfig::GetConfig(db); config.AddExtensionOption("sqlite_all_varchar", "Load all SQLite columns as VARCHAR columns", LogicalType::BOOLEAN); @@ -36,7 +55,14 @@ static void LoadInternal(DatabaseInstance &db) { config.AddExtensionOption("sqlite_debug_show_queries", "DEBUG SETTING: print all queries sent to SQLite to stdout", LogicalType::BOOLEAN, Value::BOOLEAN(false), SetSqliteDebugQueryPrint); - config.storage_extensions["sqlite_scanner"] = make_uniq(); + if (config.storage_extensions.find("sqlite_scanner") == config.storage_extensions.end()) { + config.storage_extensions["sqlite_scanner"] = make_uniq(); + } + + // Register cleanup callback for VFS + config.extension_callbacks.push_back(make_uniq()); + + // HTTP SQLite support is handled entirely by VFS through DuckDB's CachingFileSystem } void SqliteScannerExtension::Load(DuckDB &db) { @@ -52,6 +78,8 @@ DUCKDB_EXTENSION_API const char *sqlite_scanner_version() { } DUCKDB_EXTENSION_API void sqlite_scanner_storage_init(DBConfig &config) { - config.storage_extensions["sqlite_scanner"] = make_uniq(); + if (config.storage_extensions.find("sqlite_scanner") == config.storage_extensions.end()) { + config.storage_extensions["sqlite_scanner"] = make_uniq(); + } } } diff --git a/src/sqlite_scanner.cpp b/src/sqlite_scanner.cpp index eb8b5ff..56b2289 100644 --- a/src/sqlite_scanner.cpp +++ b/src/sqlite_scanner.cpp @@ -55,7 +55,7 @@ static unique_ptr SqliteBind(ClientContext &context, TableFunction SQLiteStatement stmt; SQLiteOpenOptions options; options.access_mode = AccessMode::READ_ONLY; - db = SQLiteDB::Open(result->file_name, options); + db = SQLiteDB::Open(result->file_name, options, context); ColumnList columns; vector> constraints; @@ -96,7 +96,7 @@ static void SqliteInitInternal(ClientContext &context, const SqliteBindData &bin if (!local_state.db) { SQLiteOpenOptions options; options.access_mode = AccessMode::READ_ONLY; - local_state.owned_db = SQLiteDB::Open(bind_data.file_name.c_str(), options); + local_state.owned_db = SQLiteDB::Open(bind_data.file_name.c_str(), options, context); local_state.db = &local_state.owned_db; } string sql; @@ -392,7 +392,7 @@ static void AttachFunction(ClientContext &context, TableFunctionInput &data_p, D SQLiteOpenOptions options; options.access_mode = AccessMode::READ_ONLY; - SQLiteDB db = SQLiteDB::Open(data.file_name, options); + SQLiteDB db = SQLiteDB::Open(data.file_name, options, context); auto dconn = Connection(context.db->GetDatabase(context)); { auto tables = db.GetTables(); diff --git a/test/sql/scanner/http_sqlite_01_basic_scan.test b/test/sql/scanner/http_sqlite_01_basic_scan.test new file mode 100644 index 0000000..089f647 --- /dev/null +++ b/test/sql/scanner/http_sqlite_01_basic_scan.test @@ -0,0 +1,28 @@ +# name: test/sql/scanner/http_sqlite_01_basic_scan.test +# description: Basic HTTP SQLite scan functionality +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 1: Basic remote SQLite query +query I +SELECT COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Artist'); +---- +275 + +# Test 2: Query with WHERE clause and ORDER BY +query IT +SELECT ArtistId, Name FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Artist') +WHERE Name LIKE 'A%' +ORDER BY Name +LIMIT 3; +---- +43 A Cor Do Som +1 AC/DC +230 Aaron Copland & London Symphony Orchestra \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_02_concurrent_scans.test b/test/sql/scanner/http_sqlite_02_concurrent_scans.test new file mode 100644 index 0000000..ba19847 --- /dev/null +++ b/test/sql/scanner/http_sqlite_02_concurrent_scans.test @@ -0,0 +1,20 @@ +# name: test/sql/scanner/http_sqlite_02_concurrent_scans.test +# description: Multiple concurrent scans of remote SQLite database +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 3: Multiple concurrent scans +query III +SELECT + (SELECT COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Artist')) as artists, + (SELECT COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Album')) as albums, + (SELECT COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track')) as tracks; +---- +275 347 3503 \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_03_joins.test b/test/sql/scanner/http_sqlite_03_joins.test new file mode 100644 index 0000000..5ecb866 --- /dev/null +++ b/test/sql/scanner/http_sqlite_03_joins.test @@ -0,0 +1,25 @@ +# name: test/sql/scanner/http_sqlite_03_joins.test +# description: Joins between remote SQLite tables +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 4: Join between remote tables +query TI +SELECT ar.Name as Artist, COUNT(*) as AlbumCount +FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Artist') ar +JOIN sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Album') al +ON ar.ArtistId = al.ArtistId +GROUP BY ar.Name +ORDER BY AlbumCount DESC, ar.Name +LIMIT 3; +---- +Iron Maiden 21 +Led Zeppelin 14 +Deep Purple 11 \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_06_cte.test b/test/sql/scanner/http_sqlite_06_cte.test new file mode 100644 index 0000000..2852aed --- /dev/null +++ b/test/sql/scanner/http_sqlite_06_cte.test @@ -0,0 +1,27 @@ +# name: test/sql/scanner/http_sqlite_06_cte.test +# description: CTE with remote SQLite tables +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 9: CTE with remote tables +query TI +WITH top_genres AS ( + SELECT g.GenreId, g.Name, COUNT(*) as track_count + FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Genre') g + JOIN sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track') t ON g.GenreId = t.GenreId + GROUP BY g.GenreId, g.Name + ORDER BY track_count DESC + LIMIT 3 +) +SELECT Name, track_count FROM top_genres ORDER BY Name; +---- +Latin 579 +Metal 374 +Rock 1297 \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_07_consistency.test b/test/sql/scanner/http_sqlite_07_consistency.test new file mode 100644 index 0000000..aa02b12 --- /dev/null +++ b/test/sql/scanner/http_sqlite_07_consistency.test @@ -0,0 +1,29 @@ +# name: test/sql/scanner/http_sqlite_07_consistency.test +# description: Consistency check for multiple scans +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 10: Consistency check - multiple scans return same results +query IIIT +WITH counts AS ( + SELECT 'scan1' as scan_id, COUNT(*) as count FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track') + UNION ALL + SELECT 'scan2', COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track') + UNION ALL + SELECT 'scan3', COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track') +) +SELECT + COUNT(*) as scan_count, + MIN(count) as min_count, + MAX(count) as max_count, + CASE WHEN MIN(count) = MAX(count) THEN 'CONSISTENT' ELSE 'INCONSISTENT' END as consistency +FROM counts; +---- +3 3503 3503 CONSISTENT \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_08_error_handling.test b/test/sql/scanner/http_sqlite_08_error_handling.test new file mode 100644 index 0000000..be61b98 --- /dev/null +++ b/test/sql/scanner/http_sqlite_08_error_handling.test @@ -0,0 +1,29 @@ +# name: test/sql/scanner/http_sqlite_08_error_handling.test +# description: Error handling for remote SQLite databases +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test: Invalid domain (network error) +statement error +SELECT * FROM sqlite_scan('https://this-domain-does-not-exist-12345.invalid/test.db', 'test'); +---- +unable to open database file + +# Test: Valid file but not SQLite format +statement error +SELECT * FROM sqlite_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/README.md', 'test'); +---- +file is not a database + +# Test: Non-existent file on valid domain (404) +statement error +SELECT * FROM sqlite_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/definitely_nonexistent.db', 'test'); +---- +unable to open database file \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_09_http_errors.test b/test/sql/scanner/http_sqlite_09_http_errors.test new file mode 100644 index 0000000..df20d23 --- /dev/null +++ b/test/sql/scanner/http_sqlite_09_http_errors.test @@ -0,0 +1,47 @@ +# name: test/sql/scanner/http_sqlite_09_http_errors.test +# description: Test HTTP error code mapping to SQLite errors +# group: [sqlite_scanner] + +# Note: httpbin.org tests don't check specific error messages because: +# 1. httpbin.org may be rate-limited or down, causing connection errors +# 2. Different platforms may format HTTP errors differently +# 3. We still verify that these URLs produce errors, just not the exact message + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 404 error - should map to "unable to open database file" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/404', 'test'); +---- + +# Test 403 error - should map to "access permission denied" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/403', 'test'); +---- + +# Test 401 error - should also map to "access permission denied" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/401', 'test'); +---- + +# Test 500 error - should map to "unable to open database file" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/500', 'test'); +---- + +# Test 429 error - should map to "database is locked" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/429', 'test'); +---- + +# Test with non-existent GitHub file (real-world 404 scenario) +statement error +SELECT * FROM sqlite_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/nonexistent_file_404.db', 'test'); +---- +unable to open database file \ No newline at end of file From 5d4e241f2010cfb378dfbb6ff169c568cf87266c Mon Sep 17 00:00:00 2001 From: ak2k <19240940+ak2k@users.noreply.github.com> Date: Fri, 27 Jun 2025 16:49:53 -0400 Subject: [PATCH 5/5] Add HTTP ATTACH support and improve error handling This commit completes the HTTP SQLite support by enabling ATTACH functionality for remote databases and improving error handling throughout the codebase. Key changes: - Replace generic std::runtime_error with specific DuckDB exception types (BinderException, ConnectionException, IOException, InternalException) - Update GetSQLiteTransaction to handle missing transactions gracefully - Add validation for busy_timeout in SQLiteAttach to prevent overflow - Implement proper move semantics using swap - Add Copy/Equals methods to AttachFunctionData for proper state management - Enable ATTACH functionality for remote SQLite databases - Defer transaction start to prevent deadlocks with remote file access --- .../storage/sqlite_transaction_manager.hpp | 4 +- src/sqlite_db.cpp | 36 +++++++++----- src/sqlite_scanner.cpp | 22 ++++++++- src/sqlite_storage.cpp | 8 ++- src/storage/sqlite_catalog.cpp | 3 +- src/storage/sqlite_schema_entry.cpp | 15 ++++-- src/storage/sqlite_table_entry.cpp | 5 +- src/storage/sqlite_transaction_manager.cpp | 17 +++++-- test/sql/scanner/http_sqlite_04_attach.test | 24 +++++++++ .../http_sqlite_05_complex_queries.test | 49 +++++++++++++++++++ 10 files changed, 155 insertions(+), 28 deletions(-) create mode 100644 test/sql/scanner/http_sqlite_04_attach.test create mode 100644 test/sql/scanner/http_sqlite_05_complex_queries.test diff --git a/src/include/storage/sqlite_transaction_manager.hpp b/src/include/storage/sqlite_transaction_manager.hpp index 8f9cafc..38452a1 100644 --- a/src/include/storage/sqlite_transaction_manager.hpp +++ b/src/include/storage/sqlite_transaction_manager.hpp @@ -27,8 +27,10 @@ class SQLiteTransactionManager : public TransactionManager { private: SQLiteCatalog &sqlite_catalog; - mutex transaction_lock; reference_map_t> transactions; + + // Function-local static mutex to avoid Windows DLL initialization issues + static mutex& GetTransactionLock(); }; } // namespace duckdb diff --git a/src/sqlite_db.cpp b/src/sqlite_db.cpp index 23f3656..233ba41 100644 --- a/src/sqlite_db.cpp +++ b/src/sqlite_db.cpp @@ -4,7 +4,11 @@ #include "duckdb/storage/table_storage_info.hpp" #include "duckdb/parser/column_list.hpp" #include "duckdb/parser/parser.hpp" +#include "duckdb/common/exception.hpp" +#include "duckdb/common/exception/http_exception.hpp" #include "duckdb/common/file_open_flags.hpp" +#include "duckdb/common/swap.hpp" +#include "duckdb/common/mutex.hpp" #include "duckdb/main/client_context.hpp" #include "sqlite_db.hpp" #include "sqlite_stmt.hpp" @@ -24,12 +28,16 @@ SQLiteDB::~SQLiteDB() { Close(); } -SQLiteDB::SQLiteDB(SQLiteDB &&other) noexcept { - std::swap(db, other.db); +SQLiteDB::SQLiteDB(SQLiteDB &&other) noexcept : db(nullptr) { + swap(db, other.db); } SQLiteDB &SQLiteDB::operator=(SQLiteDB &&other) noexcept { - std::swap(db, other.db); + if (this != &other) { + // Close any existing database first + Close(); + swap(db, other.db); + } return *this; } @@ -56,12 +64,11 @@ int SQLiteDB::GetOpenFlags(const SQLiteOpenOptions &options, bool is_shared, boo void SQLiteDB::ApplyBusyTimeout(sqlite3 *db, const SQLiteOpenOptions &options) { if (options.busy_timeout > 0) { if (options.busy_timeout > NumericLimits::Maximum()) { - throw std::runtime_error("busy_timeout out of range - must be within " - "valid range for type int"); + throw BinderException("busy_timeout out of range - must be within valid range for type int"); } auto rc = sqlite3_busy_timeout(db, int(options.busy_timeout)); if (rc != SQLITE_OK) { - throw std::runtime_error("Failed to set busy timeout"); + throw ConnectionException("Failed to set busy timeout: %s", sqlite3_errmsg(db)); } } } @@ -106,7 +113,7 @@ void SQLiteDB::HandleOpenError(const string &path, int rc, ClientContext *contex error_msg = sqlite3_errstr(rc); break; } - throw std::runtime_error("Unable to open database \"" + path + "\": " + error_msg); + throw ConnectionException("Unable to open database \"%s\": %s", path, error_msg); } // Opens a local SQLite database file using standard SQLite file handling @@ -167,8 +174,14 @@ SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, Cl } } +void SQLiteDB::CheckDBValid(sqlite3 *db) { + if (!db) { + throw InternalException("SQLite database operation called with null database pointer"); + } +} bool SQLiteDB::TryPrepare(const string &query, SQLiteStatement &stmt) { + CheckDBValid(db); stmt.db = db; if (debug_sqlite_print_queries) { Printer::Print(query + "\n"); @@ -184,20 +197,19 @@ bool SQLiteDB::TryPrepare(const string &query, SQLiteStatement &stmt) { SQLiteStatement SQLiteDB::Prepare(const string &query) { SQLiteStatement stmt; if (!TryPrepare(query, stmt)) { - string error = "Failed to prepare query \"" + query + "\": " + string(sqlite3_errmsg(db)); - throw std::runtime_error(error); + throw BinderException("Failed to prepare query \"%s\": %s", query, sqlite3_errmsg(db)); } return stmt; } void SQLiteDB::Execute(const string &query) { + CheckDBValid(db); if (debug_sqlite_print_queries) { Printer::Print(query + "\n"); } auto rc = sqlite3_exec(db, query.c_str(), nullptr, nullptr, nullptr); if (rc != SQLITE_OK) { - string error = "Failed to execute query \"" + query + "\": " + string(sqlite3_errmsg(db)); - throw std::runtime_error(error); + throw IOException("Failed to execute query \"%s\": %s", query, sqlite3_errmsg(db)); } } @@ -259,7 +271,7 @@ void SQLiteDB::GetIndexInfo(const string &index_name, string &sql, string &table sql = stmt.GetValue(1); return; } - throw InternalException("GetViewInfo - index \"%s\" not found", index_name); + throw InternalException("GetIndexInfo - index \"%s\" not found", index_name); } void SQLiteDB::GetViewInfo(const string &view_name, string &sql) { diff --git a/src/sqlite_scanner.cpp b/src/sqlite_scanner.cpp index 56b2289..c5e9708 100644 --- a/src/sqlite_scanner.cpp +++ b/src/sqlite_scanner.cpp @@ -72,7 +72,7 @@ static unique_ptr SqliteBind(ClientContext &context, TableFunction } if (names.empty()) { - throw std::runtime_error("no columns for table " + result->table_name); + throw BinderException("Table \"%s\" has no columns", result->table_name); } if (!db.GetRowIdInfo(result->table_name, result->row_id_info)) { @@ -81,6 +81,7 @@ static unique_ptr SqliteBind(ClientContext &context, TableFunction result->names = names; result->types = return_types; + result->global_db = nullptr; return std::move(result); } @@ -313,7 +314,7 @@ static void SqliteScan(ClientContext &context, TableFunctionInput &data, DataChu out_vec, (const char *)sqlite3_value_blob(val), sqlite3_value_bytes(val)); break; default: - throw std::runtime_error(out_vec.GetType().ToString()); + throw InternalException("Unsupported type \"%s\" for SQLite value conversion", out_vec.GetType().ToString()); } } out_idx++; @@ -365,6 +366,23 @@ struct AttachFunctionData : public TableFunctionData { bool finished = false; bool overwrite = false; string file_name = ""; + + // Override virtual methods from FunctionData + unique_ptr Copy() const override { + auto result = make_uniq(); + result->finished = finished; + result->overwrite = overwrite; + result->file_name = file_name; + result->column_ids = column_ids; + return std::move(result); + } + + bool Equals(const FunctionData &other) const override { + auto &other_attach = other.Cast(); + return finished == other_attach.finished && + overwrite == other_attach.overwrite && + file_name == other_attach.file_name; + } }; static unique_ptr AttachBind(ClientContext &context, TableFunctionBindInput &input, diff --git a/src/sqlite_storage.cpp b/src/sqlite_storage.cpp index 09d212d..46150c9 100644 --- a/src/sqlite_storage.cpp +++ b/src/sqlite_storage.cpp @@ -9,6 +9,7 @@ #include "duckdb/transaction/transaction_manager.hpp" #include "duckdb/catalog/catalog_entry/schema_catalog_entry.hpp" #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" +#include "duckdb/common/limits.hpp" namespace duckdb { @@ -19,11 +20,16 @@ static unique_ptr SQLiteAttach(StorageExtensionInfo *storage_info, Clie options.access_mode = access_mode; for (auto &entry : info.options) { if (StringUtil::CIEquals(entry.first, "busy_timeout")) { - options.busy_timeout = entry.second.GetValue(); + uint64_t timeout_value = entry.second.GetValue(); + if (timeout_value > NumericLimits::Maximum()) { + throw InvalidInputException("busy_timeout out of range - must be within valid range for type int"); + } + options.busy_timeout = timeout_value; } else if (StringUtil::CIEquals(entry.first, "journal_mode")) { options.journal_mode = entry.second.ToString(); } } + return make_uniq(db, info.path, std::move(options)); } diff --git a/src/storage/sqlite_catalog.cpp b/src/storage/sqlite_catalog.cpp index 5f53d6c..055a94f 100644 --- a/src/storage/sqlite_catalog.cpp +++ b/src/storage/sqlite_catalog.cpp @@ -12,8 +12,7 @@ namespace duckdb { SQLiteCatalog::SQLiteCatalog(AttachedDatabase &db_p, const string &path, SQLiteOpenOptions options_p) : Catalog(db_p), path(path), options(std::move(options_p)), in_memory(path == ":memory:"), active_in_memory(false), in_memory_db_initialized(false) { if (options.busy_timeout > 0 && options.busy_timeout > NumericLimits::Maximum()) { - throw std::runtime_error("busy_timeout out of range - must be within " - "valid range for type int"); + throw BinderException("busy_timeout out of range - must be within valid range for type int"); } // In-memory database is now opened lazily in GetInMemoryDatabase to support deferred initialization } diff --git a/src/storage/sqlite_schema_entry.cpp b/src/storage/sqlite_schema_entry.cpp index 9657797..e1ee0f4 100644 --- a/src/storage/sqlite_schema_entry.cpp +++ b/src/storage/sqlite_schema_entry.cpp @@ -12,15 +12,20 @@ #include "duckdb/parser/parsed_data/alter_info.hpp" #include "duckdb/parser/parsed_data/alter_table_info.hpp" #include "duckdb/parser/parsed_expression_iterator.hpp" +#include "duckdb/transaction/transaction.hpp" namespace duckdb { SQLiteSchemaEntry::SQLiteSchemaEntry(Catalog &catalog, CreateSchemaInfo &info) : SchemaCatalogEntry(catalog, info) { } -SQLiteTransaction &GetSQLiteTransaction(CatalogTransaction transaction) { +SQLiteTransaction &GetSQLiteTransaction(CatalogTransaction transaction, Catalog &catalog) { if (!transaction.transaction) { - throw InternalException("No transaction!?"); + // This should not happen in normal operation - transactions should be initialized before use. + // Creating one here prevents deadlocks but may indicate missing transaction initialization. + D_ASSERT(false && "Transaction should have been initialized before reaching this point"); + auto &new_transaction = Transaction::Get(transaction.GetContext(), catalog); + return new_transaction.Cast(); } return transaction.transaction->Cast(); } @@ -52,7 +57,7 @@ void SQLiteSchemaEntry::TryDropEntry(ClientContext &context, CatalogType catalog } optional_ptr SQLiteSchemaEntry::CreateTable(CatalogTransaction transaction, BoundCreateTableInfo &info) { - auto &sqlite_transaction = GetSQLiteTransaction(transaction); + auto &sqlite_transaction = GetSQLiteTransaction(transaction, catalog); auto &base_info = info.Base(); auto table_name = base_info.table; if (base_info.on_conflict == OnCreateConflict::REPLACE_ON_CONFLICT) { @@ -143,7 +148,7 @@ optional_ptr SQLiteSchemaEntry::CreateView(CatalogTransaction tran // CREATE OR REPLACE - drop any existing entries first (if any) TryDropEntry(transaction.GetContext(), CatalogType::VIEW_ENTRY, info.view_name); } - auto &sqlite_transaction = GetSQLiteTransaction(transaction); + auto &sqlite_transaction = GetSQLiteTransaction(transaction, catalog); sqlite_transaction.GetDB().Execute(GetCreateViewSQL(info)); return GetEntry(transaction, CatalogType::VIEW_ENTRY, info.view_name); } @@ -298,7 +303,7 @@ void SQLiteSchemaEntry::DropEntry(ClientContext &context, DropInfo &info) { optional_ptr SQLiteSchemaEntry::LookupEntry(CatalogTransaction transaction, const EntryLookupInfo &lookup_info) { - auto &sqlite_transaction = GetSQLiteTransaction(transaction); + auto &sqlite_transaction = GetSQLiteTransaction(transaction, catalog); switch (lookup_info.GetCatalogType()) { case CatalogType::INDEX_ENTRY: case CatalogType::TABLE_ENTRY: diff --git a/src/storage/sqlite_table_entry.cpp b/src/storage/sqlite_table_entry.cpp index fd17c2d..365bb36 100644 --- a/src/storage/sqlite_table_entry.cpp +++ b/src/storage/sqlite_table_entry.cpp @@ -47,7 +47,10 @@ TableFunction SQLiteTableEntry::GetScanFunction(ClientContext &context, unique_p result->table = this; bind_data = std::move(result); - return static_cast(SqliteScanFunction()); + // Use a function-local static to ensure thread-safe initialization + // and avoid static initialization order issues (especially on Windows DLLs) + static SqliteScanFunction scan_function; + return static_cast(scan_function); } TableStorageInfo SQLiteTableEntry::GetStorageInfo(ClientContext &context) { diff --git a/src/storage/sqlite_transaction_manager.cpp b/src/storage/sqlite_transaction_manager.cpp index d800794..c9598c7 100644 --- a/src/storage/sqlite_transaction_manager.cpp +++ b/src/storage/sqlite_transaction_manager.cpp @@ -3,15 +3,24 @@ namespace duckdb { +// Function-local static mutex to avoid Windows DLL initialization issues +mutex& SQLiteTransactionManager::GetTransactionLock() { + static mutex transaction_lock; + return transaction_lock; +} + SQLiteTransactionManager::SQLiteTransactionManager(AttachedDatabase &db_p, SQLiteCatalog &sqlite_catalog) : TransactionManager(db_p), sqlite_catalog(sqlite_catalog) { } Transaction &SQLiteTransactionManager::StartTransaction(ClientContext &context) { auto transaction = make_uniq(sqlite_catalog, *this, context); - transaction->Start(); + // Defer transaction start until first use to avoid potential deadlocks. + // Starting here would trigger DB connection initialization which can block + // for remote files while the MetaTransaction lock is held. + // The transaction will be started lazily in SQLiteTransaction::GetDB() auto &result = *transaction; - lock_guard l(transaction_lock); + lock_guard l(GetTransactionLock()); transactions[result] = std::move(transaction); return result; } @@ -19,7 +28,7 @@ Transaction &SQLiteTransactionManager::StartTransaction(ClientContext &context) ErrorData SQLiteTransactionManager::CommitTransaction(ClientContext &context, Transaction &transaction) { auto &sqlite_transaction = transaction.Cast(); sqlite_transaction.Commit(); - lock_guard l(transaction_lock); + lock_guard l(GetTransactionLock()); transactions.erase(transaction); return ErrorData(); } @@ -27,7 +36,7 @@ ErrorData SQLiteTransactionManager::CommitTransaction(ClientContext &context, Tr void SQLiteTransactionManager::RollbackTransaction(Transaction &transaction) { auto &sqlite_transaction = transaction.Cast(); sqlite_transaction.Rollback(); - lock_guard l(transaction_lock); + lock_guard l(GetTransactionLock()); transactions.erase(transaction); } diff --git a/test/sql/scanner/http_sqlite_04_attach.test b/test/sql/scanner/http_sqlite_04_attach.test new file mode 100644 index 0000000..1eb8850 --- /dev/null +++ b/test/sql/scanner/http_sqlite_04_attach.test @@ -0,0 +1,24 @@ +# name: test/sql/scanner/http_sqlite_04_attach.test +# description: ATTACH remote SQLite database +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 5: Attach remote database +statement ok +ATTACH 'https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite' AS http_db (TYPE sqlite); + +# Test 6: Query attached database +query I +SELECT COUNT(*) FROM http_db.Track; +---- +3503 + +statement ok +DETACH http_db; \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_05_complex_queries.test b/test/sql/scanner/http_sqlite_05_complex_queries.test new file mode 100644 index 0000000..d08a4f8 --- /dev/null +++ b/test/sql/scanner/http_sqlite_05_complex_queries.test @@ -0,0 +1,49 @@ +# name: test/sql/scanner/http_sqlite_05_complex_queries.test +# description: Complex queries on attached remote SQLite database +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# First attach the database +statement ok +ATTACH 'https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite' AS http_db (TYPE sqlite); + +# Test 7: Complex aggregation on attached database +query TIR +SELECT g.Name as Genre, COUNT(*) as TrackCount, ROUND(AVG(t.Milliseconds/1000.0), 2) as AvgSeconds +FROM http_db.Track t +JOIN http_db.Genre g ON t.GenreId = g.GenreId +GROUP BY g.Name +ORDER BY TrackCount DESC +LIMIT 3; +---- +Rock 1297 283.91 +Latin 579 232.86 +Metal 374 309.75 + +# Test 8: Window functions on attached database +query TTII +SELECT + ar.Name as Artist, + al.Title as Album, + COUNT(t.TrackId) as TrackCount, + ROW_NUMBER() OVER (ORDER BY COUNT(t.TrackId) DESC, ar.Name, al.Title) as AlbumRank +FROM http_db.Artist ar +JOIN http_db.Album al ON ar.ArtistId = al.ArtistId +JOIN http_db.Track t ON al.AlbumId = t.AlbumId +GROUP BY ar.Name, al.Title +ORDER BY TrackCount DESC, ar.Name, al.Title +LIMIT 3; +---- +Lenny Kravitz Greatest Hits 57 1 +Chico Buarque Minha Historia 34 2 +Eric Clapton Unplugged 30 3 + +statement ok +DETACH http_db; \ No newline at end of file