diff --git a/Makefile b/Makefile index e85d93b..fc85aba 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ include extension-ci-tools/makefiles/duckdb_extension.Makefile # Setup the sqlite3 tpch database data/db/tpch.db: release - command -v sqlite3 || (command -v brew && brew install sqlite) || (command -v choco && choco install sqlite -y) || (command -v apt-get && apt-get install -y sqlite3) || (command -v apk && apk add sqlite) || echo "no sqlite3" + command -v sqlite3 || (command -v brew && brew install sqlite) || (command -v choco && choco install sqlite -y) || (command -v apt-get && apt-get install -y sqlite3) || (command -v yum && yum install -y sqlite) || (command -v apk && apk add sqlite) || echo "no sqlite3" ./build/release/$(DUCKDB_PATH) < data/sql/tpch-export.duckdb || tree ./build/release || echo "neither tree not duck" sqlite3 data/db/tpch.db < data/sql/tpch-create.sqlite diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f514eca..3ce9f4f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(storage) add_library( sqlite_ext_library OBJECT - sqlite_db.cpp sqlite_extension.cpp sqlite_scanner.cpp sqlite_stmt.cpp + sqlite_db.cpp sqlite_duckdb_vfs_cache.cpp sqlite_extension.cpp sqlite_scanner.cpp sqlite_stmt.cpp sqlite_storage.cpp sqlite_utils.cpp) set(ALL_OBJECT_FILES ${ALL_OBJECT_FILES} $ diff --git a/src/include/sqlite_db.hpp b/src/include/sqlite_db.hpp index 8c156b2..1515ce3 100644 --- a/src/include/sqlite_db.hpp +++ b/src/include/sqlite_db.hpp @@ -14,6 +14,7 @@ namespace duckdb { class SQLiteStatement; struct IndexInfo; +class ClientContext; class SQLiteDB { public: @@ -30,7 +31,11 @@ class SQLiteDB { sqlite3 *db; public: + //! Open a SQLite database (local files only) static SQLiteDB Open(const string &path, const SQLiteOpenOptions &options, bool is_shared = false); + //! Open a SQLite database with support for both local and remote files (HTTP/HTTPS) + //! @param context Required for remote file access via DuckDB's VFS + static SQLiteDB Open(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared = false); bool TryPrepare(const string &query, SQLiteStatement &result); SQLiteStatement Prepare(const string &query); void Execute(const string &query); @@ -53,6 +58,17 @@ class SQLiteDB { bool IsOpen(); void Close(); + +private: + //! Internal implementation methods for opening SQLite databases + static int GetOpenFlags(const SQLiteOpenOptions &options, bool is_shared, bool is_remote = false); + static void ApplyBusyTimeout(sqlite3 *db, const SQLiteOpenOptions &options); + static void HandleOpenError(const string &path, int rc, ClientContext *context = nullptr); + static SQLiteDB OpenWithVFS(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared); + //! 
Open a local SQLite database file (no remote support) + static SQLiteDB OpenLocal(const string &path, const SQLiteOpenOptions &options, bool is_shared = false); + + static void CheckDBValid(sqlite3 *db); }; } // namespace duckdb diff --git a/src/include/sqlite_duckdb_vfs_cache.hpp b/src/include/sqlite_duckdb_vfs_cache.hpp new file mode 100644 index 0000000..71cf603 --- /dev/null +++ b/src/include/sqlite_duckdb_vfs_cache.hpp @@ -0,0 +1,114 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// sqlite_duckdb_vfs_cache.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb.hpp" +#include "duckdb/common/file_system.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/storage/buffer/buffer_handle.hpp" +#include "duckdb/storage/buffer_manager.hpp" +#include "duckdb/storage/caching_file_system.hpp" + +#include "sqlite3.h" + +namespace duckdb { + +class ClientContext; + +// Wrapper around DuckDB's CachingFileSystem for remote SQLite file access. +// Uses DuckDB's caching infrastructure to efficiently handle remote file I/O. +class DuckDBCachedFile { +public: + DuckDBCachedFile(ClientContext &context, const string &path); + ~DuckDBCachedFile(); + + // Read data from the file at the specified offset + int Read(void *buffer, int amount, sqlite3_int64 offset); + // Get the cached file size + sqlite3_int64 get_file_size(); + +private: + // Lazy initialization - defer DuckDB operations until first use + void ensure_initialized(); + + // Adaptive read-ahead constants + static constexpr idx_t MIN_READAHEAD_SIZE = static_cast(1024) * 1024; // 1MB + static constexpr idx_t MAX_READAHEAD_SIZE = static_cast(128) * 1024 * 1024; // 128MB + static constexpr idx_t SEQUENTIAL_THRESHOLD = static_cast(64) * 1024; // 64KB gap tolerance + + ClientContext &context; + const string path; + unique_ptr caching_handle; + bool initialized = false; + + // Adaptive read-ahead state + sqlite3_int64 last_read_offset = -1; // Track last read position + sqlite3_int64 last_read_end = -1; // End of last read (offset + amount) + idx_t current_readahead_size = MIN_READAHEAD_SIZE; // Current read-ahead block size + + // Helper methods for adaptive read-ahead + idx_t calculate_read_ahead_size(sqlite3_int64 offset, int amount) const; + bool is_sequential_read(sqlite3_int64 offset) const; + void update_read_ahead_state(sqlite3_int64 offset, int amount); +}; + +// SQLite Virtual File System (VFS) implementation that uses DuckDB's +// CachingFileSystem for efficient remote SQLite database access. +class SQLiteDuckDBCacheVFS { +public: + // Register the VFS with SQLite (thread-safe, idempotent) + static void Register(ClientContext &context); + // Unregister the VFS when context is destroyed + static void Unregister(ClientContext &context); + // Check if this path should be handled by our VFS (i.e., is it remote?) 
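+	//    ("remote" here means a path recognized by FileSystem::IsRemoteFile, e.g. http:// or https:// URLs)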
+ static bool CanHandlePath(ClientContext &context, const string &path); + // Get the VFS registration name for a context + static const char *GetVFSNameForContext(ClientContext &context); + // Get the default VFS registration name (for compatibility) + static const char *GetVFSName() { return "duckdb_cache_fs"; } + + // SQLite VFS interface methods (must be public for C callback registration) + static int Open(sqlite3_vfs *vfs, const char *filename, sqlite3_file *file, int flags, int *out_flags); + static int Delete(sqlite3_vfs *vfs, const char *filename, int sync_dir); + static int Access(sqlite3_vfs *vfs, const char *filename, int flags, int *result); + static int FullPathname(sqlite3_vfs *vfs, const char *filename, int out_size, char *out_buf); + static void *DlOpen(sqlite3_vfs *vfs, const char *filename); + static void DlError(sqlite3_vfs *vfs, int bytes, char *err_msg); + static void (*DlSym(sqlite3_vfs *vfs, void *handle, const char *symbol))(void); + static void DlClose(sqlite3_vfs *vfs, void *handle); + static int Randomness(sqlite3_vfs *vfs, int bytes, char *out); + static int Sleep(sqlite3_vfs *vfs, int microseconds); + static int CurrentTime(sqlite3_vfs *vfs, double *time); + static int GetLastError(sqlite3_vfs *vfs, int bytes, char *err_msg); + + // SQLite file I/O methods (must be public for C callback registration) + static int Close(sqlite3_file *file); + static int Read(sqlite3_file *file, void *buffer, int amount, sqlite3_int64 offset); + static int Write(sqlite3_file *file, const void *buffer, int amount, sqlite3_int64 offset); + static int Truncate(sqlite3_file *file, sqlite3_int64 size); + static int Sync(sqlite3_file *file, int flags); + static int FileSize(sqlite3_file *file, sqlite3_int64 *size); + static int Lock(sqlite3_file *file, int level); + static int Unlock(sqlite3_file *file, int level); + static int CheckReservedLock(sqlite3_file *file, int *result); + static int FileControl(sqlite3_file *file, int op, void *arg); + static int SectorSize(sqlite3_file *file); + static int DeviceCharacteristics(sqlite3_file *file); +}; + +// SQLite file handle structure that wraps our DuckDBCachedFile. +// Memory layout must be compatible with SQLite's expectations. +// This structure is allocated by SQLite and may cross module boundaries. +// We use raw pointers with explicit ownership rules to avoid DLL issues. +struct SQLiteDuckDBCachedFile { + sqlite3_file base; // Must be first member for C compatibility + DuckDBCachedFile *duckdb_file; // Raw pointer - explicitly deleted in Close() +}; + +} // namespace duckdb \ No newline at end of file diff --git a/src/include/storage/sqlite_catalog.hpp b/src/include/storage/sqlite_catalog.hpp index 0bf1ca2..bbc83cf 100644 --- a/src/include/storage/sqlite_catalog.hpp +++ b/src/include/storage/sqlite_catalog.hpp @@ -59,7 +59,7 @@ class SQLiteCatalog : public Catalog { string GetDBPath() override; //! Returns a reference to the in-memory database (if any) - SQLiteDB *GetInMemoryDatabase(); + SQLiteDB *GetInMemoryDatabase(ClientContext &context); //! Release the in-memory database (if there is any) void ReleaseInMemoryDatabase(); @@ -76,6 +76,8 @@ class SQLiteCatalog : public Catalog { mutex in_memory_lock; //! Whether or not there is any active transaction on the in-memory database bool active_in_memory; + //! 
Whether the in-memory database has been initialized + bool in_memory_db_initialized; }; } // namespace duckdb diff --git a/src/include/storage/sqlite_transaction.hpp b/src/include/storage/sqlite_transaction.hpp index dd7095d..8892688 100644 --- a/src/include/storage/sqlite_transaction.hpp +++ b/src/include/storage/sqlite_transaction.hpp @@ -10,6 +10,8 @@ #include "duckdb/transaction/transaction.hpp" #include "duckdb/common/case_insensitive_map.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/common/atomic.hpp" #include "sqlite_db.hpp" namespace duckdb { @@ -37,6 +39,10 @@ class SQLiteTransaction : public Transaction { SQLiteDB *db; SQLiteDB owned_db; case_insensitive_map_t> catalog_entries; + + // Atomic flags for thread-safe initialization + atomic started{false}; + atomic db_initialized{false}; }; } // namespace duckdb diff --git a/src/include/storage/sqlite_transaction_manager.hpp b/src/include/storage/sqlite_transaction_manager.hpp index 8f9cafc..38452a1 100644 --- a/src/include/storage/sqlite_transaction_manager.hpp +++ b/src/include/storage/sqlite_transaction_manager.hpp @@ -27,8 +27,10 @@ class SQLiteTransactionManager : public TransactionManager { private: SQLiteCatalog &sqlite_catalog; - mutex transaction_lock; reference_map_t> transactions; + + // Function-local static mutex to avoid Windows DLL initialization issues + static mutex& GetTransactionLock(); }; } // namespace duckdb diff --git a/src/sqlite_db.cpp b/src/sqlite_db.cpp index 4a1f697..233ba41 100644 --- a/src/sqlite_db.cpp +++ b/src/sqlite_db.cpp @@ -4,8 +4,15 @@ #include "duckdb/storage/table_storage_info.hpp" #include "duckdb/parser/column_list.hpp" #include "duckdb/parser/parser.hpp" +#include "duckdb/common/exception.hpp" +#include "duckdb/common/exception/http_exception.hpp" +#include "duckdb/common/file_open_flags.hpp" +#include "duckdb/common/swap.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/main/client_context.hpp" #include "sqlite_db.hpp" #include "sqlite_stmt.hpp" +#include "sqlite_duckdb_vfs_cache.hpp" namespace duckdb { @@ -21,56 +28,166 @@ SQLiteDB::~SQLiteDB() { Close(); } -SQLiteDB::SQLiteDB(SQLiteDB &&other) noexcept { - std::swap(db, other.db); +SQLiteDB::SQLiteDB(SQLiteDB &&other) noexcept : db(nullptr) { + swap(db, other.db); } SQLiteDB &SQLiteDB::operator=(SQLiteDB &&other) noexcept { - std::swap(db, other.db); + if (this != &other) { + // Close any existing database first + Close(); + swap(db, other.db); + } return *this; } -SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, bool is_shared) { - SQLiteDB result; +int SQLiteDB::GetOpenFlags(const SQLiteOpenOptions &options, bool is_shared, bool is_remote) { int flags = SQLITE_OPEN_PRIVATECACHE; - if (options.access_mode == AccessMode::READ_ONLY) { + + if (is_remote || options.access_mode == AccessMode::READ_ONLY) { + // Remote databases are always read-only flags |= SQLITE_OPEN_READONLY; } else { flags |= SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE; } + if (!is_shared) { // FIXME: we should just make sure we are not re-using the same `sqlite3` // object across threads flags |= SQLITE_OPEN_NOMUTEX; } + flags |= SQLITE_OPEN_EXRESCODE; - auto rc = sqlite3_open_v2(path.c_str(), &result.db, flags, nullptr); - if (rc != SQLITE_OK) { - throw std::runtime_error("Unable to open database \"" + path + "\": " + string(sqlite3_errstr(rc))); - } - // default busy time-out of 5 seconds + return flags; +} + +void SQLiteDB::ApplyBusyTimeout(sqlite3 *db, const SQLiteOpenOptions &options) { if (options.busy_timeout > 0) { 
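+		// sqlite3_busy_timeout() takes the timeout in milliseconds as an int; e.g. an ATTACH
+		// option such as BUSY_TIMEOUT 30000 (illustrative value) is expected to end up as
+		// sqlite3_busy_timeout(db, 30000) below, hence the range check first.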
if (options.busy_timeout > NumericLimits::Maximum()) { - throw std::runtime_error("busy_timeout out of range - must be within " - "valid range for type int"); + throw BinderException("busy_timeout out of range - must be within valid range for type int"); } - rc = sqlite3_busy_timeout(result.db, int(options.busy_timeout)); + auto rc = sqlite3_busy_timeout(db, int(options.busy_timeout)); if (rc != SQLITE_OK) { - throw std::runtime_error("Failed to set busy timeout"); + throw ConnectionException("Failed to set busy timeout: %s", sqlite3_errmsg(db)); } } +} + +void SQLiteDB::HandleOpenError(const string &path, int rc, ClientContext *context) { + // Map SQLite error codes to user-friendly messages + string error_msg; + int primary_error = rc & 0xFF; // Extract primary error code from extended error code + + // Check if this is a remote file handled by our custom VFS + bool is_remote_file = context && SQLiteDuckDBCacheVFS::CanHandlePath(*context, path); + + switch (primary_error) { + case SQLITE_CANTOPEN: + error_msg = "unable to open database file"; + break; + case SQLITE_PERM: + error_msg = "access permission denied"; + break; + case SQLITE_IOERR: + // For remote files, I/O errors are likely network/connection issues + if (is_remote_file) { + error_msg = "unable to open database file"; + } else { + error_msg = "disk I/O error"; + } + break; + case SQLITE_BUSY: + error_msg = "database is locked"; + break; + case SQLITE_NOMEM: + error_msg = "out of memory"; + break; + case SQLITE_READONLY: + error_msg = "attempt to write a readonly database"; + break; + case SQLITE_CORRUPT: + error_msg = "file is not a database"; + break; + default: + // Fall back to SQLite's error message for other codes + error_msg = sqlite3_errstr(rc); + break; + } + throw ConnectionException("Unable to open database \"%s\": %s", path, error_msg); +} + +// Opens a local SQLite database file using standard SQLite file handling +SQLiteDB SQLiteDB::OpenLocal(const string &path, const SQLiteOpenOptions &options, bool is_shared) { + SQLiteDB result; + int flags = GetOpenFlags(options, is_shared, false); + + auto rc = sqlite3_open_v2(path.c_str(), &result.db, flags, nullptr); + if (rc != SQLITE_OK) { + HandleOpenError(path, rc); + } + + ApplyBusyTimeout(result.db, options); + if (!options.journal_mode.empty()) { result.Execute("PRAGMA journal_mode=" + KeywordHelper::EscapeQuotes(options.journal_mode, '\'')); } return result; } +// Opens a remote SQLite database using DuckDB's custom VFS for HTTP/HTTPS support +SQLiteDB SQLiteDB::OpenWithVFS(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared) { + // Register our VFS to handle this remote file + SQLiteDuckDBCacheVFS::Register(context); + + SQLiteDB result; + int flags = GetOpenFlags(options, is_shared, true); + + + auto rc = sqlite3_open_v2(path.c_str(), &result.db, flags, SQLiteDuckDBCacheVFS::GetVFSNameForContext(context)); + if (rc != SQLITE_OK) { + HandleOpenError(path, rc, &context); + } + + ApplyBusyTimeout(result.db, options); + + return result; +} + +// Original entry point - local files only (preserved for backward compatibility) +SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, bool is_shared) { + return OpenLocal(path, options, is_shared); +} + +// Main entry point for opening SQLite databases - handles both local and remote files +// Remote files (HTTP/HTTPS) use DuckDB's VFS with caching, local files use standard SQLite +SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, 
ClientContext &context, bool is_shared) { + if (FileSystem::IsRemoteFile(path)) { + if (SQLiteDuckDBCacheVFS::CanHandlePath(context, path)) { + return OpenWithVFS(path, options, context, is_shared); + } else { + // Path not supported by our VFS - use standard SQLite + return OpenLocal(path, options, is_shared); + } + } else { + // Local files use standard SQLite file handling + return OpenLocal(path, options, is_shared); + } +} + +void SQLiteDB::CheckDBValid(sqlite3 *db) { + if (!db) { + throw InternalException("SQLite database operation called with null database pointer"); + } +} + bool SQLiteDB::TryPrepare(const string &query, SQLiteStatement &stmt) { + CheckDBValid(db); stmt.db = db; if (debug_sqlite_print_queries) { Printer::Print(query + "\n"); } auto rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt.stmt, nullptr); + if (rc != SQLITE_OK) { return false; } @@ -80,20 +197,19 @@ bool SQLiteDB::TryPrepare(const string &query, SQLiteStatement &stmt) { SQLiteStatement SQLiteDB::Prepare(const string &query) { SQLiteStatement stmt; if (!TryPrepare(query, stmt)) { - string error = "Failed to prepare query \"" + query + "\": " + string(sqlite3_errmsg(db)); - throw std::runtime_error(error); + throw BinderException("Failed to prepare query \"%s\": %s", query, sqlite3_errmsg(db)); } return stmt; } void SQLiteDB::Execute(const string &query) { + CheckDBValid(db); if (debug_sqlite_print_queries) { Printer::Print(query + "\n"); } auto rc = sqlite3_exec(db, query.c_str(), nullptr, nullptr, nullptr); if (rc != SQLITE_OK) { - string error = "Failed to execute query \"" + query + "\": " + string(sqlite3_errmsg(db)); - throw std::runtime_error(error); + throw IOException("Failed to execute query \"%s\": %s", query, sqlite3_errmsg(db)); } } @@ -155,7 +271,7 @@ void SQLiteDB::GetIndexInfo(const string &index_name, string &sql, string &table sql = stmt.GetValue(1); return; } - throw InternalException("GetViewInfo - index \"%s\" not found", index_name); + throw InternalException("GetIndexInfo - index \"%s\" not found", index_name); } void SQLiteDB::GetViewInfo(const string &view_name, string &sql) { diff --git a/src/sqlite_duckdb_vfs_cache.cpp b/src/sqlite_duckdb_vfs_cache.cpp new file mode 100644 index 0000000..eb185b0 --- /dev/null +++ b/src/sqlite_duckdb_vfs_cache.cpp @@ -0,0 +1,886 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// sqlite_duckdb_vfs_cache.cpp +// +// +//===----------------------------------------------------------------------===// + +#include "sqlite_duckdb_vfs_cache.hpp" + +#include "duckdb/common/exception.hpp" +#include "duckdb/common/exception/http_exception.hpp" +#include "duckdb/common/file_system.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/common/string_util.hpp" +#include "duckdb/common/unordered_map.hpp" +#include "duckdb/common/operator/cast_operators.hpp" +#include "duckdb/common/re2_regex.hpp" +#include "duckdb/main/client_context.hpp" +#include "duckdb/main/database.hpp" +#include "duckdb/storage/buffer_manager.hpp" +#include "duckdb/common/atomic.hpp" + +#include + +namespace duckdb { + +//===--------------------------------------------------------------------===// +// Concurrency Design +//===--------------------------------------------------------------------===// +// This VFS implementation is designed for safe concurrent access: +// +// VFS Registry (mutex-protected): +// - Each ClientContext gets its own VFS instance with unique name +// - Registration/unregistration operations are thread-safe +// 
- Mutex protects registry map operations, not file I/O +// +// File Operations (lock-free): +// - Each VFS instance is independent with no shared mutable state +// - File handles contain their own ClientContext pointer +// - DuckDB's CachingFileSystem provides internal synchronization +// +// Lifetime Management: +// - ClientContext MUST outlive all SQLite connections using its VFS +// - VFS automatically unregistered when ClientContext is destroyed +// - Multiple VFS instances share the same ExternalFileCache at DatabaseInstance level +// - Cache sharing is thread-safe through DuckDB's internal locking mechanisms +//===--------------------------------------------------------------------===// + +// Per-context VFS instances for thread safety. +// vfs_name uses sqlite3_malloc for cross-DLL safety. +struct DuckDBVFSWrapper { + sqlite3_vfs base; // Must be first - SQLite VFS structure + ClientContext *context; // The DuckDB context for this VFS + char *vfs_name; // Unique name for this VFS instance (allocated via sqlite3_malloc) + sqlite3_io_methods io_methods; // IO methods for this VFS instance + + // Error context storage for better diagnostics + mutable mutex error_mutex; + string last_error_message; + + ~DuckDBVFSWrapper() noexcept { + // Clean up using SQLite's allocator to match sqlite3_malloc + if (vfs_name) { + sqlite3_free(vfs_name); + vfs_name = nullptr; + } + } + + void set_last_error(const string &error) { + lock_guard lock(error_mutex); + last_error_message = error; + } + + string get_last_error() const { + lock_guard lock(error_mutex); + return last_error_message; + } +}; + +//===--------------------------------------------------------------------===// +// C/C++ Boundary Safety +//===--------------------------------------------------------------------===// +// Template to safely execute C++ code from SQLite's C callbacks. +// Catches all exceptions and converts them to appropriate SQLite error codes. +// This is critical because SQLite is written in C and cannot handle C++ exceptions. +// +// Usage: +// return SafeVFSCall(SQLITE_IOERR, [&]() { +// // C++ code that might throw +// return SQLITE_OK; +// }); +//===--------------------------------------------------------------------===// + +// Forward declaration +struct DuckDBVFSWrapper; + +//===--------------------------------------------------------------------===// +// HTTP Error Mapping +//===--------------------------------------------------------------------===// + +// HTTP error patterns for detection - compile-time constants +static constexpr const char* HTTP_ERROR_PATTERNS[] = { + "\"exception_type\":\"HTTP\"", + "\"exception_type\":\"IO\"", + "404 (Not Found)", + "403 (Forbidden)", + "401 (Unauthorized)", + "500 (Internal Server Error)", + "502 (Bad Gateway)", + "503 (Service Unavailable)", + "Unable to connect to URL", + "Could not establish connection", + "HTTP HEAD to", + "HTTP GET to" +}; + +// Regex-based HTTP status code extraction +static int extract_http_status(const string &error_msg) { + // Comprehensive regex pattern for all HTTP status code formats: + // Group 1: "status_code":"XXX" (JSON) + // Group 2: XXX (Description) (httpfs format) + // Group 3: (HTTP XXX) or HTTP code XXX or HTTP XXX + static duckdb_re2::Regex status_regex( + "\"status_code\":\"(\\d{3})\"|" // JSON format + "(\\d{3})\\s*\\([^)]+\\)|" // "404 (Not Found)" + "\\(?HTTP\\s+(?:code\\s+)?(\\d{3})\\)?" 
// "(HTTP 404)", "HTTP code 403", "HTTP 500" + ); + + duckdb_re2::Match match; + if (duckdb_re2::RegexSearch(error_msg, match, status_regex)) { + // Check which group captured the status code (groups are 1-indexed) + for (idx_t i = 1; i < match.groups.size(); i++) { + if (!match.groups[i].text.empty()) { + int32_t result; + if (TryCast::Operation(string_t(match.groups[i].text), result)) { + return result; + } + } + } + } + + return 0; // No status code found +} + +// HTTP error detection +static bool is_http_error(const string &error_msg) { + // Use std::any_of with StringUtil::Contains for cleaner, more efficient checking + return std::any_of(std::begin(HTTP_ERROR_PATTERNS), std::end(HTTP_ERROR_PATTERNS), + [&error_msg](const char* pattern) { + return StringUtil::Contains(error_msg, pattern); + }); +} + +// Map HTTP status code to SQLite error code +static int http_status_to_sqlite_error(int http_status) { + switch (http_status) { + case 404: + return SQLITE_CANTOPEN; // SQLite will interpret as "unable to open database file" + case 401: // Unauthorized (auth required) + case 403: + return SQLITE_PERM; // "permission denied" + case 408: + return SQLITE_IOERR_ACCESS; // Request timeout + case 429: + return SQLITE_BUSY; // Too many requests + default: + if (http_status >= 500 && http_status < 600) { + return SQLITE_IOERR; // Server errors + } + return 0; // Unknown/unmapped status + } +} + +template +static T SafeVFSCall(T error_value, Func&& func, DuckDBVFSWrapper *wrapper = nullptr, const char *path = nullptr, const char *method = nullptr) { + try { + return func(); + } catch (const std::exception &e) { + string error_msg = e.what(); + // fprintf(stderr, "DEBUG VFS: Exception caught in %s: %.200s\n", method ? method : "unknown", error_msg.c_str()); + // fprintf(stderr, "DEBUG VFS: IsHTTPError result: %s\n", IsHTTPError(error_msg) ? "true" : "false"); + + // Check if this is an HTTP error + if (is_http_error(error_msg)) { + // Store HTTP error context + if (wrapper) { + string full_error = "HTTP Error: "; + full_error += error_msg; + if (path) { + full_error += " (URL: "; + full_error += path; + full_error += ")"; + } + wrapper->set_last_error(full_error); + } + + // Try to map HTTP status to specific SQLite error + int http_status = extract_http_status(error_msg); + int sqlite_error = http_status_to_sqlite_error(http_status); + // Debug (uncomment for troubleshooting) + // fprintf(stderr, "DEBUG: Error message (first 200 chars): %.200s\n", error_msg.c_str()); + // fprintf(stderr, "DEBUG: HTTP Status: %d, SQLite Error: %d\n", http_status, sqlite_error); + if (sqlite_error != 0) { + return sqlite_error; + } + + // Special case: Network connection failures should be treated as "unable to open" + if (error_msg.find("Unable to connect to URL") != string::npos || + error_msg.find("Could not establish connection") != string::npos) { + return error_value == SQLITE_OK ? SQLITE_CANTOPEN : error_value; + } + + // Default for unmapped HTTP errors (server errors, etc.) + return error_value == SQLITE_OK ? SQLITE_IOERR : error_value; + } + + // Check for specific DuckDB exception types in the message + if (error_msg.find("Permission") != string::npos) { + if (wrapper) { + string full_error = "Permission denied: "; + full_error += error_msg; + if (path) { + full_error += " (Path: "; + full_error += path; + full_error += ")"; + } + wrapper->set_last_error(full_error); + } + return error_value == SQLITE_OK ? 
SQLITE_PERM : error_value; + } + + // Store generic error context + if (wrapper) { + string full_error = "Error: "; + full_error += error_msg; + if (path) { + full_error += " (Path: "; + full_error += path; + full_error += ")"; + } + wrapper->set_last_error(full_error); + } + + // Default error handling + return error_value; + } catch (...) { + // Unknown exception + if (wrapper) { + wrapper->set_last_error("Unknown error occurred"); + } + return error_value; + } +} + +// Global registry of VFS wrappers to manage their lifetime +// Use function-local statics to ensure proper initialization order on Windows +struct VFSRegistryData { + mutex registry_mutex; + unordered_map> registry; +}; + +static VFSRegistryData& GetVFSRegistryData() { + // Function-local static ensures thread-safe initialization + static VFSRegistryData data; + return data; +} + +// Sector size: minimum atomic write unit for the storage device +static constexpr int SQLITE_SECTOR_SIZE = 4096; + +// Initialize IO methods for a VFS wrapper +static void InitializeIOMethods(sqlite3_io_methods &io_methods) { + memset(&io_methods, 0, sizeof(io_methods)); + + io_methods.iVersion = 1; + io_methods.xClose = SQLiteDuckDBCacheVFS::Close; + io_methods.xRead = SQLiteDuckDBCacheVFS::Read; + io_methods.xWrite = SQLiteDuckDBCacheVFS::Write; + io_methods.xTruncate = SQLiteDuckDBCacheVFS::Truncate; + io_methods.xSync = SQLiteDuckDBCacheVFS::Sync; + io_methods.xFileSize = SQLiteDuckDBCacheVFS::FileSize; + io_methods.xLock = SQLiteDuckDBCacheVFS::Lock; + io_methods.xUnlock = SQLiteDuckDBCacheVFS::Unlock; + io_methods.xCheckReservedLock = SQLiteDuckDBCacheVFS::CheckReservedLock; + io_methods.xFileControl = SQLiteDuckDBCacheVFS::FileControl; + io_methods.xSectorSize = SQLiteDuckDBCacheVFS::SectorSize; + io_methods.xDeviceCharacteristics = SQLiteDuckDBCacheVFS::DeviceCharacteristics; + // Shared memory methods not needed for read-only remote files + io_methods.xShmMap = nullptr; + io_methods.xShmLock = nullptr; + io_methods.xShmBarrier = nullptr; + io_methods.xShmUnmap = nullptr; + io_methods.xFetch = nullptr; + io_methods.xUnfetch = nullptr; +} + +// Get the unique VFS name for a ClientContext +static string get_unique_vfs_name(const ClientContext *context) { + static atomic vfs_counter{0}; + return "duckdb_cache_vfs_" + to_string(vfs_counter.fetch_add(1)); +} + +//===--------------------------------------------------------------------===// +// DuckDBCachedFile Implementation +//===--------------------------------------------------------------------===// + +DuckDBCachedFile::DuckDBCachedFile(ClientContext &context, const string &path) + : context(context), path(path) { + // Defer actual file opening until first use to avoid doing DuckDB operations + // during SQLite VFS callbacks, which might be in a different serialization context +} + +DuckDBCachedFile::~DuckDBCachedFile() { + // Ensure proper cleanup of the caching handle + // The unique_ptr will automatically release the handle, + // but we add this explicit destructor for clarity and + // to enable future debugging/validation if needed + if (caching_handle) { + // Reset explicitly to ensure deterministic cleanup order + caching_handle.reset(); + } +} + +void DuckDBCachedFile::ensure_initialized() { + if (initialized) { + return; + } + + // Configure file flags for optimal caching behavior. + // Remote files use DIRECT_IO to bypass OS caching since DuckDB's + // CachingFileSystem provides its own intelligent block caching. 
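+	// Illustrative usage that reaches this lazy-open path (a sketch; it assumes the httpfs
+	// extension is available so DuckDB's file system can serve HTTP/HTTPS URLs):
+	//
+	//   SELECT * FROM sqlite_scan('https://example.com/data.db', 'my_table');
+	//
+	// The CachingFileSystem handle below is created only on the first Read()/FileSize()
+	// call coming from SQLite, not when the DuckDBCachedFile object is constructed.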
+ auto flags = FileFlags::FILE_FLAGS_READ; + if (FileSystem::IsRemoteFile(path)) { + flags |= FileFlags::FILE_FLAGS_DIRECT_IO; + } + + // Open the file through DuckDB's CachingFileSystem. + // The CachingFileSystem provides efficient caching of remote file data, + // though the actual read patterns are determined by our Read implementation. + auto caching_fs = CachingFileSystem::Get(context); + OpenFileInfo file_info(path); + // fprintf(stderr, "DEBUG: About to open file: %s\n", path.c_str()); + caching_handle = caching_fs.OpenFile(file_info, flags); + + + initialized = true; +} + + +int DuckDBCachedFile::Read(void *buffer, int amount, sqlite3_int64 offset) { + // Validate inputs to prevent integer overflow attacks + if (offset < 0 || amount < 0) { + return SQLITE_IOERR_READ; + } + + // Early return for empty reads (SQLite sometimes requests 0 bytes) + if (!buffer || amount == 0) { + return SQLITE_OK; + } + + // Ensure we're initialized before first read + // Let exceptions propagate to SafeVFSCall for unified error handling + ensure_initialized(); + + // Safety check - should never happen in normal operation + if (!caching_handle) { + return SQLITE_IOERR_READ; + } + + // Get current file size from DuckDB (handles validation/caching) + const sqlite3_int64 file_size = static_cast(caching_handle->GetFileSize()); + + // Check if we're reading past EOF + if (offset >= file_size) { + // Reading completely past EOF - zero-fill entire buffer + memset(buffer, 0, amount); + return SQLITE_IOERR_SHORT_READ; + } + + // Calculate how many bytes we can actually read + const sqlite3_int64 available_bytes = file_size - offset; + const int bytes_to_read = (available_bytes < amount) ? static_cast(available_bytes) : amount; + + // Calculate optimal read-ahead size based on access pattern + const idx_t readahead_size = calculate_read_ahead_size(offset, bytes_to_read); + + // Ensure we read at least the requested amount (up to EOF) + idx_t actual_read_size = MaxValue(static_cast(bytes_to_read), readahead_size); + + // Don't read beyond file end (use safe arithmetic to prevent overflow) + // We already know offset < file_size from the check above + const sqlite3_int64 remaining_bytes = file_size - offset; + if (static_cast(actual_read_size) > remaining_bytes) { + actual_read_size = static_cast(remaining_bytes); + } + + // Use DuckDB's CachingFileSystem with adaptive read-ahead + data_ptr_t read_buffer = nullptr; + auto buffer_handle = caching_handle->Read(read_buffer, actual_read_size, offset); + + // Validate read buffer before copying + if (!read_buffer) { + return SQLITE_IOERR_READ; + } + + // Copy the data we read + memcpy(buffer, read_buffer, bytes_to_read); + + // Note: SQLite validates the database header itself when opening the database, + // so we don't need to duplicate that validation here. + + // If we read less than requested, zero-fill the remainder + if (bytes_to_read < amount) { + memset(static_cast(buffer) + bytes_to_read, 0, amount - bytes_to_read); + } + + // Update read-ahead state after successful read + update_read_ahead_state(offset, bytes_to_read); + + // Return appropriate code based on whether we satisfied the full request + return (bytes_to_read < amount) ? SQLITE_IOERR_SHORT_READ : SQLITE_OK; +} + +sqlite3_int64 DuckDBCachedFile::get_file_size() { + try { + ensure_initialized(); + return static_cast(caching_handle->GetFileSize()); + } catch (...) 
{ + return -1; + } +} + +idx_t DuckDBCachedFile::calculate_read_ahead_size(sqlite3_int64 offset, int amount) const { + // First read or non-sequential access - use minimum size + if (last_read_offset == -1 || !is_sequential_read(offset)) { + return MIN_READAHEAD_SIZE; + } + + // Sequential read - double the current size up to maximum + const idx_t next_size = current_readahead_size * 2; + return MinValue(next_size, MAX_READAHEAD_SIZE); +} + +bool DuckDBCachedFile::is_sequential_read(sqlite3_int64 offset) const { + // Consider sequential if: + // 1. Reading from exactly where last read ended, OR + // 2. Reading within SEQUENTIAL_THRESHOLD of where last read ended + return (offset >= last_read_end) && + (offset <= last_read_end + static_cast(SEQUENTIAL_THRESHOLD)); +} + +void DuckDBCachedFile::update_read_ahead_state(sqlite3_int64 offset, int amount) { + // Update read-ahead size based on access pattern + if (is_sequential_read(offset)) { + // Sequential read - grow read-ahead size + current_readahead_size = MinValue(current_readahead_size * 2, MAX_READAHEAD_SIZE); + } else { + // Non-sequential read - reset to minimum + current_readahead_size = MIN_READAHEAD_SIZE; + } + + // Update position tracking + last_read_offset = offset; + last_read_end = offset + amount; +} + + +//===--------------------------------------------------------------------===// +// SQLiteDuckDBCacheVFS Implementation +//===--------------------------------------------------------------------===// + +bool SQLiteDuckDBCacheVFS::CanHandlePath(ClientContext &context, const string &path) { + return FileSystem::IsRemoteFile(path); +} + +void SQLiteDuckDBCacheVFS::Register(ClientContext &context) { + auto& registry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + // Check if this context already has a VFS registered + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end()) { + // Already registered for this context + return; + } + + // Find SQLite's default VFS to delegate some operations + sqlite3_vfs *default_vfs = sqlite3_vfs_find(nullptr); + if (!default_vfs) { + throw InternalException("Failed to find default SQLite VFS - SQLite may not be properly initialized"); + } + + // Create a new VFS wrapper for this context + auto wrapper = make_uniq(); + wrapper->context = &context; + + // Allocate VFS name using SQLite's allocator for DLL safety + const string temp_name = get_unique_vfs_name(&context); + wrapper->vfs_name = static_cast(sqlite3_malloc64(temp_name.length() + 1)); + if (!wrapper->vfs_name) { + throw InternalException("Failed to allocate memory for VFS name"); + } + memcpy(wrapper->vfs_name, temp_name.c_str(), temp_name.length() + 1); + + // Initialize the IO methods for this VFS instance + InitializeIOMethods(wrapper->io_methods); + + // Initialize the VFS structure + memset(&wrapper->base, 0, sizeof(wrapper->base)); + wrapper->base.iVersion = 1; + wrapper->base.szOsFile = sizeof(SQLiteDuckDBCachedFile); + wrapper->base.mxPathname = default_vfs->mxPathname; + wrapper->base.zName = wrapper->vfs_name; // Now using C-style string + wrapper->base.pAppData = wrapper.get(); // Store pointer to wrapper + + // Configure VFS methods + wrapper->base.xOpen = Open; + wrapper->base.xDelete = Delete; + wrapper->base.xAccess = Access; + wrapper->base.xFullPathname = FullPathname; + wrapper->base.xDlOpen = DlOpen; + wrapper->base.xDlError = DlError; + wrapper->base.xDlSym = DlSym; + wrapper->base.xDlClose = DlClose; + wrapper->base.xRandomness = Randomness; + 
wrapper->base.xSleep = Sleep; + wrapper->base.xCurrentTime = CurrentTime; + wrapper->base.xGetLastError = GetLastError; + + // Register this VFS with SQLite + int rc = sqlite3_vfs_register(&wrapper->base, 0); + if (rc != SQLITE_OK) { + throw InternalException("Failed to register DuckDB Cache VFS: %s", sqlite3_errstr(rc)); + } + + // Store in registry - wrapper ownership transfers to registry + registry_data.registry[&context] = std::move(wrapper); +} + +// Unregister the VFS associated with a ClientContext when it's being destroyed. +// This ensures proper cleanup of VFS resources. +void SQLiteDuckDBCacheVFS::Unregister(ClientContext &context) { + auto& registry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end()) { + // Unregister from SQLite + sqlite3_vfs_unregister(&it->second->base); + // Remove from registry + registry_data.registry.erase(it); + } +} + +// Get the unique VFS name associated with a specific ClientContext. +// Returns the default VFS name if no specific VFS is registered for this context. +const char *SQLiteDuckDBCacheVFS::GetVFSNameForContext(ClientContext &context) { + auto& registry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end() && it->second->vfs_name) { + return it->second->vfs_name; // Return the C-style string directly + } + + // Fallback to default name if not found (shouldn't happen) + return GetVFSName(); +} + +//===--------------------------------------------------------------------===// +// VFS Methods +//===--------------------------------------------------------------------===// + +// Macro to delegate VFS operations to SQLite's default VFS implementation. +// Used for system-level operations (randomness, sleep, time) that don't involve +// file I/O and thus don't need special handling for remote files. +#define DELEGATE_TO_DEFAULT_VFS(method_name, ...) \ + sqlite3_vfs *default_vfs = sqlite3_vfs_find(nullptr); \ + if (default_vfs && default_vfs->method_name) { \ + return default_vfs->method_name(default_vfs, __VA_ARGS__); \ + } \ + return SQLITE_OK; + +int SQLiteDuckDBCacheVFS::Open(sqlite3_vfs *vfs, const char *filename, sqlite3_file *file, int flags, int *out_flags) { + // Get wrapper for error context + auto *wrapper = vfs && vfs->pAppData ? static_cast(vfs->pAppData) : nullptr; + + return SafeVFSCall(SQLITE_CANTOPEN, [&]() { + // Validate parameters and ensure read-only access + if (!vfs || !filename || !file || (flags & SQLITE_OPEN_READONLY) == 0) { + return SQLITE_CANTOPEN; + } + + // Ensure SQLite allocated enough space for our file structure + if (vfs->szOsFile < static_cast(sizeof(SQLiteDuckDBCachedFile))) { + return SQLITE_CANTOPEN; + } + + auto *duckdb_file = reinterpret_cast(file); + + // Retrieve the VFS wrapper and context from pAppData + if (!vfs->pAppData) { + return SQLITE_CANTOPEN; + } + + // wrapper is already declared in outer scope + ClientContext *context = wrapper->context; + + // Defensive check: Validate context is still valid + if (!context || !context->db) { + return SQLITE_CANTOPEN; + } + + // Initialize the structure members properly + duckdb_file->base.pMethods = &wrapper->io_methods; + duckdb_file->duckdb_file = nullptr; + + // Create the DuckDB file handle with proper exception handling + try { + duckdb_file->duckdb_file = new DuckDBCachedFile(*context, filename); + } catch (...) 
{ + // Clean up on failure + duckdb_file->base.pMethods = nullptr; + duckdb_file->duckdb_file = nullptr; + return SQLITE_CANTOPEN; + } + + // Don't validate SQLite header here - defer until first read + // to avoid triggering DuckDB operations in VFS callbacks + + if (out_flags) { + *out_flags = flags; + } + + return SQLITE_OK; + }, wrapper, filename, "xOpen"); +} + +int SQLiteDuckDBCacheVFS::Delete(sqlite3_vfs *vfs, const char *filename, int sync_dir) { + // Remote files cannot be deleted through this VFS + return SQLITE_IOERR_DELETE; +} + +int SQLiteDuckDBCacheVFS::Access(sqlite3_vfs *vfs, const char *filename, int flags, int *result) { + auto *wrapper = vfs && vfs->pAppData ? static_cast(vfs->pAppData) : nullptr; + + return SafeVFSCall(SQLITE_IOERR, [&]() { + if (!filename || !result) { + return SQLITE_IOERR; + } + + // Initialize result to safe default + *result = 0; + + // For remote files, we need to handle journal/WAL file checks properly. + // SQLite uses Access() to check for the existence of journal and WAL files + // to determine if a database might have uncommitted transactions. + + if (flags == SQLITE_ACCESS_EXISTS) { + // Check if this is a journal or WAL file by examining the suffix + const string file_path(filename); + bool is_journal = false; + bool is_wal = false; + + // Check for journal file suffixes + if (file_path.size() > 8) { + string suffix = file_path.substr(file_path.size() - 8); + if (suffix == "-journal" || suffix == "-wal") { + is_journal = (suffix == "-journal"); + is_wal = (suffix == "-wal"); + } + } + + if (is_journal || is_wal) { + // For journal/WAL files, we need to check if they actually exist + // This is critical for SQLite to properly detect hot journals + if (!vfs->pAppData) { + *result = 0; + return SQLITE_OK; + } + + auto *wrapper = static_cast(vfs->pAppData); + ClientContext *context = wrapper->context; + + if (!context || !context->db) { + *result = 0; + return SQLITE_OK; + } + + // Try to check file existence through DuckDB's filesystem + try { + auto &fs = context->db->GetFileSystem(); + *result = fs.FileExists(file_path) ? 1 : 0; + } catch (...) 
{ + // If we can't check, assume it doesn't exist + *result = 0; + } + } else { + // For the main database file, return 0 to let SQLite try to open it + // This avoids triggering DuckDB operations in the wrong context + *result = 0; + } + } else if (flags == SQLITE_ACCESS_READWRITE) { + // Remote files are always read-only + *result = 0; + } else if (flags == SQLITE_ACCESS_READ) { + // We can read remote files, but defer actual check to open + *result = 0; + } else { + // Unknown access type + *result = 0; + } + + return SQLITE_OK; + }, wrapper, filename); +} + +int SQLiteDuckDBCacheVFS::FullPathname(sqlite3_vfs *vfs, const char *filename, int out_size, char *out_buf) { + return SafeVFSCall(SQLITE_IOERR, [&]() { + if (!filename || !out_buf || out_size <= 0) { + return SQLITE_IOERR; + } + + // Remote paths are already absolute URLs - return as-is + strncpy(out_buf, filename, out_size - 1); + out_buf[out_size - 1] = '\0'; + return SQLITE_OK; + }); +} + +// These methods don't need special handling - delegate to default VFS +int SQLiteDuckDBCacheVFS::Randomness(sqlite3_vfs *vfs, int bytes, char *out) { + DELEGATE_TO_DEFAULT_VFS(xRandomness, bytes, out); +} + +int SQLiteDuckDBCacheVFS::Sleep(sqlite3_vfs *vfs, int microseconds) { + DELEGATE_TO_DEFAULT_VFS(xSleep, microseconds); +} + +int SQLiteDuckDBCacheVFS::CurrentTime(sqlite3_vfs *vfs, double *time) { + DELEGATE_TO_DEFAULT_VFS(xCurrentTime, time); +} + +// Dynamic library operations are not supported for remote files +void *SQLiteDuckDBCacheVFS::DlOpen(sqlite3_vfs *vfs, const char *filename) { + return nullptr; +} + +void SQLiteDuckDBCacheVFS::DlError(sqlite3_vfs *vfs, int bytes, char *err_msg) { + try { + if (err_msg && bytes > 0) { + strncpy(err_msg, "Dynamic loading not supported for remote files", bytes - 1); + err_msg[bytes - 1] = '\0'; + } + } catch (...) 
{ + // Best effort - if we can't even set the error message, just return + if (err_msg && bytes > 0) { + err_msg[0] = '\0'; + } + } +} + +void (*SQLiteDuckDBCacheVFS::DlSym(sqlite3_vfs *vfs, void *handle, const char *symbol))(void) { + return nullptr; +} + +void SQLiteDuckDBCacheVFS::DlClose(sqlite3_vfs *vfs, void *handle) { + // No-op - dynamic libraries not supported +} + +int SQLiteDuckDBCacheVFS::GetLastError(sqlite3_vfs *vfs, int bytes, char *err_msg) { + return SafeVFSCall(0, [&]() { + if (!vfs || !vfs->pAppData || !err_msg || bytes <= 0) { + return 0; + } + + auto *wrapper = static_cast(vfs->pAppData); + const string error = wrapper->get_last_error(); + + if (error.empty()) { + err_msg[0] = '\0'; + return 0; + } + + // Copy error message to buffer + strncpy(err_msg, error.c_str(), bytes - 1); + err_msg[bytes - 1] = '\0'; + + return static_cast(error.length()); + }); +} + +//===--------------------------------------------------------------------===// +// File Methods +//===--------------------------------------------------------------------===// + +int SQLiteDuckDBCacheVFS::Close(sqlite3_file *file) { + return SafeVFSCall(SQLITE_OK, [&]() { + if (file) { + auto *duckdb_file = reinterpret_cast(file); + // Explicitly delete the raw pointer + delete duckdb_file->duckdb_file; + duckdb_file->duckdb_file = nullptr; + } + return SQLITE_OK; + }); +} + +int SQLiteDuckDBCacheVFS::Read(sqlite3_file *file, void *buffer, int amount, sqlite3_int64 offset) { + return SafeVFSCall(SQLITE_IOERR_READ, [&]() { + if (!file || !buffer) { + return SQLITE_IOERR_READ; + } + + auto *duckdb_file = reinterpret_cast(file); + if (!duckdb_file->duckdb_file) { + return SQLITE_IOERR_READ; + } + + return duckdb_file->duckdb_file->Read(buffer, amount, offset); + }); +} + +int SQLiteDuckDBCacheVFS::FileSize(sqlite3_file *file, sqlite3_int64 *size) { + return SafeVFSCall(SQLITE_IOERR, [&]() { + if (!file || !size) { + return SQLITE_IOERR; + } + + auto *duckdb_file = reinterpret_cast(file); + if (!duckdb_file->duckdb_file) { + return SQLITE_IOERR; + } + + *size = duckdb_file->duckdb_file->get_file_size(); + return SQLITE_OK; + }); +} + +// Write operations return SQLITE_READONLY since remote files are read-only +int SQLiteDuckDBCacheVFS::Write(sqlite3_file *file, const void *buffer, int amount, sqlite3_int64 offset) { + return SQLITE_READONLY; +} + +int SQLiteDuckDBCacheVFS::Truncate(sqlite3_file *file, sqlite3_int64 size) { + return SQLITE_READONLY; +} + +int SQLiteDuckDBCacheVFS::Sync(sqlite3_file *file, int flags) { + // No-op for read-only files + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::Lock(sqlite3_file *file, int level) { + // Remote files don't need locking - they're read-only and immutable + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::Unlock(sqlite3_file *file, int level) { + // Remote files don't need locking - they're read-only and immutable + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::CheckReservedLock(sqlite3_file *file, int *result) { + if (result) { + *result = 0; + } + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::FileControl(sqlite3_file *file, int op, void *arg) { + // No special file control operations are implemented + return SQLITE_NOTFOUND; +} + +int SQLiteDuckDBCacheVFS::SectorSize(sqlite3_file *file) { + return SQLITE_SECTOR_SIZE; +} + +int SQLiteDuckDBCacheVFS::DeviceCharacteristics(sqlite3_file *file) { + // Remote files are immutable - they cannot be modified + return SQLITE_IOCAP_IMMUTABLE; +} + +} // namespace duckdb \ No newline at end of file diff --git 
a/src/sqlite_extension.cpp b/src/sqlite_extension.cpp index f85328f..92b9310 100644 --- a/src/sqlite_extension.cpp +++ b/src/sqlite_extension.cpp @@ -4,6 +4,7 @@ #include "duckdb.hpp" #include "sqlite_db.hpp" +#include "sqlite_duckdb_vfs_cache.hpp" #include "sqlite_scanner.hpp" #include "sqlite_storage.hpp" #include "sqlite_scanner_extension.hpp" @@ -11,6 +12,7 @@ #include "duckdb/catalog/catalog.hpp" #include "duckdb/main/extension_util.hpp" #include "duckdb/parser/parsed_data/create_table_function_info.hpp" +#include "duckdb/planner/extension_callback.hpp" using namespace duckdb; @@ -20,15 +22,32 @@ static void SetSqliteDebugQueryPrint(ClientContext &context, SetScope scope, Val SQLiteDB::DebugSetPrintQueries(BooleanValue::Get(parameter)); } +// Cleanup callback for VFS when connection is closed +class SQLiteVFSCleanupCallback : public ExtensionCallback { +public: + void OnConnectionClosed(ClientContext &context) override { + // Unregister the VFS for this context if it was registered + SQLiteDuckDBCacheVFS::Unregister(context); + } +}; + static void LoadInternal(DatabaseInstance &db) { - SqliteScanFunction sqlite_fun; - ExtensionUtil::RegisterFunction(db, sqlite_fun); + // Create function instances inline like built-in functions do + // This avoids any static storage issues + { + SqliteScanFunction sqlite_scan; + ExtensionUtil::RegisterFunction(db, sqlite_scan); + } - SqliteAttachFunction attach_func; - ExtensionUtil::RegisterFunction(db, attach_func); + { + SqliteAttachFunction sqlite_attach; + ExtensionUtil::RegisterFunction(db, sqlite_attach); + } - SQLiteQueryFunction query_func; - ExtensionUtil::RegisterFunction(db, query_func); + { + SQLiteQueryFunction sqlite_query; + ExtensionUtil::RegisterFunction(db, sqlite_query); + } auto &config = DBConfig::GetConfig(db); config.AddExtensionOption("sqlite_all_varchar", "Load all SQLite columns as VARCHAR columns", LogicalType::BOOLEAN); @@ -36,7 +55,14 @@ static void LoadInternal(DatabaseInstance &db) { config.AddExtensionOption("sqlite_debug_show_queries", "DEBUG SETTING: print all queries sent to SQLite to stdout", LogicalType::BOOLEAN, Value::BOOLEAN(false), SetSqliteDebugQueryPrint); - config.storage_extensions["sqlite_scanner"] = make_uniq(); + if (config.storage_extensions.find("sqlite_scanner") == config.storage_extensions.end()) { + config.storage_extensions["sqlite_scanner"] = make_uniq(); + } + + // Register cleanup callback for VFS + config.extension_callbacks.push_back(make_uniq()); + + // HTTP SQLite support is handled entirely by VFS through DuckDB's CachingFileSystem } void SqliteScannerExtension::Load(DuckDB &db) { @@ -52,6 +78,8 @@ DUCKDB_EXTENSION_API const char *sqlite_scanner_version() { } DUCKDB_EXTENSION_API void sqlite_scanner_storage_init(DBConfig &config) { - config.storage_extensions["sqlite_scanner"] = make_uniq(); + if (config.storage_extensions.find("sqlite_scanner") == config.storage_extensions.end()) { + config.storage_extensions["sqlite_scanner"] = make_uniq(); + } } } diff --git a/src/sqlite_scanner.cpp b/src/sqlite_scanner.cpp index eb8b5ff..c5e9708 100644 --- a/src/sqlite_scanner.cpp +++ b/src/sqlite_scanner.cpp @@ -55,7 +55,7 @@ static unique_ptr SqliteBind(ClientContext &context, TableFunction SQLiteStatement stmt; SQLiteOpenOptions options; options.access_mode = AccessMode::READ_ONLY; - db = SQLiteDB::Open(result->file_name, options); + db = SQLiteDB::Open(result->file_name, options, context); ColumnList columns; vector> constraints; @@ -72,7 +72,7 @@ static unique_ptr SqliteBind(ClientContext &context, 
TableFunction } if (names.empty()) { - throw std::runtime_error("no columns for table " + result->table_name); + throw BinderException("Table \"%s\" has no columns", result->table_name); } if (!db.GetRowIdInfo(result->table_name, result->row_id_info)) { @@ -81,6 +81,7 @@ static unique_ptr SqliteBind(ClientContext &context, TableFunction result->names = names; result->types = return_types; + result->global_db = nullptr; return std::move(result); } @@ -96,7 +97,7 @@ static void SqliteInitInternal(ClientContext &context, const SqliteBindData &bin if (!local_state.db) { SQLiteOpenOptions options; options.access_mode = AccessMode::READ_ONLY; - local_state.owned_db = SQLiteDB::Open(bind_data.file_name.c_str(), options); + local_state.owned_db = SQLiteDB::Open(bind_data.file_name.c_str(), options, context); local_state.db = &local_state.owned_db; } string sql; @@ -313,7 +314,7 @@ static void SqliteScan(ClientContext &context, TableFunctionInput &data, DataChu out_vec, (const char *)sqlite3_value_blob(val), sqlite3_value_bytes(val)); break; default: - throw std::runtime_error(out_vec.GetType().ToString()); + throw InternalException("Unsupported type \"%s\" for SQLite value conversion", out_vec.GetType().ToString()); } } out_idx++; @@ -365,6 +366,23 @@ struct AttachFunctionData : public TableFunctionData { bool finished = false; bool overwrite = false; string file_name = ""; + + // Override virtual methods from FunctionData + unique_ptr Copy() const override { + auto result = make_uniq(); + result->finished = finished; + result->overwrite = overwrite; + result->file_name = file_name; + result->column_ids = column_ids; + return std::move(result); + } + + bool Equals(const FunctionData &other) const override { + auto &other_attach = other.Cast(); + return finished == other_attach.finished && + overwrite == other_attach.overwrite && + file_name == other_attach.file_name; + } }; static unique_ptr AttachBind(ClientContext &context, TableFunctionBindInput &input, @@ -392,7 +410,7 @@ static void AttachFunction(ClientContext &context, TableFunctionInput &data_p, D SQLiteOpenOptions options; options.access_mode = AccessMode::READ_ONLY; - SQLiteDB db = SQLiteDB::Open(data.file_name, options); + SQLiteDB db = SQLiteDB::Open(data.file_name, options, context); auto dconn = Connection(context.db->GetDatabase(context)); { auto tables = db.GetTables(); diff --git a/src/sqlite_storage.cpp b/src/sqlite_storage.cpp index 09d212d..46150c9 100644 --- a/src/sqlite_storage.cpp +++ b/src/sqlite_storage.cpp @@ -9,6 +9,7 @@ #include "duckdb/transaction/transaction_manager.hpp" #include "duckdb/catalog/catalog_entry/schema_catalog_entry.hpp" #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" +#include "duckdb/common/limits.hpp" namespace duckdb { @@ -19,11 +20,16 @@ static unique_ptr SQLiteAttach(StorageExtensionInfo *storage_info, Clie options.access_mode = access_mode; for (auto &entry : info.options) { if (StringUtil::CIEquals(entry.first, "busy_timeout")) { - options.busy_timeout = entry.second.GetValue(); + uint64_t timeout_value = entry.second.GetValue(); + if (timeout_value > NumericLimits::Maximum()) { + throw InvalidInputException("busy_timeout out of range - must be within valid range for type int"); + } + options.busy_timeout = timeout_value; } else if (StringUtil::CIEquals(entry.first, "journal_mode")) { options.journal_mode = entry.second.ToString(); } } + return make_uniq(db, info.path, std::move(options)); } diff --git a/src/storage/sqlite_catalog.cpp b/src/storage/sqlite_catalog.cpp index 
a228510..055a94f 100644 --- a/src/storage/sqlite_catalog.cpp +++ b/src/storage/sqlite_catalog.cpp @@ -5,14 +5,16 @@ #include "storage/sqlite_schema_entry.hpp" #include "storage/sqlite_transaction.hpp" #include "duckdb/common/exception/transaction_exception.hpp" +#include "duckdb/common/limits.hpp" namespace duckdb { SQLiteCatalog::SQLiteCatalog(AttachedDatabase &db_p, const string &path, SQLiteOpenOptions options_p) - : Catalog(db_p), path(path), options(std::move(options_p)), in_memory(path == ":memory:"), active_in_memory(false) { - if (InMemory()) { - in_memory_db = SQLiteDB::Open(path, options, true); + : Catalog(db_p), path(path), options(std::move(options_p)), in_memory(path == ":memory:"), active_in_memory(false), in_memory_db_initialized(false) { + if (options.busy_timeout > 0 && options.busy_timeout > NumericLimits::Maximum()) { + throw BinderException("busy_timeout out of range - must be within valid range for type int"); } + // In-memory database is now opened lazily in GetInMemoryDatabase to support deferred initialization } SQLiteCatalog::~SQLiteCatalog() { @@ -52,11 +54,18 @@ string SQLiteCatalog::GetDBPath() { return path; } -SQLiteDB *SQLiteCatalog::GetInMemoryDatabase() { +SQLiteDB *SQLiteCatalog::GetInMemoryDatabase(ClientContext &context) { if (!InMemory()) { throw InternalException("GetInMemoryDatabase() called on a non-in-memory database"); } lock_guard l(in_memory_lock); + + // Initialize the in-memory database on first access + if (!in_memory_db_initialized) { + in_memory_db = SQLiteDB::Open(path, options, context, true); + in_memory_db_initialized = true; + } + if (active_in_memory) { throw TransactionException("Only a single transaction can be active on an " "in-memory SQLite database at a time"); diff --git a/src/storage/sqlite_schema_entry.cpp b/src/storage/sqlite_schema_entry.cpp index 9657797..e1ee0f4 100644 --- a/src/storage/sqlite_schema_entry.cpp +++ b/src/storage/sqlite_schema_entry.cpp @@ -12,15 +12,20 @@ #include "duckdb/parser/parsed_data/alter_info.hpp" #include "duckdb/parser/parsed_data/alter_table_info.hpp" #include "duckdb/parser/parsed_expression_iterator.hpp" +#include "duckdb/transaction/transaction.hpp" namespace duckdb { SQLiteSchemaEntry::SQLiteSchemaEntry(Catalog &catalog, CreateSchemaInfo &info) : SchemaCatalogEntry(catalog, info) { } -SQLiteTransaction &GetSQLiteTransaction(CatalogTransaction transaction) { +SQLiteTransaction &GetSQLiteTransaction(CatalogTransaction transaction, Catalog &catalog) { if (!transaction.transaction) { - throw InternalException("No transaction!?"); + // This should not happen in normal operation - transactions should be initialized before use. + // Creating one here prevents deadlocks but may indicate missing transaction initialization. 
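+		// Note: D_ASSERT is a no-op in release builds, so release binaries will not abort here
+		// and will silently take the Transaction::Get fallback below.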
+ D_ASSERT(false && "Transaction should have been initialized before reaching this point"); + auto &new_transaction = Transaction::Get(transaction.GetContext(), catalog); + return new_transaction.Cast(); } return transaction.transaction->Cast(); } @@ -52,7 +57,7 @@ void SQLiteSchemaEntry::TryDropEntry(ClientContext &context, CatalogType catalog } optional_ptr SQLiteSchemaEntry::CreateTable(CatalogTransaction transaction, BoundCreateTableInfo &info) { - auto &sqlite_transaction = GetSQLiteTransaction(transaction); + auto &sqlite_transaction = GetSQLiteTransaction(transaction, catalog); auto &base_info = info.Base(); auto table_name = base_info.table; if (base_info.on_conflict == OnCreateConflict::REPLACE_ON_CONFLICT) { @@ -143,7 +148,7 @@ optional_ptr SQLiteSchemaEntry::CreateView(CatalogTransaction tran // CREATE OR REPLACE - drop any existing entries first (if any) TryDropEntry(transaction.GetContext(), CatalogType::VIEW_ENTRY, info.view_name); } - auto &sqlite_transaction = GetSQLiteTransaction(transaction); + auto &sqlite_transaction = GetSQLiteTransaction(transaction, catalog); sqlite_transaction.GetDB().Execute(GetCreateViewSQL(info)); return GetEntry(transaction, CatalogType::VIEW_ENTRY, info.view_name); } @@ -298,7 +303,7 @@ void SQLiteSchemaEntry::DropEntry(ClientContext &context, DropInfo &info) { optional_ptr SQLiteSchemaEntry::LookupEntry(CatalogTransaction transaction, const EntryLookupInfo &lookup_info) { - auto &sqlite_transaction = GetSQLiteTransaction(transaction); + auto &sqlite_transaction = GetSQLiteTransaction(transaction, catalog); switch (lookup_info.GetCatalogType()) { case CatalogType::INDEX_ENTRY: case CatalogType::TABLE_ENTRY: diff --git a/src/storage/sqlite_table_entry.cpp b/src/storage/sqlite_table_entry.cpp index fd17c2d..365bb36 100644 --- a/src/storage/sqlite_table_entry.cpp +++ b/src/storage/sqlite_table_entry.cpp @@ -47,7 +47,10 @@ TableFunction SQLiteTableEntry::GetScanFunction(ClientContext &context, unique_p result->table = this; bind_data = std::move(result); - return static_cast(SqliteScanFunction()); + // Use a function-local static to ensure thread-safe initialization + // and avoid static initialization order issues (especially on Windows DLLs) + static SqliteScanFunction scan_function; + return static_cast(scan_function); } TableStorageInfo SQLiteTableEntry::GetStorageInfo(ClientContext &context) { diff --git a/src/storage/sqlite_transaction.cpp b/src/storage/sqlite_transaction.cpp index 4dc0296..a6c17a3 100644 --- a/src/storage/sqlite_transaction.cpp +++ b/src/storage/sqlite_transaction.cpp @@ -15,16 +15,42 @@ namespace duckdb { -SQLiteTransaction::SQLiteTransaction(SQLiteCatalog &sqlite_catalog, TransactionManager &manager, ClientContext &context) - : Transaction(manager, context), sqlite_catalog(sqlite_catalog) { - if (sqlite_catalog.InMemory()) { - // in-memory database - get a reference to the in-memory connection - db = sqlite_catalog.GetInMemoryDatabase(); - } else { - // on-disk database - open a new database connection - owned_db = SQLiteDB::Open(sqlite_catalog.path, sqlite_catalog.options, true); - db = &owned_db; +//===--------------------------------------------------------------------===// +// Per-Database Mutex Registry +//===--------------------------------------------------------------------===// +// Instead of a global mutex, we use per-database mutexes to reduce contention. +// This allows concurrent access to different databases while maintaining +// thread safety for each individual database. 
+//===--------------------------------------------------------------------===// + +struct DatabaseMutexRegistry { + mutex registry_mutex; // Protects the registry itself + unordered_map> database_mutexes; + + mutex& GetDatabaseMutex(const string &path) { + lock_guard lock(registry_mutex); + auto it = database_mutexes.find(path); + if (it == database_mutexes.end()) { + database_mutexes[path] = make_uniq(); + return *database_mutexes[path]; + } + return *it->second; } +}; + +static DatabaseMutexRegistry& GetMutexRegistry() { + // Function-local static ensures thread-safe initialization + static DatabaseMutexRegistry registry; + return registry; +} + +SQLiteTransaction::SQLiteTransaction(SQLiteCatalog &sqlite_catalog, TransactionManager &manager, ClientContext &context) + : Transaction(manager, context), sqlite_catalog(sqlite_catalog), db(nullptr) { + + // Database connection and transaction start are deferred to prevent potential deadlocks. + // Opening SQLite connections for remote files can trigger HTTP requests and caching + // operations while the MetaTransaction lock is held, which could cause deadlocks. + // Both connection and transaction are initialized lazily in GetDB() when first needed. } SQLiteTransaction::~SQLiteTransaction() { @@ -32,16 +58,53 @@ SQLiteTransaction::~SQLiteTransaction() { } void SQLiteTransaction::Start() { - db->Execute("BEGIN TRANSACTION"); + + if (!started.load(std::memory_order_acquire)) { + GetDB(); // This will handle both connection and transaction start + } } void SQLiteTransaction::Commit() { - db->Execute("COMMIT"); + GetDB().Execute("COMMIT"); } void SQLiteTransaction::Rollback() { - db->Execute("ROLLBACK"); + GetDB().Execute("ROLLBACK"); } SQLiteDB &SQLiteTransaction::GetDB() { + // Fast path: check if already initialized (with memory ordering) + if (db_initialized.load(std::memory_order_acquire)) { + return *db; + } + + // Slow path: need to initialize + // Get per-database mutex to reduce contention + auto &database_mutex = GetMutexRegistry().GetDatabaseMutex(sqlite_catalog.path); + lock_guard lock(database_mutex); + + // Check again after acquiring lock (double-checked locking with proper atomics) + if (!db_initialized.load(std::memory_order_relaxed)) { + // Initialize database connection + if (!db) { + if (sqlite_catalog.InMemory()) { + // in-memory database - get a reference to the in-memory connection + db = sqlite_catalog.GetInMemoryDatabase(*context.lock()); + } else { + // on-disk/remote database - open a new database connection + owned_db = SQLiteDB::Open(sqlite_catalog.path, sqlite_catalog.options, *context.lock(), true); + db = &owned_db; + } + } + + // Start transaction if not already started + if (!started.load(std::memory_order_relaxed)) { + db->Execute("BEGIN TRANSACTION"); + started.store(true, std::memory_order_relaxed); + } + + // Mark as initialized with release semantics to ensure all writes are visible + db_initialized.store(true, std::memory_order_release); + } + return *db; } @@ -111,7 +174,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr return entry->second.get(); } // catalog entry not found - look up table in main SQLite database - auto type = db->GetEntryType(entry_name); + auto type = GetDB().GetEntryType(entry_name); if (type == CatalogType::INVALID) { // no table or view found return nullptr; @@ -125,7 +188,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr if (context.lock()->TryGetCurrentSetting("sqlite_all_varchar", sqlite_all_varchar)) { all_varchar = 
BooleanValue::Get(sqlite_all_varchar); } - db->GetTableInfo(entry_name, info.columns, info.constraints, all_varchar); + GetDB().GetTableInfo(entry_name, info.columns, info.constraints, all_varchar); D_ASSERT(!info.columns.empty()); result = make_uniq(sqlite_catalog, sqlite_catalog.GetMainSchema(), info, all_varchar); @@ -133,7 +196,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr } case CatalogType::VIEW_ENTRY: { string sql; - db->GetViewInfo(entry_name, sql); + GetDB().GetViewInfo(entry_name, sql); unique_ptr view_info; try { @@ -153,7 +216,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr case CatalogType::INDEX_ENTRY: { string table_name; string sql; - db->GetIndexInfo(entry_name, sql, table_name); + GetDB().GetIndexInfo(entry_name, sql, table_name); if (sql.empty()) { throw InternalException("SQL is empty"); } @@ -200,7 +263,7 @@ string GetDropSQL(CatalogType type, const string &table_name, bool cascade) { void SQLiteTransaction::DropEntry(CatalogType type, const string &table_name, bool cascade) { catalog_entries.erase(table_name); - db->Execute(GetDropSQL(type, table_name, cascade)); + GetDB().Execute(GetDropSQL(type, table_name, cascade)); } } // namespace duckdb diff --git a/src/storage/sqlite_transaction_manager.cpp b/src/storage/sqlite_transaction_manager.cpp index d800794..c9598c7 100644 --- a/src/storage/sqlite_transaction_manager.cpp +++ b/src/storage/sqlite_transaction_manager.cpp @@ -3,15 +3,24 @@ namespace duckdb { +// Function-local static mutex to avoid Windows DLL initialization issues +mutex& SQLiteTransactionManager::GetTransactionLock() { + static mutex transaction_lock; + return transaction_lock; +} + SQLiteTransactionManager::SQLiteTransactionManager(AttachedDatabase &db_p, SQLiteCatalog &sqlite_catalog) : TransactionManager(db_p), sqlite_catalog(sqlite_catalog) { } Transaction &SQLiteTransactionManager::StartTransaction(ClientContext &context) { auto transaction = make_uniq(sqlite_catalog, *this, context); - transaction->Start(); + // Defer transaction start until first use to avoid potential deadlocks. + // Starting here would trigger DB connection initialization which can block + // for remote files while the MetaTransaction lock is held. 
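The sqlite_transaction.cpp changes above defer both the connection and the BEGIN TRANSACTION to the first GetDB() call, guarded by an atomic flag and a per-database mutex (double-checked locking). Reduced to its essentials, with the DuckDB-specific types replaced by stand-ins, the pattern is roughly the following sketch; the real code additionally tracks a separate started flag and looks the mutex up in a path-keyed registry rather than owning it.

    #include <atomic>
    #include <mutex>

    // Stand-in for the lazily opened SQLite connection.
    struct Connection { /* ... */ };

    class LazyTransaction {
    public:
        Connection &GetDB() {
            // Fast path: already initialized; the acquire load pairs with the
            // release store below so no lock is needed here.
            if (initialized.load(std::memory_order_acquire)) {
                return conn;
            }
            std::lock_guard<std::mutex> lock(init_mutex);
            // Re-check under the lock so exactly one thread opens the connection
            // and issues BEGIN TRANSACTION.
            if (!initialized.load(std::memory_order_relaxed)) {
                OpenConnectionAndBegin(); // open (possibly remote) DB, then BEGIN
                initialized.store(true, std::memory_order_release);
            }
            return conn;
        }

    private:
        void OpenConnectionAndBegin() { /* SQLiteDB::Open(...) + "BEGIN TRANSACTION" */ }

        std::mutex init_mutex;                // per-database mutex in the real change
        std::atomic<bool> initialized{false}; // set last, with release semantics
        Connection conn;
    };

The acquire load on the fast path pairs with the release store after initialization, so threads that skip the lock still observe a fully constructed connection and an already-begun transaction.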
+ // The transaction will be started lazily in SQLiteTransaction::GetDB() auto &result = *transaction; - lock_guard l(transaction_lock); + lock_guard l(GetTransactionLock()); transactions[result] = std::move(transaction); return result; } @@ -19,7 +28,7 @@ Transaction &SQLiteTransactionManager::StartTransaction(ClientContext &context) ErrorData SQLiteTransactionManager::CommitTransaction(ClientContext &context, Transaction &transaction) { auto &sqlite_transaction = transaction.Cast(); sqlite_transaction.Commit(); - lock_guard l(transaction_lock); + lock_guard l(GetTransactionLock()); transactions.erase(transaction); return ErrorData(); } @@ -27,7 +36,7 @@ ErrorData SQLiteTransactionManager::CommitTransaction(ClientContext &context, Tr void SQLiteTransactionManager::RollbackTransaction(Transaction &transaction) { auto &sqlite_transaction = transaction.Cast(); sqlite_transaction.Rollback(); - lock_guard l(transaction_lock); + lock_guard l(GetTransactionLock()); transactions.erase(transaction); } diff --git a/test/sql/scanner/http_sqlite_00_vfs_registration.test b/test/sql/scanner/http_sqlite_00_vfs_registration.test new file mode 100644 index 0000000..fd85a46 --- /dev/null +++ b/test/sql/scanner/http_sqlite_00_vfs_registration.test @@ -0,0 +1,52 @@ +# name: test/sql/scanner/http_sqlite_00_vfs_registration.test +# description: Test HTTP VFS registration and URL detection for SQLite scanner +# group: [sqlite_scanner] + +require sqlite_scanner + +# Test 1: Verify SQLite scanner extension is available +statement ok +SELECT 'SQLite scanner extension loaded'; + +# Test 2: HTTP URLs should fail appropriately without httpfs loaded +statement error +SELECT * FROM sqlite_scan('http://example.com/test.db', 'test'); +---- +unable to open database file + +# Test 3: HTTPS URLs should fail appropriately without httpfs loaded +statement error +SELECT * FROM sqlite_scan('https://example.com/test.db', 'test'); +---- +unable to open database file + +# Test 4: S3 URLs should fail appropriately without httpfs loaded +statement error +SELECT * FROM sqlite_scan('s3://bucket/test.db', 'test'); +---- +unable to open database file + +# Test 5: Load httpfs to register HTTP VFS +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 6: After httpfs loads, HTTP URLs should be recognized but fail with network error (not VFS error) +statement error +SELECT * FROM sqlite_scan('http://nonexistent-domain-12345.com/test.db', 'test'); +---- +unable to open database file + +# Test 7: HTTPS URLs should also be recognized after httpfs loads +statement error +SELECT * FROM sqlite_scan('https://nonexistent-domain-12345.com/test.db', 'test'); +---- +unable to open database file + +# Test 8: Regular file paths should still work as before +statement error +SELECT * FROM sqlite_scan('nonexistent.db', 'test'); +---- +unable to open database file \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_01_basic_scan.test b/test/sql/scanner/http_sqlite_01_basic_scan.test new file mode 100644 index 0000000..089f647 --- /dev/null +++ b/test/sql/scanner/http_sqlite_01_basic_scan.test @@ -0,0 +1,28 @@ +# name: test/sql/scanner/http_sqlite_01_basic_scan.test +# description: Basic HTTP SQLite scan functionality +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 1: Basic remote SQLite query +query I +SELECT COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 
'Artist'); +---- +275 + +# Test 2: Query with WHERE clause and ORDER BY +query IT +SELECT ArtistId, Name FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Artist') +WHERE Name LIKE 'A%' +ORDER BY Name +LIMIT 3; +---- +43 A Cor Do Som +1 AC/DC +230 Aaron Copland & London Symphony Orchestra \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_02_concurrent_scans.test b/test/sql/scanner/http_sqlite_02_concurrent_scans.test new file mode 100644 index 0000000..ba19847 --- /dev/null +++ b/test/sql/scanner/http_sqlite_02_concurrent_scans.test @@ -0,0 +1,20 @@ +# name: test/sql/scanner/http_sqlite_02_concurrent_scans.test +# description: Multiple concurrent scans of remote SQLite database +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 3: Multiple concurrent scans +query III +SELECT + (SELECT COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Artist')) as artists, + (SELECT COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Album')) as albums, + (SELECT COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track')) as tracks; +---- +275 347 3503 \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_03_joins.test b/test/sql/scanner/http_sqlite_03_joins.test new file mode 100644 index 0000000..5ecb866 --- /dev/null +++ b/test/sql/scanner/http_sqlite_03_joins.test @@ -0,0 +1,25 @@ +# name: test/sql/scanner/http_sqlite_03_joins.test +# description: Joins between remote SQLite tables +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 4: Join between remote tables +query TI +SELECT ar.Name as Artist, COUNT(*) as AlbumCount +FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Artist') ar +JOIN sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Album') al +ON ar.ArtistId = al.ArtistId +GROUP BY ar.Name +ORDER BY AlbumCount DESC, ar.Name +LIMIT 3; +---- +Iron Maiden 21 +Led Zeppelin 14 +Deep Purple 11 \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_04_attach.test b/test/sql/scanner/http_sqlite_04_attach.test new file mode 100644 index 0000000..1eb8850 --- /dev/null +++ b/test/sql/scanner/http_sqlite_04_attach.test @@ -0,0 +1,24 @@ +# name: test/sql/scanner/http_sqlite_04_attach.test +# description: ATTACH remote SQLite database +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 5: Attach remote database +statement ok +ATTACH 'https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite' AS http_db (TYPE sqlite); + +# Test 6: Query attached database +query I +SELECT COUNT(*) FROM http_db.Track; +---- +3503 + +statement ok +DETACH http_db; \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_05_complex_queries.test b/test/sql/scanner/http_sqlite_05_complex_queries.test new file mode 100644 index 0000000..d08a4f8 --- /dev/null +++ b/test/sql/scanner/http_sqlite_05_complex_queries.test @@ -0,0 +1,49 @@ +# name: 
test/sql/scanner/http_sqlite_05_complex_queries.test +# description: Complex queries on attached remote SQLite database +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# First attach the database +statement ok +ATTACH 'https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite' AS http_db (TYPE sqlite); + +# Test 7: Complex aggregation on attached database +query TIR +SELECT g.Name as Genre, COUNT(*) as TrackCount, ROUND(AVG(t.Milliseconds/1000.0), 2) as AvgSeconds +FROM http_db.Track t +JOIN http_db.Genre g ON t.GenreId = g.GenreId +GROUP BY g.Name +ORDER BY TrackCount DESC +LIMIT 3; +---- +Rock 1297 283.91 +Latin 579 232.86 +Metal 374 309.75 + +# Test 8: Window functions on attached database +query TTII +SELECT + ar.Name as Artist, + al.Title as Album, + COUNT(t.TrackId) as TrackCount, + ROW_NUMBER() OVER (ORDER BY COUNT(t.TrackId) DESC, ar.Name, al.Title) as AlbumRank +FROM http_db.Artist ar +JOIN http_db.Album al ON ar.ArtistId = al.ArtistId +JOIN http_db.Track t ON al.AlbumId = t.AlbumId +GROUP BY ar.Name, al.Title +ORDER BY TrackCount DESC, ar.Name, al.Title +LIMIT 3; +---- +Lenny Kravitz Greatest Hits 57 1 +Chico Buarque Minha Historia 34 2 +Eric Clapton Unplugged 30 3 + +statement ok +DETACH http_db; \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_06_cte.test b/test/sql/scanner/http_sqlite_06_cte.test new file mode 100644 index 0000000..2852aed --- /dev/null +++ b/test/sql/scanner/http_sqlite_06_cte.test @@ -0,0 +1,27 @@ +# name: test/sql/scanner/http_sqlite_06_cte.test +# description: CTE with remote SQLite tables +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 9: CTE with remote tables +query TI +WITH top_genres AS ( + SELECT g.GenreId, g.Name, COUNT(*) as track_count + FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Genre') g + JOIN sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track') t ON g.GenreId = t.GenreId + GROUP BY g.GenreId, g.Name + ORDER BY track_count DESC + LIMIT 3 +) +SELECT Name, track_count FROM top_genres ORDER BY Name; +---- +Latin 579 +Metal 374 +Rock 1297 \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_07_consistency.test b/test/sql/scanner/http_sqlite_07_consistency.test new file mode 100644 index 0000000..aa02b12 --- /dev/null +++ b/test/sql/scanner/http_sqlite_07_consistency.test @@ -0,0 +1,29 @@ +# name: test/sql/scanner/http_sqlite_07_consistency.test +# description: Consistency check for multiple scans +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 10: Consistency check - multiple scans return same results +query IIIT +WITH counts AS ( + SELECT 'scan1' as scan_id, COUNT(*) as count FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track') + UNION ALL + SELECT 'scan2', COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track') + UNION ALL + SELECT 'scan3', COUNT(*) FROM sqlite_scan('https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite', 'Track') +) +SELECT + COUNT(*) as scan_count, + 
MIN(count) as min_count, + MAX(count) as max_count, + CASE WHEN MIN(count) = MAX(count) THEN 'CONSISTENT' ELSE 'INCONSISTENT' END as consistency +FROM counts; +---- +3 3503 3503 CONSISTENT \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_08_error_handling.test b/test/sql/scanner/http_sqlite_08_error_handling.test new file mode 100644 index 0000000..be61b98 --- /dev/null +++ b/test/sql/scanner/http_sqlite_08_error_handling.test @@ -0,0 +1,29 @@ +# name: test/sql/scanner/http_sqlite_08_error_handling.test +# description: Error handling for remote SQLite databases +# group: [sqlite_scanner] + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test: Invalid domain (network error) +statement error +SELECT * FROM sqlite_scan('https://this-domain-does-not-exist-12345.invalid/test.db', 'test'); +---- +unable to open database file + +# Test: Valid file but not SQLite format +statement error +SELECT * FROM sqlite_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/README.md', 'test'); +---- +file is not a database + +# Test: Non-existent file on valid domain (404) +statement error +SELECT * FROM sqlite_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/definitely_nonexistent.db', 'test'); +---- +unable to open database file \ No newline at end of file diff --git a/test/sql/scanner/http_sqlite_09_http_errors.test b/test/sql/scanner/http_sqlite_09_http_errors.test new file mode 100644 index 0000000..df20d23 --- /dev/null +++ b/test/sql/scanner/http_sqlite_09_http_errors.test @@ -0,0 +1,47 @@ +# name: test/sql/scanner/http_sqlite_09_http_errors.test +# description: Test HTTP error code mapping to SQLite errors +# group: [sqlite_scanner] + +# Note: httpbin.org tests don't check specific error messages because: +# 1. httpbin.org may be rate-limited or down, causing connection errors +# 2. Different platforms may format HTTP errors differently +# 3. We still verify that these URLs produce errors, just not the exact message + +require sqlite_scanner + +statement ok +INSTALL httpfs; + +statement ok +LOAD httpfs; + +# Test 404 error - should map to "unable to open database file" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/404', 'test'); +---- + +# Test 403 error - should map to "access permission denied" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/403', 'test'); +---- + +# Test 401 error - should also map to "access permission denied" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/401', 'test'); +---- + +# Test 500 error - should map to "unable to open database file" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/500', 'test'); +---- + +# Test 429 error - should map to "database is locked" (but may fail to connect) +statement error +SELECT * FROM sqlite_scan('https://httpbin.org/status/429', 'test'); +---- + +# Test with non-existent GitHub file (real-world 404 scenario) +statement error +SELECT * FROM sqlite_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/nonexistent_file_404.db', 'test'); +---- +unable to open database file \ No newline at end of file
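The http_sqlite_08 and http_sqlite_09 tests above pin specific SQLite error strings to remote failures: 404 and 5xx map to "unable to open database file", 401 and 403 to "access permission denied", 429 to "database is locked", and non-SQLite content to "file is not a database". The mapping itself lives in the VFS code that is not shown in this part of the diff; the sketch below is only one plausible way to express it, using the standard SQLite result codes whose default messages match those strings.

    #include "sqlite3.h"

    // Hypothetical mapping from an HTTP status code to the SQLite result code
    // whose default error string matches the expectations in the tests above.
    // The actual implementation in sqlite_duckdb_vfs_cache.cpp may differ.
    static int MapHttpStatusToSqliteError(int http_status) {
        switch (http_status) {
        case 401:
        case 403:
            return SQLITE_PERM;     // "access permission denied"
        case 429:
            return SQLITE_BUSY;     // "database is locked"
        case 404:
        case 500:
        default:
            return SQLITE_CANTOPEN; // "unable to open database file"
        }
    }

The "file is not a database" case needs no mapping: once the bytes are fetched, SQLite's own header check fails and returns SQLITE_NOTADB, which produces that message.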