diff --git a/Cargo.lock b/Cargo.lock index f40aa718..bc91cb76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "RustyXML" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" + [[package]] name = "addr2line" version = "0.24.2" @@ -121,6 +127,28 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d902e3d592a523def97af8f317b08ce16b7ab854c1985a0c671e6f15cebc236" +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener 2.5.3", + "futures-core", +] + +[[package]] +name = "async-lock" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18" +dependencies = [ + "event-listener 5.4.0", + "event-listener-strategy", + "pin-project-lite", +] + [[package]] name = "async-openai" version = "0.28.3" @@ -231,7 +259,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "hex", "http 1.3.1", "ring", @@ -293,7 +321,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http-body 0.4.6", "percent-encoding", @@ -322,7 +350,7 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "hex", "hmac", "http 0.2.12", @@ -352,7 +380,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -374,7 +402,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -396,7 +424,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -419,7 +447,7 @@ dependencies = [ "aws-smithy-types", "aws-smithy-xml", "aws-types", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -586,7 +614,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http 1.3.1", "http-body 0.4.6", @@ -789,6 +817,92 @@ dependencies = [ "tower-service", ] +[[package]] +name = "azure_core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b552ad43a45a746461ec3d3a51dfb6466b4759209414b439c165eb6a6b7729e" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "dyn-clone", + "futures", + "getrandom 0.2.16", + "hmac", + "http-types", + "once_cell", + "paste", + "pin-project", + "quick-xml", + "rand 0.8.5", + "reqwest", + "rustc_version", + "serde", + "serde_json", + "sha2", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_storage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f838159f4d29cb400a14d9d757578ba495ae64feb07a7516bf9e4415127126" +dependencies = [ + "RustyXML", + "async-lock", + "async-trait", + "azure_core", + "bytes", + "serde", + "serde_derive", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_storage_blobs" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97e83c3636ae86d9a6a7962b2112e3b19eb3903915c50ce06ff54ff0a2e6a7e4" +dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "azure_svc_blobstorage", + "bytes", + "futures", + "serde", + "serde_derive", + "serde_json", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_svc_blobstorage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e6c6f20c5611b885ba94c7bae5e02849a267381aecb8aee577e8c35ff4064c6" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + "once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backoff" version = "0.4.0" @@ -824,6 +938,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -1056,6 +1176,9 @@ dependencies = [ "aws-sdk-sqs", "axum 0.8.4", "axum-extra", + "azure_core", + "azure_storage", + "azure_storage_blobs", "base64 0.22.1", "blake2", "bytes", @@ -1074,7 +1197,7 @@ dependencies = [ "indenter", "indexmap 2.10.0", "indoc", - "infer", + "infer 0.19.0", "itertools 0.14.0", "json5", "log", @@ -1096,6 +1219,7 @@ dependencies = [ "serde_json", "serde_with", "sqlx", + "time", "tokio", "tokio-stream", "tower 0.5.2", @@ -1635,6 +1759,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "event-listener" version = "5.4.0" @@ -1646,6 +1776,16 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener 5.4.0", + "pin-project-lite", +] + [[package]] name = "eventsource-stream" version = "0.2.3" @@ -1657,6 +1797,15 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -1706,6 +1855,21 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1780,6 +1944,21 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + [[package]] name = "futures-macro" version = "0.3.31" @@ -1837,6 +2016,17 @@ dependencies = [ "version_check", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -2127,6 +2317,26 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel", + "base64 0.13.1", + "futures-lite", + "infer 0.2.3", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.10.1" @@ -2232,6 +2442,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.6.0", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.15" @@ -2427,6 +2653,12 @@ version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "infer" version = "0.19.0" @@ -2619,6 +2851,12 @@ version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" +[[package]] +name = "linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + [[package]] name = "litemap" version = "0.8.0" @@ -2745,6 +2983,23 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -2926,12 +3181,50 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +[[package]] +name = "openssl" +version = "0.10.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "openssl-probe" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" +[[package]] +name = "openssl-sys" +version = "0.9.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "ordered-multimap" version = "0.7.3" @@ -3402,6 +3695,16 @@ dependencies = [ "tonic", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quinn" version = "0.11.8" @@ -3472,6 +3775,19 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -3493,6 +3809,16 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -3513,6 +3839,15 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -3531,6 +3866,15 @@ dependencies = [ "getrandom 0.3.3", ] +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rawpointer" version = "0.2.1" @@ -3617,10 +3961,12 @@ dependencies = [ "http-body-util", "hyper 1.6.0", "hyper-rustls 0.27.7", + "hyper-tls", "hyper-util", "js-sys", "log", "mime_guess", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", @@ -3632,6 +3978,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.2", "tokio-util", "tower 0.5.2", @@ -3770,7 +4117,20 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustix" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.9.4", "windows-sys 0.59.0", ] @@ -4113,6 +4473,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror 1.0.69", +] + [[package]] name = "serde_spanned" version = "0.6.9" @@ -4308,7 +4679,7 @@ dependencies = [ "crc", "crossbeam-queue", "either", - "event-listener", + "event-listener 5.4.0", "futures-core", "futures-intrusive", "futures-io", @@ -4562,6 +4933,19 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" +[[package]] +name = "tempfile" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +dependencies = [ + "fastrand 2.3.0", + "getrandom 0.3.3", + "once_cell", + "rustix 1.0.7", + "windows-sys 0.59.0", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -4610,6 +4994,7 @@ checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" dependencies = [ "deranged", "itoa", + "js-sys", "libc", "num-conv", "num_threads", @@ -4701,6 +5086,16 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -5263,6 +5658,7 @@ dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -5313,6 +5709,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "waker-fn" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "want" version = "0.3.1" @@ -5322,6 +5724,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -5474,7 +5882,7 @@ dependencies = [ "either", "home", "once_cell", - "rustix", + "rustix 0.38.44", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 4f439b1a..2dcf56b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -114,6 +114,10 @@ json5 = "0.4.1" aws-config = "1.6.2" aws-sdk-s3 = "1.85.0" aws-sdk-sqs = "1.67.0" +azure_core = "0.21.0" +azure_storage = "0.21.0" +azure_storage_blobs = "0.21.0" +time = { version = "0.3", features = ["macros", "serde"] } numpy = "0.25.0" infer = "0.19.0" serde_with = { version = "3.13.0", features = ["base64"] } diff --git a/README.md b/README.md index 8535c3ff..3b52c2f4 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Ultra performant data transformation framework for AI, with core engine written
-CocoIndex makes it super easy to transform data with AI workloads, and keep source data and target in sync effortlessly. +CocoIndex makes it super easy to transform data with AI workloads, and keep source data and target in sync effortlessly.
@@ -39,7 +39,7 @@ CocoIndex makes it super easy to transform data with AI workloads, and keep sour
-Either creating embedding, building knowledge graphs, or any data transformations - beyond traditional SQL. +Either creating embedding, building knowledge graphs, or any data transformations - beyond traditional SQL. ## Exceptional velocity Just declare transformation in dataflow with ~100 lines of python @@ -65,7 +65,7 @@ CocoIndex follows the idea of [Dataflow](https://en.wikipedia.org/wiki/Dataflow_ **Particularly**, developers don't explicitly mutate data by creating, updating and deleting. They just need to define transformation/formula for a set of source data. ## Build like LEGO -Native builtins for different source, targets and transformations. Standardize interface, make it 1-line code switch between different components. +Native builtins for different source, targets and transformations. Standardize interface, make it 1-line code switch between different components.

CocoIndex Features diff --git a/examples/azure_blob_embedding/.env.example b/examples/azure_blob_embedding/.env.example new file mode 100644 index 00000000..abd44053 --- /dev/null +++ b/examples/azure_blob_embedding/.env.example @@ -0,0 +1,22 @@ +# Database Configuration +COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex + +# Azure Blob Storage Configuration (Public test container - ready to use!) +AZURE_STORAGE_ACCOUNT_NAME=testnamecocoindex1 +AZURE_BLOB_CONTAINER_NAME=testpublic1 +AZURE_BLOB_PREFIX= + +# Authentication Options (choose ONE - in priority order): + +# Option 1: Connection String (HIGHEST PRIORITY - recommended for development) +# NOTE: Use ACCOUNT KEY connection string, NOT SAS connection string! +# AZURE_BLOB_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=testnamecocoindex1;AccountKey=key1-goes-here;EndpointSuffix=core.windows.net + +# Option 2: SAS Token (SECOND PRIORITY - recommended for production) +# AZURE_BLOB_SAS_TOKEN=sp=r&st=2024-01-01T00:00:00Z&se=2025-12-31T23:59:59Z&spr=https&sv=2022-11-02&sr=c&sig=... + +# Option 3: Account Key (THIRD PRIORITY) +# AZURE_BLOB_ACCOUNT_KEY=key1-goes-here + +# Option 4: Anonymous access (FALLBACK - for public containers only) +# Leave all auth options commented out - testpublic1 container supports this! diff --git a/examples/azure_blob_embedding/README.md b/examples/azure_blob_embedding/README.md new file mode 100644 index 00000000..89986df0 --- /dev/null +++ b/examples/azure_blob_embedding/README.md @@ -0,0 +1,153 @@ +This example builds an embedding index based on files stored in an Azure Blob Storage container. +It continuously updates the index as files are added / updated / deleted in the source container: +it keeps the index in sync with the Azure Blob Storage container effortlessly. + +## Quick Start (Public Test Container) + +🚀 **Try it immediately!** We provide a public test container with sample documents: +- **Account:** `testnamecocoindex1` +- **Container:** `testpublic1` (public access) +- **No authentication required!** + +Just copy `.env.example` to `.env` and run - it works out of the box with anonymous access. + +## Prerequisite + +Before running the example, you need to: + +1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. + +2. Prepare for Azure Blob Storage. + You'll need an Azure Storage account and container. Supported authentication methods: + - **Connection String** (recommended for development) + - **SAS Token** (recommended for production) + - **Account Key** (full access) + - **Anonymous access** (for public containers only) + +3. Create a `.env` file with your Azure Blob Storage configuration. + Start from copying the `.env.example`, and then edit it to fill in your credentials. + + ```bash + cp .env.example .env + $EDITOR .env + ``` + + Example `.env` file with connection string: + ``` + # Database Configuration + DATABASE_URL=postgresql://localhost:5432/cocoindex + + # Azure Blob Storage Configuration + AZURE_STORAGE_ACCOUNT_NAME=mystorageaccount + AZURE_BLOB_CONTAINER_NAME=mydocuments + AZURE_BLOB_PREFIX= + + # Authentication (choose one) + AZURE_BLOB_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=mystorageaccount;AccountKey=mykey123;EndpointSuffix=core.windows.net + ``` + +## Run + +Install dependencies: + +```sh +pip install -e . +``` + +Run: + +```sh +python main.py +``` + +During running, it will keep observing changes in the Azure Blob Storage container and update the index automatically. +At the same time, it accepts queries from the terminal, and performs search on top of the up-to-date index. + + +## CocoInsight +CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9). + +Run CocoInsight to understand your RAG data pipeline: + +```sh +cocoindex server -ci main.py +``` + +You can also add a `-L` flag to make the server keep updating the index to reflect source changes at the same time: + +```sh +cocoindex server -ci -L main.py +``` + +Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight). + +## Authentication Methods & Troubleshooting + +### Connection String (Recommended for Development) +```bash +AZURE_BLOB_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=testnamecocoindex1;AccountKey=your-key;EndpointSuffix=core.windows.net" +``` +- **Pros:** Easiest to set up, contains all necessary information +- **Cons:** Contains account key (full access) +- **⚠️ Important:** Use **Account Key** connection string, NOT SAS connection string! + +### SAS Token (Recommended for Production) +```bash +AZURE_BLOB_SAS_TOKEN="sp=r&st=2024-01-01T00:00:00Z&se=2025-12-31T23:59:59Z&spr=https&sv=2022-11-02&sr=c&sig=..." +``` +- **Pros:** Fine-grained permissions, time-limited +- **Cons:** More complex to generate and manage + +**SAS Token Requirements:** +- `sp=r` - Read permission (required) +- `sp=rl` - Read + List permissions (recommended) +- `sr=c` - Container scope (to access all blobs) +- Valid time range (`st` and `se` in UTC) + +### Account Key +```bash +AZURE_BLOB_ACCOUNT_KEY="your-account-key-here" +``` +- **Pros:** Simple to use +- **Cons:** Full account access, security risk + +### Anonymous Access +Leave all authentication options empty - only works with public containers. + +## Common Issues + +### 401 Authentication Error +``` +Error: server returned error status which will not be retried: 401 +Error Code: NoAuthenticationInformation +``` + +**Solutions:** +1. **Check authentication priority:** Connection String > SAS Token > Account Key > Anonymous +2. **Verify SAS token permissions:** Must include `r` (read) and `l` (list) permissions +3. **Check SAS token expiry:** Ensure `se` (expiry time) is in the future +4. **Verify container scope:** Use `sr=c` for container-level access + +### Connection String Issues + +**⚠️ CRITICAL: Use Account Key Connection String, NOT SAS Connection String!** + +**✅ Correct (Account Key Connection String):** +``` +DefaultEndpointsProtocol=https;AccountName=testnamecocoindex1;AccountKey=your-key;EndpointSuffix=core.windows.net +``` + +**❌ Wrong (SAS Connection String - will not work):** +``` +BlobEndpoint=https://testnamecocoindex1.blob.core.windows.net/;SharedAccessSignature=sp=r&st=... +``` + +**Other tips:** +- Don't include quotes in the actual connection string value +- Account name in connection string should match `AZURE_STORAGE_ACCOUNT_NAME` +- Connection string must contain `AccountKey=` parameter + +### Container Access Issues +- Verify container exists and account has access +- Check `AZURE_BLOB_CONTAINER_NAME` spelling +- For anonymous access, container must be public diff --git a/examples/azure_blob_embedding/main.py b/examples/azure_blob_embedding/main.py new file mode 100644 index 00000000..d657ee8e --- /dev/null +++ b/examples/azure_blob_embedding/main.py @@ -0,0 +1,131 @@ +from dotenv import load_dotenv +from psycopg_pool import ConnectionPool +import cocoindex +import os +from typing import Any + + +@cocoindex.transform_flow() +def text_to_embedding( + text: cocoindex.DataSlice[str], +) -> cocoindex.DataSlice[list[float]]: + """ + Embed the text using a SentenceTransformer model. + This is a shared logic between indexing and querying, so extract it as a function. + """ + return text.transform( + cocoindex.functions.SentenceTransformerEmbed( + model="sentence-transformers/all-MiniLM-L6-v2" + ) + ) + + +@cocoindex.flow_def(name="AzureBlobTextEmbedding") +def azure_blob_text_embedding_flow( + flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope +) -> None: + """ + Define an example flow that embeds text from Azure Blob Storage into a vector database. + """ + account_name = os.environ["AZURE_STORAGE_ACCOUNT_NAME"] + container_name = os.environ["AZURE_BLOB_CONTAINER_NAME"] + prefix = os.environ.get("AZURE_BLOB_PREFIX", None) + + # Authentication options (in priority order) + connection_string = os.environ.get("AZURE_BLOB_CONNECTION_STRING", None) + account_key = os.environ.get("AZURE_BLOB_ACCOUNT_KEY", None) + sas_token = os.environ.get("AZURE_BLOB_SAS_TOKEN", None) + + data_scope["documents"] = flow_builder.add_source( + cocoindex.sources.AzureBlob( + account_name=account_name, + container_name=container_name, + prefix=prefix, + included_patterns=["*.md", "*.mdx", "*.txt", "*.docx"], + binary=False, + connection_string=connection_string, + account_key=account_key, + sas_token=sas_token, + ) + ) + + doc_embeddings = data_scope.add_collector() + + with data_scope["documents"].row() as doc: + doc["chunks"] = doc["content"].transform( + cocoindex.functions.SplitRecursively(), + language="markdown", + chunk_size=2000, + chunk_overlap=500, + ) + + with doc["chunks"].row() as chunk: + chunk["embedding"] = text_to_embedding(chunk["text"]) + doc_embeddings.collect( + filename=doc["filename"], + location=chunk["location"], + text=chunk["text"], + embedding=chunk["embedding"], + ) + + doc_embeddings.export( + "doc_embeddings", + cocoindex.targets.Postgres(), + primary_key_fields=["filename", "location"], + vector_indexes=[ + cocoindex.VectorIndexDef( + field_name="embedding", + metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY, + ) + ], + ) + + +def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]: + # Get the table name, for the export target in the azure_blob_text_embedding_flow above. + table_name = cocoindex.utils.get_target_default_name( + azure_blob_text_embedding_flow, "doc_embeddings" + ) + # Evaluate the transform flow defined above with the input query, to get the embedding. + query_vector = text_to_embedding.eval(query) + # Run the query and get the results. + with pool.connection() as conn: + with conn.cursor() as cur: + cur.execute( + f""" + SELECT filename, text, embedding <=> %s::vector AS distance + FROM {table_name} ORDER BY distance LIMIT %s + """, + (query_vector, top_k), + ) + return [ + {"filename": row[0], "text": row[1], "score": 1.0 - row[2]} + for row in cur.fetchall() + ] + + +def _main() -> None: + # Initialize the database connection pool. + pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL")) + + azure_blob_text_embedding_flow.setup() + with cocoindex.FlowLiveUpdater(azure_blob_text_embedding_flow): + # Run queries in a loop to demonstrate the query capabilities. + while True: + query = input("Enter search query (or Enter to quit): ") + if query == "": + break + # Run the query function with the database connection pool and the query. + results = search(pool, query) + print("\nSearch results:") + for result in results: + print(f"[{result['score']:.3f}] {result['filename']}") + print(f" {result['text']}") + print("---") + print() + + +if __name__ == "__main__": + load_dotenv() + cocoindex.init() + _main() diff --git a/examples/azure_blob_embedding/pyproject.toml b/examples/azure_blob_embedding/pyproject.toml new file mode 100644 index 00000000..ffb50203 --- /dev/null +++ b/examples/azure_blob_embedding/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "azure-blob-text-embedding" +version = "0.1.0" +description = "Simple example for cocoindex: build embedding index based on Azure Blob Storage files." +requires-python = ">=3.11" +dependencies = ["cocoindex[embeddings]>=0.1.63", "python-dotenv>=1.0.1"] + +[tool.setuptools] +packages = [] diff --git a/python/cocoindex/sources.py b/python/cocoindex/sources.py index d18f9934..0b648b38 100644 --- a/python/cocoindex/sources.py +++ b/python/cocoindex/sources.py @@ -43,3 +43,26 @@ class AmazonS3(op.SourceSpec): included_patterns: list[str] | None = None excluded_patterns: list[str] | None = None sqs_queue_url: str | None = None + + +class AzureBlob(op.SourceSpec): + """Import data from an Azure Blob Storage container. Supports optional prefix and file filtering by glob patterns. + + Authentication options (in priority order): + 1. connection_string - Full connection string with credentials + 2. sas_token - Shared Access Signature token + 3. account_key - Storage account access key + 4. None - Anonymous access (for public containers) + """ + + _op_category = op.OpCategory.SOURCE + + account_name: str + container_name: str + prefix: str | None = None + binary: bool = False + included_patterns: list[str] | None = None + excluded_patterns: list[str] | None = None + account_key: str | None = None + sas_token: str | None = None + connection_string: str | None = None diff --git a/src/ops/registration.rs b/src/ops/registration.rs index a7710b9c..b000ffa2 100644 --- a/src/ops/registration.rs +++ b/src/ops/registration.rs @@ -11,6 +11,7 @@ fn register_executor_factories(registry: &mut ExecutorFactoryRegistry) -> Result sources::local_file::Factory.register(registry)?; sources::google_drive::Factory.register(registry)?; sources::amazon_s3::Factory.register(registry)?; + sources::azure_blob::Factory.register(registry)?; functions::parse_json::Factory.register(registry)?; functions::split_recursively::register(registry)?; diff --git a/src/ops/sources/azure_blob.rs b/src/ops/sources/azure_blob.rs new file mode 100644 index 00000000..eed979d8 --- /dev/null +++ b/src/ops/sources/azure_blob.rs @@ -0,0 +1,265 @@ +use crate::fields_value; +use async_stream::try_stream; +use azure_storage::prelude::*; +use azure_storage_blobs::prelude::*; +use futures::StreamExt; +use globset::{Glob, GlobSet, GlobSetBuilder}; +use std::sync::Arc; + +use crate::base::field_attrs; +use crate::ops::sdk::*; + +#[derive(Debug, Deserialize)] +pub struct Spec { + account_name: String, + container_name: String, + prefix: Option, + binary: bool, + included_patterns: Option>, + excluded_patterns: Option>, + /// Account key for authentication. If not provided, will use anonymous access. + account_key: Option, + /// SAS token for authentication. Takes precedence over account_key. + sas_token: Option, + /// Connection string for authentication. Takes precedence over other auth methods. + connection_string: Option, +} + +struct Executor { + client: BlobServiceClient, + container_name: String, + prefix: Option, + binary: bool, + included_glob_set: Option, + excluded_glob_set: Option, +} + +impl Executor { + fn is_excluded(&self, key: &str) -> bool { + self.excluded_glob_set + .as_ref() + .is_some_and(|glob_set| glob_set.is_match(key)) + } + + fn is_file_included(&self, key: &str) -> bool { + self.included_glob_set + .as_ref() + .is_none_or(|glob_set| glob_set.is_match(key)) + && !self.is_excluded(key) + } +} + +fn datetime_to_ordinal(dt: &time::OffsetDateTime) -> Ordinal { + Ordinal(Some(dt.unix_timestamp_nanos() as i64 / 1000)) +} + +#[async_trait] +impl SourceExecutor for Executor { + fn list<'a>( + &'a self, + _options: &'a SourceExecutorListOptions, + ) -> BoxStream<'a, Result>> { + try_stream! { + let mut continuation_token: Option = None; + loop { + let mut list_builder = self.client + .container_client(&self.container_name) + .list_blobs(); + + if let Some(p) = &self.prefix { + list_builder = list_builder.prefix(p.clone()); + } + + if let Some(ref token) = continuation_token { + list_builder = list_builder.marker(token.as_str()); + } + + let mut page_stream = list_builder.into_stream(); + + if let Some(page_result) = page_stream.next().await { + let page = page_result?; + let mut batch = Vec::new(); + + for blob in page.blobs.blobs() { + let key = &blob.name; + + // Only include files (not directories) + if key.ends_with('/') { continue; } + + if self.is_file_included(key) { + let ordinal = Some(datetime_to_ordinal(&blob.properties.last_modified)); + batch.push(PartialSourceRowMetadata { + key: KeyValue::Str(key.clone().into()), + ordinal, + }); + } + } + + if !batch.is_empty() { + yield batch; + } + + if let Some(next_marker) = page.next_marker { + continuation_token = Some(format!("{:?}", next_marker)); + } else { + break; + } + } else { + break; + } + } + } + .boxed() + } + + async fn get_value( + &self, + key: &KeyValue, + options: &SourceExecutorGetOptions, + ) -> Result { + let key_str = key.str_value()?; + if !self.is_file_included(key_str) { + return Ok(PartialSourceRowData { + value: Some(SourceValue::NonExistence), + ordinal: Some(Ordinal::unavailable()), + }); + } + + let blob_client = self + .client + .container_client(&self.container_name) + .blob_client(key_str.as_ref()); + + let mut stream = blob_client.get().into_stream(); + let result = stream.next().await; + + let blob_response = match result { + Some(Ok(response)) => response, + Some(Err(_)) | None => { + return Ok(PartialSourceRowData { + value: Some(SourceValue::NonExistence), + ordinal: Some(Ordinal::unavailable()), + }); + } + }; + + let ordinal = if options.include_ordinal { + Some(datetime_to_ordinal( + &blob_response.blob.properties.last_modified, + )) + } else { + None + }; + + let value = if options.include_value { + let bytes = blob_response.data.collect().await?; + Some(SourceValue::Existence(if self.binary { + fields_value!(bytes) + } else { + fields_value!(String::from_utf8_lossy(&bytes).to_string()) + })) + } else { + None + }; + + Ok(PartialSourceRowData { value, ordinal }) + } + + async fn change_stream( + &self, + ) -> Result>>> { + // Azure Blob Storage doesn't have built-in change notifications like S3+SQS + Ok(None) + } +} + +pub struct Factory; + +#[async_trait] +impl SourceFactoryBase for Factory { + type Spec = Spec; + + fn name(&self) -> &str { + "AzureBlob" + } + + async fn get_output_schema( + &self, + spec: &Spec, + _context: &FlowInstanceContext, + ) -> Result { + let mut struct_schema = StructSchema::default(); + let mut schema_builder = StructSchemaBuilder::new(&mut struct_schema); + let filename_field = schema_builder.add_field(FieldSchema::new( + "filename", + make_output_type(BasicValueType::Str), + )); + schema_builder.add_field(FieldSchema::new( + "content", + make_output_type(if spec.binary { + BasicValueType::Bytes + } else { + BasicValueType::Str + }) + .with_attr( + field_attrs::CONTENT_FILENAME, + serde_json::to_value(filename_field.to_field_ref())?, + ), + )); + Ok(make_output_type(TableSchema::new( + TableKind::KTable, + struct_schema, + ))) + } + + async fn build_executor( + self: Arc, + spec: Spec, + _context: Arc, + ) -> Result> { + let account_name = spec.account_name.clone(); + let storage_credentials = if let Some(connection_string) = spec.connection_string { + // Parse connection string to extract account key + // Format: "DefaultEndpointsProtocol=https;AccountName=myaccount;AccountKey=mykey;EndpointSuffix=core.windows.net" + let mut account_key_from_conn_str = None; + for part in connection_string.split(';') { + if let Some(key) = part.strip_prefix("AccountKey=") { + account_key_from_conn_str = Some(key.to_string()); + break; + } + } + if let Some(key) = account_key_from_conn_str { + StorageCredentials::access_key(&account_name, key) + } else { + return Err(anyhow::anyhow!( + "Invalid connection string: missing AccountKey" + )); + } + } else if let Some(sas_token) = spec.sas_token { + StorageCredentials::sas_token(sas_token)? + } else if let Some(account_key) = spec.account_key { + StorageCredentials::access_key(&account_name, account_key) + } else { + StorageCredentials::anonymous() + }; + + let client = BlobServiceClient::new(&account_name, storage_credentials); + + Ok(Box::new(Executor { + client, + container_name: spec.container_name, + prefix: spec.prefix, + binary: spec.binary, + included_glob_set: spec.included_patterns.map(build_glob_set).transpose()?, + excluded_glob_set: spec.excluded_patterns.map(build_glob_set).transpose()?, + })) + } +} + +fn build_glob_set(patterns: Vec) -> Result { + let mut builder = GlobSetBuilder::new(); + for pattern in patterns { + builder.add(Glob::new(pattern.as_str())?); + } + Ok(builder.build()?) +} diff --git a/src/ops/sources/mod.rs b/src/ops/sources/mod.rs index 557f44f7..a7725aaf 100644 --- a/src/ops/sources/mod.rs +++ b/src/ops/sources/mod.rs @@ -1,3 +1,4 @@ pub mod amazon_s3; +pub mod azure_blob; pub mod google_drive; pub mod local_file;