diff --git a/Cargo.lock b/Cargo.lock index f40aa718..bc91cb76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "RustyXML" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" + [[package]] name = "addr2line" version = "0.24.2" @@ -121,6 +127,28 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d902e3d592a523def97af8f317b08ce16b7ab854c1985a0c671e6f15cebc236" +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener 2.5.3", + "futures-core", +] + +[[package]] +name = "async-lock" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18" +dependencies = [ + "event-listener 5.4.0", + "event-listener-strategy", + "pin-project-lite", +] + [[package]] name = "async-openai" version = "0.28.3" @@ -231,7 +259,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "hex", "http 1.3.1", "ring", @@ -293,7 +321,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http-body 0.4.6", "percent-encoding", @@ -322,7 +350,7 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "hex", "hmac", "http 0.2.12", @@ -352,7 +380,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -374,7 +402,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -396,7 +424,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -419,7 +447,7 @@ dependencies = [ "aws-smithy-types", "aws-smithy-xml", "aws-types", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -586,7 +614,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http 1.3.1", "http-body 0.4.6", @@ -789,6 +817,92 @@ dependencies = [ "tower-service", ] +[[package]] +name = "azure_core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b552ad43a45a746461ec3d3a51dfb6466b4759209414b439c165eb6a6b7729e" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "dyn-clone", + "futures", + "getrandom 0.2.16", + "hmac", + "http-types", + "once_cell", + "paste", + "pin-project", + "quick-xml", + "rand 0.8.5", + "reqwest", + "rustc_version", + "serde", + "serde_json", + "sha2", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_storage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f838159f4d29cb400a14d9d757578ba495ae64feb07a7516bf9e4415127126" +dependencies = [ + "RustyXML", + "async-lock", + "async-trait", + "azure_core", + "bytes", + "serde", + "serde_derive", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_storage_blobs" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97e83c3636ae86d9a6a7962b2112e3b19eb3903915c50ce06ff54ff0a2e6a7e4" +dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "azure_svc_blobstorage", + "bytes", + "futures", + "serde", + "serde_derive", + "serde_json", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_svc_blobstorage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e6c6f20c5611b885ba94c7bae5e02849a267381aecb8aee577e8c35ff4064c6" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + "once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backoff" version = "0.4.0" @@ -824,6 +938,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -1056,6 +1176,9 @@ dependencies = [ "aws-sdk-sqs", "axum 0.8.4", "axum-extra", + "azure_core", + "azure_storage", + "azure_storage_blobs", "base64 0.22.1", "blake2", "bytes", @@ -1074,7 +1197,7 @@ dependencies = [ "indenter", "indexmap 2.10.0", "indoc", - "infer", + "infer 0.19.0", "itertools 0.14.0", "json5", "log", @@ -1096,6 +1219,7 @@ dependencies = [ "serde_json", "serde_with", "sqlx", + "time", "tokio", "tokio-stream", "tower 0.5.2", @@ -1635,6 +1759,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "event-listener" version = "5.4.0" @@ -1646,6 +1776,16 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener 5.4.0", + "pin-project-lite", +] + [[package]] name = "eventsource-stream" version = "0.2.3" @@ -1657,6 +1797,15 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -1706,6 +1855,21 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1780,6 +1944,21 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + [[package]] name = "futures-macro" version = "0.3.31" @@ -1837,6 +2016,17 @@ dependencies = [ "version_check", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -2127,6 +2317,26 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel", + "base64 0.13.1", + "futures-lite", + "infer 0.2.3", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.10.1" @@ -2232,6 +2442,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.6.0", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.15" @@ -2427,6 +2653,12 @@ version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "infer" version = "0.19.0" @@ -2619,6 +2851,12 @@ version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" +[[package]] +name = "linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + [[package]] name = "litemap" version = "0.8.0" @@ -2745,6 +2983,23 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -2926,12 +3181,50 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +[[package]] +name = "openssl" +version = "0.10.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "openssl-probe" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" +[[package]] +name = "openssl-sys" +version = "0.9.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "ordered-multimap" version = "0.7.3" @@ -3402,6 +3695,16 @@ dependencies = [ "tonic", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quinn" version = "0.11.8" @@ -3472,6 +3775,19 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -3493,6 +3809,16 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -3513,6 +3839,15 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -3531,6 +3866,15 @@ dependencies = [ "getrandom 0.3.3", ] +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rawpointer" version = "0.2.1" @@ -3617,10 +3961,12 @@ dependencies = [ "http-body-util", "hyper 1.6.0", "hyper-rustls 0.27.7", + "hyper-tls", "hyper-util", "js-sys", "log", "mime_guess", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", @@ -3632,6 +3978,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.2", "tokio-util", "tower 0.5.2", @@ -3770,7 +4117,20 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustix" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.9.4", "windows-sys 0.59.0", ] @@ -4113,6 +4473,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror 1.0.69", +] + [[package]] name = "serde_spanned" version = "0.6.9" @@ -4308,7 +4679,7 @@ dependencies = [ "crc", "crossbeam-queue", "either", - "event-listener", + "event-listener 5.4.0", "futures-core", "futures-intrusive", "futures-io", @@ -4562,6 +4933,19 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" +[[package]] +name = "tempfile" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +dependencies = [ + "fastrand 2.3.0", + "getrandom 0.3.3", + "once_cell", + "rustix 1.0.7", + "windows-sys 0.59.0", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -4610,6 +4994,7 @@ checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" dependencies = [ "deranged", "itoa", + "js-sys", "libc", "num-conv", "num_threads", @@ -4701,6 +5086,16 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -5263,6 +5658,7 @@ dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -5313,6 +5709,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "waker-fn" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "want" version = "0.3.1" @@ -5322,6 +5724,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -5474,7 +5882,7 @@ dependencies = [ "either", "home", "once_cell", - "rustix", + "rustix 0.38.44", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 4f439b1a..2dcf56b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -114,6 +114,10 @@ json5 = "0.4.1" aws-config = "1.6.2" aws-sdk-s3 = "1.85.0" aws-sdk-sqs = "1.67.0" +azure_core = "0.21.0" +azure_storage = "0.21.0" +azure_storage_blobs = "0.21.0" +time = { version = "0.3", features = ["macros", "serde"] } numpy = "0.25.0" infer = "0.19.0" serde_with = { version = "3.13.0", features = ["base64"] } diff --git a/README.md b/README.md index 8535c3ff..3b52c2f4 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Ultra performant data transformation framework for AI, with core engine written -CocoIndex makes it super easy to transform data with AI workloads, and keep source data and target in sync effortlessly. +CocoIndex makes it super easy to transform data with AI workloads, and keep source data and target in sync effortlessly. @@ -39,7 +39,7 @@ CocoIndex makes it super easy to transform data with AI workloads, and keep sour -Either creating embedding, building knowledge graphs, or any data transformations - beyond traditional SQL. +Either creating embedding, building knowledge graphs, or any data transformations - beyond traditional SQL. ## Exceptional velocity Just declare transformation in dataflow with ~100 lines of python @@ -65,7 +65,7 @@ CocoIndex follows the idea of [Dataflow](https://en.wikipedia.org/wiki/Dataflow_ **Particularly**, developers don't explicitly mutate data by creating, updating and deleting. They just need to define transformation/formula for a set of source data. ## Build like LEGO -Native builtins for different source, targets and transformations. Standardize interface, make it 1-line code switch between different components. +Native builtins for different source, targets and transformations. Standardize interface, make it 1-line code switch between different components.
diff --git a/examples/azure_blob_embedding/.env.example b/examples/azure_blob_embedding/.env.example
new file mode 100644
index 00000000..abd44053
--- /dev/null
+++ b/examples/azure_blob_embedding/.env.example
@@ -0,0 +1,22 @@
+# Database Configuration
+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
+
+# Azure Blob Storage Configuration (Public test container - ready to use!)
+AZURE_STORAGE_ACCOUNT_NAME=testnamecocoindex1
+AZURE_BLOB_CONTAINER_NAME=testpublic1
+AZURE_BLOB_PREFIX=
+
+# Authentication Options (choose ONE - in priority order):
+
+# Option 1: Connection String (HIGHEST PRIORITY - recommended for development)
+# NOTE: Use ACCOUNT KEY connection string, NOT SAS connection string!
+# AZURE_BLOB_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=testnamecocoindex1;AccountKey=key1-goes-here;EndpointSuffix=core.windows.net
+
+# Option 2: SAS Token (SECOND PRIORITY - recommended for production)
+# AZURE_BLOB_SAS_TOKEN=sp=r&st=2024-01-01T00:00:00Z&se=2025-12-31T23:59:59Z&spr=https&sv=2022-11-02&sr=c&sig=...
+
+# Option 3: Account Key (THIRD PRIORITY)
+# AZURE_BLOB_ACCOUNT_KEY=key1-goes-here
+
+# Option 4: Anonymous access (FALLBACK - for public containers only)
+# Leave all auth options commented out - testpublic1 container supports this!
diff --git a/examples/azure_blob_embedding/README.md b/examples/azure_blob_embedding/README.md
new file mode 100644
index 00000000..89986df0
--- /dev/null
+++ b/examples/azure_blob_embedding/README.md
@@ -0,0 +1,153 @@
+This example builds an embedding index based on files stored in an Azure Blob Storage container.
+It continuously updates the index as files are added / updated / deleted in the source container:
+it keeps the index in sync with the Azure Blob Storage container effortlessly.
+
+## Quick Start (Public Test Container)
+
+🚀 **Try it immediately!** We provide a public test container with sample documents:
+- **Account:** `testnamecocoindex1`
+- **Container:** `testpublic1` (public access)
+- **No authentication required!**
+
+Just copy `.env.example` to `.env` and run - it works out of the box with anonymous access.
+
+## Prerequisite
+
+Before running the example, you need to:
+
+1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
+
+2. Prepare for Azure Blob Storage.
+ You'll need an Azure Storage account and container. Supported authentication methods:
+ - **Connection String** (recommended for development)
+ - **SAS Token** (recommended for production)
+ - **Account Key** (full access)
+ - **Anonymous access** (for public containers only)
+
+3. Create a `.env` file with your Azure Blob Storage configuration.
+ Start from copying the `.env.example`, and then edit it to fill in your credentials.
+
+ ```bash
+ cp .env.example .env
+ $EDITOR .env
+ ```
+
+ Example `.env` file with connection string:
+ ```
+ # Database Configuration
+ DATABASE_URL=postgresql://localhost:5432/cocoindex
+
+ # Azure Blob Storage Configuration
+ AZURE_STORAGE_ACCOUNT_NAME=mystorageaccount
+ AZURE_BLOB_CONTAINER_NAME=mydocuments
+ AZURE_BLOB_PREFIX=
+
+ # Authentication (choose one)
+ AZURE_BLOB_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=mystorageaccount;AccountKey=mykey123;EndpointSuffix=core.windows.net
+ ```
+
+## Run
+
+Install dependencies:
+
+```sh
+pip install -e .
+```
+
+Run:
+
+```sh
+python main.py
+```
+
+During running, it will keep observing changes in the Azure Blob Storage container and update the index automatically.
+At the same time, it accepts queries from the terminal, and performs search on top of the up-to-date index.
+
+
+## CocoInsight
+CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
+
+Run CocoInsight to understand your RAG data pipeline:
+
+```sh
+cocoindex server -ci main.py
+```
+
+You can also add a `-L` flag to make the server keep updating the index to reflect source changes at the same time:
+
+```sh
+cocoindex server -ci -L main.py
+```
+
+Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
+
+## Authentication Methods & Troubleshooting
+
+### Connection String (Recommended for Development)
+```bash
+AZURE_BLOB_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=testnamecocoindex1;AccountKey=your-key;EndpointSuffix=core.windows.net"
+```
+- **Pros:** Easiest to set up, contains all necessary information
+- **Cons:** Contains account key (full access)
+- **⚠️ Important:** Use **Account Key** connection string, NOT SAS connection string!
+
+### SAS Token (Recommended for Production)
+```bash
+AZURE_BLOB_SAS_TOKEN="sp=r&st=2024-01-01T00:00:00Z&se=2025-12-31T23:59:59Z&spr=https&sv=2022-11-02&sr=c&sig=..."
+```
+- **Pros:** Fine-grained permissions, time-limited
+- **Cons:** More complex to generate and manage
+
+**SAS Token Requirements:**
+- `sp=r` - Read permission (required)
+- `sp=rl` - Read + List permissions (recommended)
+- `sr=c` - Container scope (to access all blobs)
+- Valid time range (`st` and `se` in UTC)
+
+### Account Key
+```bash
+AZURE_BLOB_ACCOUNT_KEY="your-account-key-here"
+```
+- **Pros:** Simple to use
+- **Cons:** Full account access, security risk
+
+### Anonymous Access
+Leave all authentication options empty - only works with public containers.
+
+## Common Issues
+
+### 401 Authentication Error
+```
+Error: server returned error status which will not be retried: 401
+Error Code: NoAuthenticationInformation
+```
+
+**Solutions:**
+1. **Check authentication priority:** Connection String > SAS Token > Account Key > Anonymous
+2. **Verify SAS token permissions:** Must include `r` (read) and `l` (list) permissions
+3. **Check SAS token expiry:** Ensure `se` (expiry time) is in the future
+4. **Verify container scope:** Use `sr=c` for container-level access
+
+### Connection String Issues
+
+**⚠️ CRITICAL: Use Account Key Connection String, NOT SAS Connection String!**
+
+**✅ Correct (Account Key Connection String):**
+```
+DefaultEndpointsProtocol=https;AccountName=testnamecocoindex1;AccountKey=your-key;EndpointSuffix=core.windows.net
+```
+
+**❌ Wrong (SAS Connection String - will not work):**
+```
+BlobEndpoint=https://testnamecocoindex1.blob.core.windows.net/;SharedAccessSignature=sp=r&st=...
+```
+
+**Other tips:**
+- Don't include quotes in the actual connection string value
+- Account name in connection string should match `AZURE_STORAGE_ACCOUNT_NAME`
+- Connection string must contain `AccountKey=` parameter
+
+### Container Access Issues
+- Verify container exists and account has access
+- Check `AZURE_BLOB_CONTAINER_NAME` spelling
+- For anonymous access, container must be public
diff --git a/examples/azure_blob_embedding/main.py b/examples/azure_blob_embedding/main.py
new file mode 100644
index 00000000..d657ee8e
--- /dev/null
+++ b/examples/azure_blob_embedding/main.py
@@ -0,0 +1,131 @@
+from dotenv import load_dotenv
+from psycopg_pool import ConnectionPool
+import cocoindex
+import os
+from typing import Any
+
+
+@cocoindex.transform_flow()
+def text_to_embedding(
+ text: cocoindex.DataSlice[str],
+) -> cocoindex.DataSlice[list[float]]:
+ """
+ Embed the text using a SentenceTransformer model.
+ This is a shared logic between indexing and querying, so extract it as a function.
+ """
+ return text.transform(
+ cocoindex.functions.SentenceTransformerEmbed(
+ model="sentence-transformers/all-MiniLM-L6-v2"
+ )
+ )
+
+
+@cocoindex.flow_def(name="AzureBlobTextEmbedding")
+def azure_blob_text_embedding_flow(
+ flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
+) -> None:
+ """
+ Define an example flow that embeds text from Azure Blob Storage into a vector database.
+ """
+ account_name = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
+ container_name = os.environ["AZURE_BLOB_CONTAINER_NAME"]
+ prefix = os.environ.get("AZURE_BLOB_PREFIX", None)
+
+ # Authentication options (in priority order)
+ connection_string = os.environ.get("AZURE_BLOB_CONNECTION_STRING", None)
+ account_key = os.environ.get("AZURE_BLOB_ACCOUNT_KEY", None)
+ sas_token = os.environ.get("AZURE_BLOB_SAS_TOKEN", None)
+
+ data_scope["documents"] = flow_builder.add_source(
+ cocoindex.sources.AzureBlob(
+ account_name=account_name,
+ container_name=container_name,
+ prefix=prefix,
+ included_patterns=["*.md", "*.mdx", "*.txt", "*.docx"],
+ binary=False,
+ connection_string=connection_string,
+ account_key=account_key,
+ sas_token=sas_token,
+ )
+ )
+
+ doc_embeddings = data_scope.add_collector()
+
+ with data_scope["documents"].row() as doc:
+ doc["chunks"] = doc["content"].transform(
+ cocoindex.functions.SplitRecursively(),
+ language="markdown",
+ chunk_size=2000,
+ chunk_overlap=500,
+ )
+
+ with doc["chunks"].row() as chunk:
+ chunk["embedding"] = text_to_embedding(chunk["text"])
+ doc_embeddings.collect(
+ filename=doc["filename"],
+ location=chunk["location"],
+ text=chunk["text"],
+ embedding=chunk["embedding"],
+ )
+
+ doc_embeddings.export(
+ "doc_embeddings",
+ cocoindex.targets.Postgres(),
+ primary_key_fields=["filename", "location"],
+ vector_indexes=[
+ cocoindex.VectorIndexDef(
+ field_name="embedding",
+ metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
+ )
+ ],
+ )
+
+
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
+ # Get the table name, for the export target in the azure_blob_text_embedding_flow above.
+ table_name = cocoindex.utils.get_target_default_name(
+ azure_blob_text_embedding_flow, "doc_embeddings"
+ )
+ # Evaluate the transform flow defined above with the input query, to get the embedding.
+ query_vector = text_to_embedding.eval(query)
+ # Run the query and get the results.
+ with pool.connection() as conn:
+ with conn.cursor() as cur:
+ cur.execute(
+ f"""
+ SELECT filename, text, embedding <=> %s::vector AS distance
+ FROM {table_name} ORDER BY distance LIMIT %s
+ """,
+ (query_vector, top_k),
+ )
+ return [
+ {"filename": row[0], "text": row[1], "score": 1.0 - row[2]}
+ for row in cur.fetchall()
+ ]
+
+
+def _main() -> None:
+ # Initialize the database connection pool.
+ pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
+
+ azure_blob_text_embedding_flow.setup()
+ with cocoindex.FlowLiveUpdater(azure_blob_text_embedding_flow):
+ # Run queries in a loop to demonstrate the query capabilities.
+ while True:
+ query = input("Enter search query (or Enter to quit): ")
+ if query == "":
+ break
+ # Run the query function with the database connection pool and the query.
+ results = search(pool, query)
+ print("\nSearch results:")
+ for result in results:
+ print(f"[{result['score']:.3f}] {result['filename']}")
+ print(f" {result['text']}")
+ print("---")
+ print()
+
+
+if __name__ == "__main__":
+ load_dotenv()
+ cocoindex.init()
+ _main()
diff --git a/examples/azure_blob_embedding/pyproject.toml b/examples/azure_blob_embedding/pyproject.toml
new file mode 100644
index 00000000..ffb50203
--- /dev/null
+++ b/examples/azure_blob_embedding/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "azure-blob-text-embedding"
+version = "0.1.0"
+description = "Simple example for cocoindex: build embedding index based on Azure Blob Storage files."
+requires-python = ">=3.11"
+dependencies = ["cocoindex[embeddings]>=0.1.63", "python-dotenv>=1.0.1"]
+
+[tool.setuptools]
+packages = []
diff --git a/python/cocoindex/sources.py b/python/cocoindex/sources.py
index d18f9934..0b648b38 100644
--- a/python/cocoindex/sources.py
+++ b/python/cocoindex/sources.py
@@ -43,3 +43,26 @@ class AmazonS3(op.SourceSpec):
included_patterns: list[str] | None = None
excluded_patterns: list[str] | None = None
sqs_queue_url: str | None = None
+
+
+class AzureBlob(op.SourceSpec):
+ """Import data from an Azure Blob Storage container. Supports optional prefix and file filtering by glob patterns.
+
+ Authentication options (in priority order):
+ 1. connection_string - Full connection string with credentials
+ 2. sas_token - Shared Access Signature token
+ 3. account_key - Storage account access key
+ 4. None - Anonymous access (for public containers)
+ """
+
+ _op_category = op.OpCategory.SOURCE
+
+ account_name: str
+ container_name: str
+ prefix: str | None = None
+ binary: bool = False
+ included_patterns: list[str] | None = None
+ excluded_patterns: list[str] | None = None
+ account_key: str | None = None
+ sas_token: str | None = None
+ connection_string: str | None = None
diff --git a/src/ops/registration.rs b/src/ops/registration.rs
index a7710b9c..b000ffa2 100644
--- a/src/ops/registration.rs
+++ b/src/ops/registration.rs
@@ -11,6 +11,7 @@ fn register_executor_factories(registry: &mut ExecutorFactoryRegistry) -> Result
sources::local_file::Factory.register(registry)?;
sources::google_drive::Factory.register(registry)?;
sources::amazon_s3::Factory.register(registry)?;
+ sources::azure_blob::Factory.register(registry)?;
functions::parse_json::Factory.register(registry)?;
functions::split_recursively::register(registry)?;
diff --git a/src/ops/sources/azure_blob.rs b/src/ops/sources/azure_blob.rs
new file mode 100644
index 00000000..eed979d8
--- /dev/null
+++ b/src/ops/sources/azure_blob.rs
@@ -0,0 +1,265 @@
+use crate::fields_value;
+use async_stream::try_stream;
+use azure_storage::prelude::*;
+use azure_storage_blobs::prelude::*;
+use futures::StreamExt;
+use globset::{Glob, GlobSet, GlobSetBuilder};
+use std::sync::Arc;
+
+use crate::base::field_attrs;
+use crate::ops::sdk::*;
+
+#[derive(Debug, Deserialize)]
+pub struct Spec {
+ account_name: String,
+ container_name: String,
+ prefix: Option