diff --git a/Cargo.lock b/Cargo.lock index 4bf3fe709..cfb1f36d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2053,6 +2053,8 @@ dependencies = [ "hostname", "hyper 1.1.0", "hyper-util", + "indexify_api", + "indexify_extractor", "indexify_internal_api", "indexify_proto", "insta", @@ -2118,6 +2120,41 @@ dependencies = [ "walkdir", ] +[[package]] +name = "indexify_api" +version = "0.1.0" +dependencies = [ + "anyhow", + "axum 0.7.4", + "hyper 1.1.0", + "indexify_internal_api", + "indexify_proto", + "serde", + "serde_json", + "serde_with", + "smart-default", + "strum", + "utoipa", +] + +[[package]] +name = "indexify_extractor" +version = "0.1.0" +dependencies = [ + "anyhow", + "askama", + "async-trait", + "bollard", + "indexify_api", + "indexify_internal_api", + "mime", + "pyo3", + "serde", + "serde_json", + "tokio-stream", + "tracing", +] + [[package]] name = "indexify_internal_api" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 5b0acaba4..53ea2ff08 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ build = "build.rs" resolver = "2" [workspace] -members = [".", "crates/indexify_internal_api", "crates/indexify_proto"] +members = [".", "crates/indexify_internal_api", "crates/indexify_proto", "crates/indexify_extractor", "crates/indexify_api"] [workspace.dependencies] anyerror = "*" @@ -32,6 +32,8 @@ hostname = { version = "0.3" } hyper = { version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["service"] } itertools = "0.12" +indexify_api = { path = "crates/indexify_api" } +indexify_extractor = { path = "crates/indexify_extractor" } indexify_internal_api = { path = "crates/indexify_internal_api" } indexify_proto = { path = "crates/indexify_proto" } jsonschema = {version = "0.17", default_features=false, features=["draft202012"]} @@ -134,6 +136,8 @@ h2 = { workspace = true } hostname = { workspace = true } hyper = { workspace = true } hyper-util = { workspace = true } +indexify_api = { workspace = true } +indexify_extractor = { 
workspace = true } indexify_proto = { workspace = true } indexify_internal_api = { workspace = true } itertools = { workspace = true } diff --git a/crates/indexify_api/Cargo.toml b/crates/indexify_api/Cargo.toml new file mode 100644 index 000000000..a0c84c6cf --- /dev/null +++ b/crates/indexify_api/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "indexify_api" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +crate-type = ["rlib"] + +[dependencies] +anyhow = { workspace = true } +axum = { workspace = true } +hyper = { workspace = true } +indexify_internal_api = { workspace = true } +indexify_proto = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +serde_with = { workspace = true } +smart-default = { workspace = true } +strum = { workspace = true } +utoipa = { workspace = true } \ No newline at end of file diff --git a/src/api_utils.rs b/crates/indexify_api/src/api_utils.rs similarity index 99% rename from src/api_utils.rs rename to crates/indexify_api/src/api_utils.rs index 46921753c..765716812 100644 --- a/src/api_utils.rs +++ b/crates/indexify_api/src/api_utils.rs @@ -298,7 +298,7 @@ mod test_deserialize_labels_eq_filter { use hyper::Uri; use super::*; - use crate::api::ListContentFilters; + use crate::ListContentFilters; /// 1. 
?source=foo&labels_eq=key:value #[test] diff --git a/crates/indexify_api/src/lib.rs b/crates/indexify_api/src/lib.rs new file mode 100644 index 000000000..ae8da28b6 --- /dev/null +++ b/crates/indexify_api/src/lib.rs @@ -0,0 +1,457 @@ +mod api_utils; + +use std::collections::HashMap; + +use anyhow::Result; +use axum::{ + http::StatusCode, + response::{IntoResponse, Response}, +}; +use indexify_internal_api as internal_api; +use indexify_proto::indexify_coordinator; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, BytesOrString}; +use smart_default::SmartDefault; +use strum::{Display, EnumString}; +use utoipa::{IntoParams, ToSchema}; + +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct ExtractorBinding { + pub extractor: String, + pub name: String, + #[serde(default, deserialize_with = "api_utils::deserialize_labels_eq_filter")] + pub filters_eq: Option>, + pub input_params: Option, + pub content_source: Option, +} + +impl From for indexify_coordinator::ExtractorBinding { + fn from(value: ExtractorBinding) -> Self { + Self { + extractor: value.extractor, + name: value.name, + filters: value.filters_eq.unwrap_or_default(), + input_params: value + .input_params + .map(|v| v.to_string()) + .unwrap_or("{}".to_string()), + content_source: value.content_source.unwrap_or("ingestion".to_string()), + } + } +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct DataRepository { + pub name: String, + pub extractor_bindings: Vec, +} + +impl TryFrom for DataRepository { + type Error = anyhow::Error; + + fn try_from(value: indexify_coordinator::Repository) -> Result { + let mut extractor_bindings = Vec::new(); + for binding in value.bindings { + extractor_bindings.push(ExtractorBinding { + extractor: binding.extractor, + name: binding.name, + filters_eq: Some(binding.filters), + input_params: Some(serde_json::from_str(&binding.input_params)?), + content_source: Some(binding.content_source), + }); + } + Ok(Self { + 
name: value.name, + extractor_bindings, + }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, SmartDefault, ToSchema)] +pub struct CreateRepository { + pub name: String, + pub extractor_bindings: Vec, + pub labels: HashMap, +} + +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct CreateRepositoryResponse {} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GetRepositoryResponse { + pub repository: DataRepository, +} + +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct ListRepositoriesResponse { + pub repositories: Vec, +} + +#[derive(Display, EnumString, Debug, Serialize, Deserialize, Clone, Default, ToSchema)] +#[serde(rename = "distance")] +pub enum IndexDistance { + #[serde(rename = "dot")] + #[strum(serialize = "dot")] + #[default] + Dot, + + #[serde(rename = "cosine")] + #[strum(serialize = "cosine")] + Cosine, + + #[serde(rename = "euclidean")] + #[strum(serialize = "euclidean")] + Euclidean, +} + +/// Request payload for creating a new vector index. 
+#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)] +pub struct ExtractorBindRequest { + #[serde(flatten)] + pub extractor_binding: ExtractorBinding, +} + +#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] +pub struct ExtractorBindResponse { + #[serde(default)] + pub index_names: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct Text { + pub text: String, + #[serde(default)] + pub labels: HashMap, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct TextAddRequest { + pub documents: Vec, + pub sync: Option, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct RunExtractorsResponse {} + +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct EmbeddingSchema { + pub dim: usize, + pub distance: IndexDistance, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Display, ToSchema)] +#[serde(untagged)] +pub enum ExtractorOutputSchema { + #[serde(rename = "embedding")] + Embedding(EmbeddingSchema), + #[serde(rename = "metadata")] + Metadata(serde_json::Value), +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct ExtractorDescription { + pub name: String, + pub input_mime_types: Vec, + pub description: String, + pub input_params: serde_json::Value, + pub outputs: HashMap, +} + +impl From for indexify_coordinator::Extractor { + fn from(value: ExtractorDescription) -> Self { + let outputs = value + .outputs + .into_iter() + .map(|(k, v)| (k, v.to_string())) + .collect(); + Self { + name: value.name, + description: value.description, + input_params: value.input_params.to_string(), + outputs, + input_mime_types: value.input_mime_types, + } + } +} + +impl From for internal_api::ExtractorDescription { + fn from(extractor: ExtractorDescription) -> internal_api::ExtractorDescription { + let mut output_schema = HashMap::new(); + for (output_name, embedding_schema) in extractor.outputs { + match embedding_schema { + ExtractorOutputSchema::Embedding(embedding_schema) => { + let 
distance = embedding_schema.distance.to_string(); + output_schema.insert( + output_name, + internal_api::OutputSchema::Embedding(internal_api::EmbeddingSchema { + dim: embedding_schema.dim, + distance, + }), + ); + } + ExtractorOutputSchema::Metadata(schema) => { + output_schema + .insert(output_name, internal_api::OutputSchema::Attributes(schema)); + } + } + } + Self { + name: extractor.name, + description: extractor.description, + input_params: extractor.input_params, + outputs: output_schema, + input_mime_types: extractor.input_mime_types, + } + } +} + +impl TryFrom for ExtractorDescription { + type Error = anyhow::Error; + + fn try_from(value: indexify_coordinator::Extractor) -> Result { + let mut outputs = HashMap::new(); + for (k, v) in value.outputs.iter() { + let v: ExtractorOutputSchema = serde_json::from_str(v)?; + outputs.insert(k.clone(), v); + } + Ok(Self { + name: value.name, + description: value.description, + input_params: serde_json::from_str(&value.input_params)?, + outputs, + input_mime_types: value.input_mime_types, + }) + } +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct Executor { + pub id: String, + pub extractors: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct ListExecutorsResponse { + pub executors: Vec, +} + +#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] +pub struct ListExtractorsResponse { + pub extractors: Vec, +} + +#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] +pub struct TextAdditionResponse {} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct Index { + pub name: String, + pub schema: ExtractorOutputSchema, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct ListIndexesResponse { + pub indexes: Vec, +} + +#[derive(Debug, Serialize, Deserialize, IntoParams, ToSchema)] +pub struct SearchRequest { + pub index: String, + pub query: String, + pub k: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub 
struct ExtractedMetadata { + pub id: String, + pub content_id: String, + pub metadata: serde_json::Value, + pub extractor_name: String, +} + + +#[derive(Debug, Serialize, Deserialize, ToSchema, PartialEq, Clone)] +pub struct ListContentFilters { + #[serde( + deserialize_with = "api_utils::deserialize_none_to_empty_string", + default + )] + pub source: String, + #[serde( + deserialize_with = "api_utils::deserialize_none_to_empty_string", + default + )] + pub parent_id: String, + #[serde(default, deserialize_with = "api_utils::deserialize_labels_eq_filter")] + pub labels_eq: Option>, +} + +#[derive(Debug, Serialize, Deserialize, IntoParams, ToSchema)] +pub struct MetadataRequest { + pub content_id: Option, + pub index: String, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct MetadataResponse { + pub attributes: Vec, +} + +#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] +pub struct DocumentFragment { + pub content_id: String, + pub text: String, + pub confidence_score: f32, + pub labels: HashMap, +} + +#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] +pub struct IndexSearchResponse { + pub results: Vec, +} +pub struct IndexifyAPIError { + status_code: StatusCode, + message: String, +} + +impl IndexifyAPIError { + pub fn new(status_code: StatusCode, message: String) -> Self { + Self { + status_code, + message, + } + } +} + +impl IntoResponse for IndexifyAPIError { + fn into_response(self) -> Response { + (self.status_code, self.message).into_response() + } +} + +#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] +pub struct ListContentResponse { + pub content_list: Vec, +} + +#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] +pub struct ContentMetadata { + pub id: String, + pub parent_id: String, + pub repository: String, + pub name: String, + pub content_type: String, + pub labels: HashMap, + pub storage_url: String, + pub created_at: i64, + pub source: String, +} + +#[derive(Debug, Serialize, Deserialize, 
Clone, EnumString, ToSchema)] +pub enum FeatureType { + #[strum(serialize = "embedding")] + Embedding, + #[strum(serialize = "metadata")] + Metadata, + #[strum(serialize = "unknown")] + Unknown, +} + +impl From for FeatureType { + fn from(feature_type: internal_api::FeatureType) -> Self { + match feature_type { + internal_api::FeatureType::Embedding => FeatureType::Embedding, + internal_api::FeatureType::Metadata => FeatureType::Metadata, + internal_api::FeatureType::Unknown => FeatureType::Unknown, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)] +pub struct Feature { + pub feature_type: FeatureType, + pub name: String, + pub data: serde_json::Value, +} + +impl From for Feature { + fn from(feature: internal_api::Feature) -> Self { + Self { + feature_type: feature.feature_type.into(), + name: feature.name, + data: feature.data, + } + } +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)] +pub struct Content { + pub content_type: String, + #[serde_as(as = "BytesOrString")] + pub bytes: Vec, + pub features: Vec, + pub labels: HashMap, +} + +impl From for Content { + fn from(content: internal_api::Content) -> Self { + let features = content.features.into_iter().map(|f| f.into()).collect(); + Self { + content_type: content.mime, + bytes: content.bytes, + features, + labels: content.labels, + } + } +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct ExtractRequest { + pub name: String, + pub content: Content, + pub input_params: Option, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct ExtractResponse { + pub content: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct WriteExtractedContent { + pub content_list: Vec, + pub task_id: String, + pub repository: String, + pub output_to_index_table_mapping: HashMap, + pub parent_content_id: String, + pub executor_id: String, + pub task_outcome: internal_api::TaskOutcome, + pub extractor_binding: String, +} + 
+#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct GetRawContentResponse { + pub content_list: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ListTasks { + pub repository: String, + pub extractor_binding: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ListTasksResponse { + pub tasks: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ListStateChanges { + pub start_at: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ListStateChangesResponse { + pub state_changes: Vec, +} diff --git a/crates/indexify_extractor/Cargo.toml b/crates/indexify_extractor/Cargo.toml new file mode 100644 index 000000000..8152927ab --- /dev/null +++ b/crates/indexify_extractor/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "indexify_extractor" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +crate-type = ["rlib"] + +[dependencies] +anyhow = { workspace = true } +askama = { workspace = true } +async-trait = { workspace = true } +bollard = { workspace = true } +pyo3 = { workspace = true } +indexify_internal_api = { workspace = true } +indexify_api = { workspace = true } +mime = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +tokio-stream = { workspace = true } +tracing = { workspace = true } \ No newline at end of file diff --git a/crates/indexify_extractor/askama.toml b/crates/indexify_extractor/askama.toml new file mode 100644 index 000000000..54c442dc9 --- /dev/null +++ b/crates/indexify_extractor/askama.toml @@ -0,0 +1,4 @@ +[general] +# Directories to search for templates, relative to the crate root. 
+dirs = ["../../dockerfiles", "../../extractor_template"] +whitespace = "preserve" diff --git a/src/extractor/extractor_runner.rs b/crates/indexify_extractor/src/extractor_runner.rs similarity index 96% rename from src/extractor/extractor_runner.rs rename to crates/indexify_extractor/src/extractor_runner.rs index e43686232..fbd32aef1 100644 --- a/src/extractor/extractor_runner.rs +++ b/crates/indexify_extractor/src/extractor_runner.rs @@ -4,10 +4,7 @@ use anyhow::{anyhow, Ok, Result}; use indexify_internal_api as internal_api; use super::ExtractorTS; -use crate::{ - api, - api::{ExtractorDescription, ExtractorOutputSchema, IndexDistance}, -}; +use indexify_api::{self as api, ExtractorDescription, ExtractorOutputSchema, IndexDistance}; #[derive(Debug)] pub struct ExtractorRunner { diff --git a/src/extractor/mod.rs b/crates/indexify_extractor/src/lib.rs similarity index 100% rename from src/extractor/mod.rs rename to crates/indexify_extractor/src/lib.rs diff --git a/src/extractor/py_extractors.rs b/crates/indexify_extractor/src/py_extractors.rs similarity index 100% rename from src/extractor/py_extractors.rs rename to crates/indexify_extractor/src/py_extractors.rs diff --git a/src/extractor/python_path.rs b/crates/indexify_extractor/src/python_path.rs similarity index 100% rename from src/extractor/python_path.rs rename to crates/indexify_extractor/src/python_path.rs diff --git a/src/extractor/scaffold.rs b/crates/indexify_extractor/src/scaffold.rs similarity index 100% rename from src/extractor/scaffold.rs rename to crates/indexify_extractor/src/scaffold.rs diff --git a/src/api.rs b/src/api.rs index 973018bf8..1039c0ad8 100644 --- a/src/api.rs +++ b/src/api.rs @@ -1,487 +1,2 @@ -use std::collections::HashMap; - -use anyhow::Result; -use axum::{ - http::StatusCode, - response::{IntoResponse, Response}, -}; -use indexify_internal_api as internal_api; -use indexify_proto::indexify_coordinator; -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, 
BytesOrString}; -use smart_default::SmartDefault; -use strum::{Display, EnumString}; -use utoipa::{IntoParams, ToSchema}; - -use crate::{api_utils, metadata_index, vectordbs}; - -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct ExtractorBinding { - pub extractor: String, - pub name: String, - #[serde(default, deserialize_with = "api_utils::deserialize_labels_eq_filter")] - pub filters_eq: Option>, - pub input_params: Option, - pub content_source: Option, -} - -impl From for indexify_coordinator::ExtractorBinding { - fn from(value: ExtractorBinding) -> Self { - Self { - extractor: value.extractor, - name: value.name, - filters: value.filters_eq.unwrap_or_default(), - input_params: value - .input_params - .map(|v| v.to_string()) - .unwrap_or("{}".to_string()), - content_source: value.content_source.unwrap_or("ingestion".to_string()), - } - } -} - -#[derive(Default, Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct DataRepository { - pub name: String, - pub extractor_bindings: Vec, -} - -impl TryFrom for DataRepository { - type Error = anyhow::Error; - - fn try_from(value: indexify_coordinator::Repository) -> Result { - let mut extractor_bindings = Vec::new(); - for binding in value.bindings { - extractor_bindings.push(ExtractorBinding { - extractor: binding.extractor, - name: binding.name, - filters_eq: Some(binding.filters), - input_params: Some(serde_json::from_str(&binding.input_params)?), - content_source: Some(binding.content_source), - }); - } - Ok(Self { - name: value.name, - extractor_bindings, - }) - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, SmartDefault, ToSchema)] -pub struct CreateRepository { - pub name: String, - pub extractor_bindings: Vec, - pub labels: HashMap, -} - -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct CreateRepositoryResponse {} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct GetRepositoryResponse { - pub repository: DataRepository, -} - -#[derive(Debug, 
Clone, Serialize, Deserialize, ToSchema)] -pub struct ListRepositoriesResponse { - pub repositories: Vec, -} - -#[derive(Display, EnumString, Debug, Serialize, Deserialize, Clone, Default, ToSchema)] -#[serde(rename = "distance")] -pub enum IndexDistance { - #[serde(rename = "dot")] - #[strum(serialize = "dot")] - #[default] - Dot, - - #[serde(rename = "cosine")] - #[strum(serialize = "cosine")] - Cosine, - - #[serde(rename = "euclidean")] - #[strum(serialize = "euclidean")] - Euclidean, -} - -impl From for vectordbs::IndexDistance { - fn from(value: IndexDistance) -> Self { - match value { - IndexDistance::Dot => vectordbs::IndexDistance::Dot, - IndexDistance::Cosine => vectordbs::IndexDistance::Cosine, - IndexDistance::Euclidean => vectordbs::IndexDistance::Euclidean, - } - } -} - -impl From for IndexDistance { - fn from(val: vectordbs::IndexDistance) -> Self { - match val { - vectordbs::IndexDistance::Dot => IndexDistance::Dot, - vectordbs::IndexDistance::Cosine => IndexDistance::Cosine, - vectordbs::IndexDistance::Euclidean => IndexDistance::Euclidean, - } - } -} - -/// Request payload for creating a new vector index. 
-#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)] -pub struct ExtractorBindRequest { - #[serde(flatten)] - pub extractor_binding: ExtractorBinding, -} - -#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] -pub struct ExtractorBindResponse { - #[serde(default)] - pub index_names: Vec, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct Text { - pub text: String, - #[serde(default)] - pub labels: HashMap, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct TextAddRequest { - pub documents: Vec, - pub sync: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct RunExtractorsResponse {} - -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct EmbeddingSchema { - pub dim: usize, - pub distance: IndexDistance, -} - -#[derive(Debug, Clone, Serialize, Deserialize, Display, ToSchema)] -#[serde(untagged)] -pub enum ExtractorOutputSchema { - #[serde(rename = "embedding")] - Embedding(EmbeddingSchema), - #[serde(rename = "metadata")] - Metadata(serde_json::Value), -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct ExtractorDescription { - pub name: String, - pub input_mime_types: Vec, - pub description: String, - pub input_params: serde_json::Value, - pub outputs: HashMap, -} - -impl From for indexify_coordinator::Extractor { - fn from(value: ExtractorDescription) -> Self { - let outputs = value - .outputs - .into_iter() - .map(|(k, v)| (k, v.to_string())) - .collect(); - Self { - name: value.name, - description: value.description, - input_params: value.input_params.to_string(), - outputs, - input_mime_types: value.input_mime_types, - } - } -} - -impl From for internal_api::ExtractorDescription { - fn from(extractor: ExtractorDescription) -> internal_api::ExtractorDescription { - let mut output_schema = HashMap::new(); - for (output_name, embedding_schema) in extractor.outputs { - match embedding_schema { - ExtractorOutputSchema::Embedding(embedding_schema) => { - let 
distance = embedding_schema.distance.to_string(); - output_schema.insert( - output_name, - internal_api::OutputSchema::Embedding(internal_api::EmbeddingSchema { - dim: embedding_schema.dim, - distance, - }), - ); - } - ExtractorOutputSchema::Metadata(schema) => { - output_schema - .insert(output_name, internal_api::OutputSchema::Attributes(schema)); - } - } - } - Self { - name: extractor.name, - description: extractor.description, - input_params: extractor.input_params, - outputs: output_schema, - input_mime_types: extractor.input_mime_types, - } - } -} - -impl TryFrom for ExtractorDescription { - type Error = anyhow::Error; - - fn try_from(value: indexify_coordinator::Extractor) -> Result { - let mut outputs = HashMap::new(); - for (k, v) in value.outputs.iter() { - let v: ExtractorOutputSchema = serde_json::from_str(v)?; - outputs.insert(k.clone(), v); - } - Ok(Self { - name: value.name, - description: value.description, - input_params: serde_json::from_str(&value.input_params).unwrap(), - outputs, - input_mime_types: value.input_mime_types, - }) - } -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct Executor { - pub id: String, - pub extractors: Vec, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct ListExecutorsResponse { - pub executors: Vec, -} - -#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] -pub struct ListExtractorsResponse { - pub extractors: Vec, -} - -#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] -pub struct TextAdditionResponse {} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct Index { - pub name: String, - pub schema: ExtractorOutputSchema, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct ListIndexesResponse { - pub indexes: Vec, -} - -#[derive(Debug, Serialize, Deserialize, IntoParams, ToSchema)] -pub struct SearchRequest { - pub index: String, - pub query: String, - pub k: Option, -} - -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub 
struct ExtractedMetadata { - pub id: String, - pub content_id: String, - pub metadata: serde_json::Value, - pub extractor_name: String, -} - -impl From for ExtractedMetadata { - fn from(value: metadata_index::ExtractedMetadata) -> Self { - Self { - id: value.id, - content_id: value.content_id, - metadata: value.metadata, - extractor_name: value.extractor_name, - } - } -} - -#[derive(Debug, Serialize, Deserialize, ToSchema, PartialEq, Clone)] -pub struct ListContentFilters { - #[serde( - deserialize_with = "api_utils::deserialize_none_to_empty_string", - default - )] - pub source: String, - #[serde( - deserialize_with = "api_utils::deserialize_none_to_empty_string", - default - )] - pub parent_id: String, - #[serde(default, deserialize_with = "api_utils::deserialize_labels_eq_filter")] - pub labels_eq: Option>, -} - -#[derive(Debug, Serialize, Deserialize, IntoParams, ToSchema)] -pub struct MetadataRequest { - pub content_id: Option, - pub index: String, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct MetadataResponse { - pub attributes: Vec, -} - -#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] -pub struct DocumentFragment { - pub content_id: String, - pub text: String, - pub confidence_score: f32, - pub labels: HashMap, -} - -#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] -pub struct IndexSearchResponse { - pub results: Vec, -} -pub struct IndexifyAPIError { - status_code: StatusCode, - message: String, -} - -impl IndexifyAPIError { - pub fn new(status_code: StatusCode, message: String) -> Self { - Self { - status_code, - message, - } - } -} - -impl IntoResponse for IndexifyAPIError { - fn into_response(self) -> Response { - (self.status_code, self.message).into_response() - } -} - -#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] -pub struct ListContentResponse { - pub content_list: Vec, -} - -#[derive(Debug, Serialize, Deserialize, Default, ToSchema)] -pub struct ContentMetadata { - pub id: String, - pub 
parent_id: String, - pub repository: String, - pub name: String, - pub content_type: String, - pub labels: HashMap, - pub storage_url: String, - pub created_at: i64, - pub source: String, -} - -#[derive(Debug, Serialize, Deserialize, Clone, EnumString, ToSchema)] -pub enum FeatureType { - #[strum(serialize = "embedding")] - Embedding, - #[strum(serialize = "metadata")] - Metadata, - #[strum(serialize = "unknown")] - Unknown, -} - -impl From for FeatureType { - fn from(feature_type: internal_api::FeatureType) -> Self { - match feature_type { - internal_api::FeatureType::Embedding => FeatureType::Embedding, - internal_api::FeatureType::Metadata => FeatureType::Metadata, - internal_api::FeatureType::Unknown => FeatureType::Unknown, - } - } -} - -#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)] -pub struct Feature { - pub feature_type: FeatureType, - pub name: String, - pub data: serde_json::Value, -} - -impl From for Feature { - fn from(feature: internal_api::Feature) -> Self { - Self { - feature_type: feature.feature_type.into(), - name: feature.name, - data: feature.data, - } - } -} - -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)] -pub struct Content { - pub content_type: String, - #[serde_as(as = "BytesOrString")] - pub bytes: Vec, - pub features: Vec, - pub labels: HashMap, -} - -impl From for Content { - fn from(content: internal_api::Content) -> Self { - let features = content.features.into_iter().map(|f| f.into()).collect(); - Self { - content_type: content.mime, - bytes: content.bytes, - features, - labels: content.labels, - } - } -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct ExtractRequest { - pub name: String, - pub content: Content, - pub input_params: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct ExtractResponse { - pub content: Vec, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct WriteExtractedContent { - pub content_list: Vec, - pub task_id: 
String, - pub repository: String, - pub output_to_index_table_mapping: HashMap, - pub parent_content_id: String, - pub executor_id: String, - pub task_outcome: internal_api::TaskOutcome, - pub extractor_binding: String, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct GetRawContentResponse { - pub content_list: Vec, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct ListTasks { - pub repository: String, - pub extractor_binding: Option, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct ListTasksResponse { - pub tasks: Vec, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct ListStateChanges { - pub start_at: Option, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct ListStateChangesResponse { - pub state_changes: Vec, -} +/// Moved to crates/indexify_api +pub use indexify_api::*; \ No newline at end of file diff --git a/src/data_repository_manager.rs b/src/data_repository_manager.rs index 9189b2236..1d295a523 100644 --- a/src/data_repository_manager.rs +++ b/src/data_repository_manager.rs @@ -63,7 +63,7 @@ impl DataRepositoryManager { } #[tracing::instrument] - pub async fn list_repositories(&self) -> Result> { + pub async fn list_repositories(&self) -> Result> { let req = indexify_coordinator::ListRepositoriesRequest {}; let response = self .coordinator_client @@ -74,7 +74,7 @@ impl DataRepositoryManager { let repositories = response.into_inner().repositories; let data_respoistories = repositories .into_iter() - .map(|r| api::DataRepository { + .map(|r| indexify_api::DataRepository { name: r.name, extractor_bindings: Vec::new(), }) diff --git a/src/extractor.rs b/src/extractor.rs new file mode 100644 index 000000000..58e5bd91c --- /dev/null +++ b/src/extractor.rs @@ -0,0 +1,2 @@ +/// Moved to crates/indexify_extractor +pub use indexify_extractor::*; \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 8cda93d6c..fb8a1a56f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -17,7 +17,6 @@ pub mod 
state; pub mod task_allocator; mod api; -mod api_utils; mod blob_storage; mod caching; mod cmd; diff --git a/src/metadata_index.rs b/src/metadata_index.rs index a89b93328..cf79a8056 100644 --- a/src/metadata_index.rs +++ b/src/metadata_index.rs @@ -11,6 +11,7 @@ use serde::{Deserialize, Serialize}; use sqlx::{postgres::PgPoolOptions, Pool, Postgres, Row}; use crate::{ + api, coordinator_client::CoordinatorClient, grpc_helper::GrpcHelper, utils::{timestamp_secs, PostgresIndexName}, @@ -182,3 +183,14 @@ impl MetadataIndexManager { Ok(extracted_attributes) } } + +impl From for api::ExtractedMetadata { + fn from(value: ExtractedMetadata) -> Self { + Self { + id: value.id, + content_id: value.content_id, + metadata: value.metadata, + extractor_name: value.extractor_name, + } + } +} \ No newline at end of file diff --git a/src/text_splitters/mod.rs b/src/text_splitters/mod.rs index dd5a97d10..56571a1cb 100644 --- a/src/text_splitters/mod.rs +++ b/src/text_splitters/mod.rs @@ -253,3 +253,23 @@ mod tests { assert_eq!(chunks1.len(), 1); } } + +impl From for IndexDistance { + fn from(value: IndexDistance) -> Self { + match value { + crate::api::IndexDistance::Dot => IndexDistance::Dot, + crate::api::IndexDistance::Cosine => IndexDistance::Cosine, + crate::api::IndexDistance::Euclidean => IndexDistance::Euclidean, + } + } +} + +impl From for crate::api::IndexDistance { + fn from(val: IndexDistance) -> Self { + match val { + IndexDistance::Dot => crate::api::IndexDistance::Dot, + IndexDistance::Cosine => crate::api::IndexDistance::Cosine, + IndexDistance::Euclidean => crate::api::IndexDistance::Euclidean, + } + } +}