diff --git a/terraform/bedrock-knowledge-base/README.md b/terraform/bedrock-knowledge-base/README.md
new file mode 100644
index 0000000..3a76f79
--- /dev/null
+++ b/terraform/bedrock-knowledge-base/README.md
@@ -0,0 +1,66 @@
+# AWS Bedrock Knowledge Base
+
+Production-ready Terraform module for creating Amazon Bedrock Knowledge Bases with vector database integration for Retrieval-Augmented Generation (RAG).
+
+## Features
+
+- **Vector Knowledge Bases**: Enable RAG with foundation models
+- **Multiple Storage Backends**: OpenSearch Serverless, RDS (Aurora PostgreSQL), or Pinecone
+- **S3 Data Sources**: Automatic document ingestion from S3
+- **Chunking Strategies**: Fixed-size or no chunking for documents
+- **Embedding Models**: Amazon Titan, Cohere, and more
+- **IAM Management**: Automatic role and policy creation
+- **Document Filtering**: S3 inclusion prefixes for selective indexing
+- **Comprehensive Outputs**: Knowledge base IDs, ARNs, and ingestion examples
+
+## Quick Start
+
+```hcl
+module "knowledge_base" {
+  source = "github.com/llamandcoco/infra-modules//terraform/bedrock-knowledge-base?ref=<tag-or-commit-sha>"
+
+  name                = "product-docs-kb"
+  embedding_model_arn = "arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-embed-text-v1"
+
+  opensearch_serverless_configuration = {
+    collection_arn    = aws_opensearchserverless_collection.kb.arn
+    vector_index_name = "bedrock-knowledge-base-default-index"
+    metadata_field    = "AMAZON_BEDROCK_METADATA"
+    text_field        = "AMAZON_BEDROCK_TEXT_CHUNK"
+    vector_field      = "bedrock-knowledge-base-default-vector"
+  }
+
+  s3_data_source_bucket_arn = aws_s3_bucket.docs.arn
+}
+```
+
+## Examples
+
+Complete, tested configurations in [`tests/`](tests/):
+
+| Example | Directory |
+|---------|-----------|
+| Basic | [`tests/basic/main.tf`](tests/basic/main.tf) |
+| Advanced | [`tests/advanced/main.tf`](tests/advanced/main.tf) |
+
+**Usage:**
+```bash
+# View example
+cat tests/basic/main.tf
+
+# Copy and adapt
+cp -r tests/basic/ my-project/
+```
+
+## Testing
+
+```bash
+cd tests/basic && terraform init && terraform plan
+```
+
+<details>
+<summary>Terraform Documentation</summary>
+
+<!-- BEGIN_TF_DOCS -->
+<!-- END_TF_DOCS -->
+</details>
diff --git a/terraform/bedrock-knowledge-base/main.tf b/terraform/bedrock-knowledge-base/main.tf
new file mode 100644
index 0000000..42dd71f
--- /dev/null
+++ b/terraform/bedrock-knowledge-base/main.tf
@@ -0,0 +1,273 @@
+terraform {
+  required_version = ">= 1.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Local Variables
+# -----------------------------------------------------------------------------
+
+locals {
+  # Common tags to apply to all resources
+  common_tags = merge(
+    var.tags,
+    {
+      ManagedBy = "Terraform"
+      Module    = "bedrock-knowledge-base"
+    }
+  )
+
+  # Determine storage configuration type
+  storage_type = var.opensearch_serverless_configuration != null ? "OPENSEARCH_SERVERLESS" : (
+    var.rds_configuration != null ? "RDS" : (
+      var.pinecone_configuration != null ? "PINECONE" : "OPENSEARCH_SERVERLESS"
+    )
+  )
+}
+
+# -----------------------------------------------------------------------------
+# Data Sources
+# Region and account of the configured provider; referenced by outputs.tf
+# -----------------------------------------------------------------------------
+data "aws_region" "current" {}
+
+data "aws_caller_identity" "current" {}
+
+# -----------------------------------------------------------------------------
+# IAM Role for Knowledge Base
+# Allows Bedrock to access the data source and vector database
+# -----------------------------------------------------------------------------
+resource "aws_iam_role" "knowledge_base" {
+  name        = var.kb_role_name != null ? var.kb_role_name : "${var.name}-kb-role"
+  description = "IAM role for Bedrock Knowledge Base ${var.name}"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Principal = {
+          Service = "bedrock.amazonaws.com"
+        }
+        Action = "sts:AssumeRole"
+      }
+    ]
+  })
+
+  tags = merge(
+    local.common_tags,
+    {
+      Name = var.kb_role_name != null ? var.kb_role_name : "${var.name}-kb-role"
+    }
+  )
+}
+
+# -----------------------------------------------------------------------------
+# IAM Policy for Foundation Model Access (for embeddings)
+# -----------------------------------------------------------------------------
+resource "aws_iam_role_policy" "bedrock_model" {
+  name = "bedrock-model-policy"
+  role = aws_iam_role.knowledge_base.id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Action = [
+          "bedrock:InvokeModel"
+        ]
+        Resource = var.embedding_model_arn
+      }
+    ]
+  })
+}
+
+# -----------------------------------------------------------------------------
+# IAM Policy for S3 Data Source Access
+# -----------------------------------------------------------------------------
+resource "aws_iam_role_policy" "s3_data_source" {
+  count = var.s3_data_source_bucket_arn != null ? 1 : 0
+
+  name = "s3-data-source-policy"
+  role = aws_iam_role.knowledge_base.id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Action = [
+          "s3:GetObject",
+          "s3:ListBucket"
+        ]
+        Resource = [
+          var.s3_data_source_bucket_arn,
+          "${var.s3_data_source_bucket_arn}/*"
+        ]
+      }
+    ]
+  })
+}
+
+# -----------------------------------------------------------------------------
+# IAM Policy for OpenSearch Serverless Access
+# -----------------------------------------------------------------------------
+resource "aws_iam_role_policy" "opensearch" {
+  count = var.opensearch_serverless_configuration != null ? 1 : 0
+
+  name = "opensearch-policy"
+  role = aws_iam_role.knowledge_base.id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Action = [
+          "aoss:APIAccessAll"
+        ]
+        Resource = var.opensearch_serverless_configuration.collection_arn
+      }
+    ]
+  })
+}
+
+# -----------------------------------------------------------------------------
+# Bedrock Knowledge Base
+# -----------------------------------------------------------------------------
+resource "aws_bedrockagent_knowledge_base" "this" {
+  name        = var.name
+  description = var.description
+  role_arn    = aws_iam_role.knowledge_base.arn
+
+  knowledge_base_configuration {
+    type = "VECTOR"
+
+    vector_knowledge_base_configuration {
+      embedding_model_arn = var.embedding_model_arn
+    }
+  }
+
+  storage_configuration {
+    type = local.storage_type
+
+    # OpenSearch Serverless configuration
+    dynamic "opensearch_serverless_configuration" {
+      for_each = var.opensearch_serverless_configuration != null ? [var.opensearch_serverless_configuration] : []
+
+      content {
+        collection_arn    = opensearch_serverless_configuration.value.collection_arn
+        vector_index_name = opensearch_serverless_configuration.value.vector_index_name
+
+        field_mapping {
+          metadata_field = opensearch_serverless_configuration.value.metadata_field
+          text_field     = opensearch_serverless_configuration.value.text_field
+          vector_field   = opensearch_serverless_configuration.value.vector_field
+        }
+      }
+    }
+
+    # RDS configuration
+    dynamic "rds_configuration" {
+      for_each = var.rds_configuration != null ? [var.rds_configuration] : []
+
+      content {
+        credentials_secret_arn = rds_configuration.value.credentials_secret_arn
+        database_name          = rds_configuration.value.database_name
+        resource_arn           = rds_configuration.value.resource_arn
+        table_name             = rds_configuration.value.table_name
+
+        field_mapping {
+          metadata_field    = rds_configuration.value.metadata_field
+          primary_key_field = rds_configuration.value.primary_key_field
+          text_field        = rds_configuration.value.text_field
+          vector_field      = rds_configuration.value.vector_field
+        }
+      }
+    }
+
+    # Pinecone configuration
+    dynamic "pinecone_configuration" {
+      for_each = var.pinecone_configuration != null ? [var.pinecone_configuration] : []
+
+      content {
+        connection_string      = pinecone_configuration.value.connection_string
+        credentials_secret_arn = pinecone_configuration.value.credentials_secret_arn
+        namespace              = pinecone_configuration.value.namespace
+
+        field_mapping {
+          metadata_field = pinecone_configuration.value.metadata_field
+          text_field     = pinecone_configuration.value.text_field
+        }
+      }
+    }
+  }
+
+  tags = merge(
+    local.common_tags,
+    {
+      Name = var.name
+    }
+  )
+
+  depends_on = [
+    aws_iam_role_policy.bedrock_model,
+    aws_iam_role_policy.s3_data_source,
+    aws_iam_role_policy.opensearch
+  ]
+}
+
+# -----------------------------------------------------------------------------
+# Data Source (S3)
+# Creates a data source pointing to S3 bucket with documents
+# -----------------------------------------------------------------------------
+resource "aws_bedrockagent_data_source" "this" {
+  count = var.s3_data_source_bucket_arn != null ? 1 : 0
+
+  knowledge_base_id = aws_bedrockagent_knowledge_base.this.id
+  name              = var.data_source_name != null ? var.data_source_name : "${var.name}-s3-data-source"
+  description       = var.data_source_description
+
+  data_source_configuration {
+    type = "S3"
+
+    s3_configuration {
+      bucket_arn = var.s3_data_source_bucket_arn
+
+      # Both are plain optional arguments of s3_configuration, not nested
+      # blocks, so they are assigned directly; null leaves them unset.
+      bucket_owner_account_id = var.s3_bucket_owner_account_id
+      inclusion_prefixes      = length(var.s3_inclusion_prefixes) > 0 ? var.s3_inclusion_prefixes : null
+    }
+  }
+
+  # Vector ingestion configuration
+  dynamic "vector_ingestion_configuration" {
+    for_each = var.chunking_strategy != null ? [1] : []
+
+    content {
+      chunking_configuration {
+        chunking_strategy = var.chunking_strategy
+
+        # Fixed size chunking
+        dynamic "fixed_size_chunking_configuration" {
+          for_each = var.chunking_strategy == "FIXED_SIZE" ? [1] : []
+
+          content {
+            max_tokens         = var.fixed_size_max_tokens
+            overlap_percentage = var.fixed_size_overlap_percentage
+          }
+        }
+      }
+    }
+  }
+
+  data_deletion_policy = var.data_deletion_policy
+}
diff --git a/terraform/bedrock-knowledge-base/outputs.tf b/terraform/bedrock-knowledge-base/outputs.tf
new file mode 100644
index 0000000..0e9e1d8
--- /dev/null
+++ b/terraform/bedrock-knowledge-base/outputs.tf
@@ -0,0 +1,166 @@
+# -----------------------------------------------------------------------------
+# Knowledge Base Outputs
+# -----------------------------------------------------------------------------
+
+output "knowledge_base_id" {
+  description = "The unique identifier of the Bedrock knowledge base. Use this to associate with agents or query directly."
+  value       = aws_bedrockagent_knowledge_base.this.id
+}
+
+output "knowledge_base_arn" {
+  description = "The ARN of the Bedrock knowledge base. Use this for IAM policies and cross-account access."
+  value       = aws_bedrockagent_knowledge_base.this.arn
+}
+
+output "knowledge_base_name" {
+  description = "The name of the Bedrock knowledge base."
+  value       = aws_bedrockagent_knowledge_base.this.name
+}
+
+# -----------------------------------------------------------------------------
+# Data Source Outputs
+# -----------------------------------------------------------------------------
+
+output "data_source_id" {
+  description = "The ID of the data source. Use this to trigger ingestion jobs."
+  value       = var.s3_data_source_bucket_arn != null ? aws_bedrockagent_data_source.this[0].data_source_id : null
+}
+
+output "data_source_name" {
+  description = "The name of the data source."
+  value       = var.s3_data_source_bucket_arn != null ? aws_bedrockagent_data_source.this[0].name : null
+}
+
+# -----------------------------------------------------------------------------
+# IAM Role Outputs
+# -----------------------------------------------------------------------------
+
+output "kb_role_arn" {
+  description = "The ARN of the IAM role used by the knowledge base."
+  value       = aws_iam_role.knowledge_base.arn
+}
+
+output "kb_role_name" {
+  description = "The name of the IAM role used by the knowledge base."
+  value       = aws_iam_role.knowledge_base.name
+}
+
+output "kb_role_id" {
+  description = "The unique ID of the IAM role."
+  value       = aws_iam_role.knowledge_base.id
+}
+
+# -----------------------------------------------------------------------------
+# Configuration Outputs
+# -----------------------------------------------------------------------------
+
+output "embedding_model_arn" {
+  description = "The ARN of the embedding model used by the knowledge base."
+  value       = var.embedding_model_arn
+}
+
+output "storage_type" {
+  description = "The type of vector database storage used (OPENSEARCH_SERVERLESS, RDS, or PINECONE)."
+  value       = local.storage_type
+}
+
+output "chunking_strategy" {
+  description = "The chunking strategy used for document processing."
+  value       = var.chunking_strategy
+}
+
+# -----------------------------------------------------------------------------
+# Region and Account Outputs
+# -----------------------------------------------------------------------------
+
+output "region" {
+  description = "The AWS region where the knowledge base is deployed."
+  value       = data.aws_region.current.name
+}
+
+output "account_id" {
+  description = "The AWS account ID where the knowledge base is deployed."
+  value       = data.aws_caller_identity.current.account_id
+}
+
+# -----------------------------------------------------------------------------
+# Common Embedding Model ARNs
+# -----------------------------------------------------------------------------
+
+output "titan_embed_text_v1_arn" {
+  description = "ARN for Amazon Titan Text Embeddings v1 model."
+  value       = "arn:aws:bedrock:${data.aws_region.current.name}::foundation-model/amazon.titan-embed-text-v1"
+}
+
+output "titan_embed_text_v2_arn" {
+  description = "ARN for Amazon Titan Text Embeddings v2 model."
+  value       = "arn:aws:bedrock:${data.aws_region.current.name}::foundation-model/amazon.titan-embed-text-v2:0"
+}
+
+output "cohere_embed_english_v3_arn" {
+  description = "ARN for Cohere Embed English v3 model."
+  value       = "arn:aws:bedrock:${data.aws_region.current.name}::foundation-model/cohere.embed-english-v3"
+}
+
+output "cohere_embed_multilingual_v3_arn" {
+  description = "ARN for Cohere Embed Multilingual v3 model."
+  value       = "arn:aws:bedrock:${data.aws_region.current.name}::foundation-model/cohere.embed-multilingual-v3"
+}
+
+# -----------------------------------------------------------------------------
+# Ingestion Job Commands
+# -----------------------------------------------------------------------------
+
+output "start_ingestion_job_cli_example" {
+  description = "AWS CLI example command to start an ingestion job for the data source."
+  value = var.s3_data_source_bucket_arn != null ? (
+    "aws bedrock-agent start-ingestion-job --knowledge-base-id ${aws_bedrockagent_knowledge_base.this.id} --data-source-id ${aws_bedrockagent_data_source.this[0].data_source_id}"
+  ) : "No data source configured"
+}
+
+output "boto3_ingestion_example" {
+  description = "Python boto3 example code to start an ingestion job."
+  # The heredoc must be the last operand: the expression ends on the line after EOT.
+  value = var.s3_data_source_bucket_arn == null ? "No data source configured" : <<-EOT
+    import boto3
+
+    bedrock_agent = boto3.client('bedrock-agent')
+
+    response = bedrock_agent.start_ingestion_job(
+        knowledgeBaseId='${aws_bedrockagent_knowledge_base.this.id}',
+        dataSourceId='${aws_bedrockagent_data_source.this[0].data_source_id}'
+    )
+
+    print(f"Ingestion job started: {response['ingestionJob']['ingestionJobId']}")
+  EOT
+}
+
+output "retrieve_query_cli_example" {
+  description = "AWS CLI example command to query the knowledge base."
+  value       = <<-EOT
+    aws bedrock-agent-runtime retrieve \
+      --knowledge-base-id ${aws_bedrockagent_knowledge_base.this.id} \
+      --retrieval-query text="Your search query here"
+  EOT
+}
+
+output "boto3_retrieve_example" {
+  description = "Python boto3 example code to query the knowledge base."
+  value       = <<-EOT
+    import boto3
+
+    bedrock_agent_runtime = boto3.client('bedrock-agent-runtime')
+
+    response = bedrock_agent_runtime.retrieve(
+        knowledgeBaseId='${aws_bedrockagent_knowledge_base.this.id}',
+        retrievalQuery={
+            'text': 'Your search query here'
+        }
+    )
+
+    for result in response['retrievalResults']:
+        print(f"Score: {result['score']}")
+        print(f"Content: {result['content']['text']}")
+        print("---")
+  EOT
+}
diff --git a/terraform/bedrock-knowledge-base/tests/advanced/main.tf b/terraform/bedrock-knowledge-base/tests/advanced/main.tf
new file mode 100644
index 0000000..5ae8c35
--- /dev/null
+++ b/terraform/bedrock-knowledge-base/tests/advanced/main.tf
@@ -0,0 +1,296 @@
+terraform {
+  required_version = ">= 1.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Advanced Bedrock Knowledge Base Test
+# Tests comprehensive knowledge base configuration with:
+# - OpenSearch Serverless with custom field mappings
+# - S3 data source with inclusion prefixes
+# - Custom chunking configuration
+# - Multiple tags and metadata
+# -----------------------------------------------------------------------------
+
+provider "aws" {
+  region = "us-west-2"
+
+  # Mock configuration for testing - no real AWS credentials needed for plan
+  skip_credentials_validation = true
+  skip_metadata_api_check     = true
+  skip_requesting_account_id  = true
+
+  endpoints {
+    bedrock      = "http://localhost:4566"
+    bedrockagent = "http://localhost:4566"
+    iam          = "http://localhost:4566"
+    sts          = "http://localhost:4566"
+    aoss         = "http://localhost:4566"
+    s3           = "http://localhost:4566"
+    kms          = "http://localhost:4566"
+  }
+}
+
+# -----------------------------------------------------------------------------
+# S3 Bucket for Documents
+# -----------------------------------------------------------------------------
+
+resource "aws_s3_bucket" "documents" {
+  bucket = "advanced-kb-enterprise-docs"
+}
+
+# Enable object versioning on the documents bucket
+resource "aws_s3_bucket_versioning" "documents" {
+  bucket = aws_s3_bucket.documents.id
+
+  versioning_configuration {
+    status = "Enabled"
+  }
+}
+
+resource "aws_s3_bucket_server_side_encryption_configuration" "documents" {
+  bucket = aws_s3_bucket.documents.id
+
+  rule {
+    apply_server_side_encryption_by_default {
+      sse_algorithm = "AES256"
+    }
+  }
+}
+
+# -----------------------------------------------------------------------------
+# OpenSearch Serverless Collection
+# -----------------------------------------------------------------------------
+
+# Encryption policy for the collection
+resource "aws_opensearchserverless_security_policy" "encryption" {
+  name = "advanced-kb-encryption-policy"
+  type = "encryption"
+
+  policy = jsonencode({
+    Rules = [
+      {
+        Resource = [
+          "collection/advanced-kb-*"
+        ]
+        ResourceType = "collection"
+      }
+    ]
+    AWSOwnedKey = true
+  })
+}
+
+# Network policy for the collection
+resource "aws_opensearchserverless_security_policy" "network" {
+  name = "advanced-kb-network-policy"
+  type = "network"
+
+  policy = jsonencode([
+    {
+      Rules = [
+        {
+          Resource = [
+            "collection/advanced-kb-*"
+          ]
+          ResourceType = "collection"
+        }
+      ]
+      AllowFromPublic = true
+    }
+  ])
+}
+
+# Data access policy
+resource "aws_opensearchserverless_access_policy" "data" {
+  name = "advanced-kb-data-policy"
+  type = "data"
+
+  policy = jsonencode([
+    {
+      Rules = [
+        {
+          Resource = [
+            "collection/advanced-kb-*"
+          ]
+          Permission = [
+            "aoss:CreateCollectionItems",
+            "aoss:UpdateCollectionItems",
+            "aoss:DescribeCollectionItems"
+          ]
+          ResourceType = "collection"
+        },
+        {
+          Resource = [
+            "index/advanced-kb-*/*"
+          ]
+          Permission = [
+            "aoss:CreateIndex",
+            "aoss:UpdateIndex",
+            "aoss:DescribeIndex",
+            "aoss:ReadDocument",
+            "aoss:WriteDocument"
+          ]
+          ResourceType = "index"
+        }
+      ]
+      Principal = [
+        "*"
+      ]
+    }
+  ])
+}
+
+# OpenSearch Serverless collection
+resource "aws_opensearchserverless_collection" "kb" {
+  name = "advanced-kb-collection"
+  type = "VECTORSEARCH"
+
+  depends_on = [
+    aws_opensearchserverless_security_policy.encryption,
+    aws_opensearchserverless_security_policy.network
+  ]
+}
+
+# -----------------------------------------------------------------------------
+# Bedrock Knowledge Base with Advanced Configuration
+# -----------------------------------------------------------------------------
+
+module "bedrock_knowledge_base" {
+  source = "../.."
+
+  # Knowledge base configuration
+  name         = "advanced-enterprise-kb"
+  description  = "Advanced enterprise knowledge base with comprehensive documentation and policies"
+  kb_role_name = "advanced-enterprise-kb-role"
+
+  # Embedding model - Cohere Multilingual for better global support
+  embedding_model_arn = "arn:aws:bedrock:us-west-2::foundation-model/cohere.embed-multilingual-v3"
+
+  # OpenSearch Serverless storage with custom field mappings
+  opensearch_serverless_configuration = {
+    collection_arn    = aws_opensearchserverless_collection.kb.arn
+    vector_index_name = "enterprise-vector-index"
+    metadata_field    = "metadata"
+    text_field        = "text_content"
+    vector_field      = "embedding_vector"
+  }
+
+  # S3 data source with inclusion prefixes
+  s3_data_source_bucket_arn = aws_s3_bucket.documents.arn
+  data_source_name          = "enterprise-docs-source"
+  data_source_description   = "Enterprise documentation including product manuals, policies, and guides"
+
+  # Only include specific document types
+  s3_inclusion_prefixes = [
+    "docs/product-manuals/",
+    "docs/policies/",
+    "docs/guides/"
+  ]
+
+  # Custom chunking configuration for larger context
+  chunking_strategy             = "FIXED_SIZE"
+  fixed_size_max_tokens         = 512
+  fixed_size_overlap_percentage = 25
+
+  # Data deletion policy
+  data_deletion_policy = "DELETE"
+
+  tags = {
+    Environment = "production"
+    Purpose     = "advanced-kb-test"
+    Team        = "knowledge-management"
+    CostCenter  = "engineering"
+    Compliance  = "required"
+    DataClass   = "internal"
+  }
+
+  depends_on = [
+    aws_opensearchserverless_collection.kb,
+    aws_opensearchserverless_access_policy.data
+  ]
+}
+
+# -----------------------------------------------------------------------------
+# Outputs
+# -----------------------------------------------------------------------------
+
+output "knowledge_base_id" {
+  description = "ID of the knowledge base"
+  value       = module.bedrock_knowledge_base.knowledge_base_id
+}
+
+output "knowledge_base_arn" {
+  description = "ARN of the knowledge base"
+  value       = module.bedrock_knowledge_base.knowledge_base_arn
+}
+
+output "data_source_id" {
+  description = "ID of the data source"
+  value       = module.bedrock_knowledge_base.data_source_id
+}
+
+output "kb_role_arn" {
+  description = "ARN of the knowledge base IAM role"
+  value       = module.bedrock_knowledge_base.kb_role_arn
+}
+
+output "storage_type" {
+  description = "Vector database storage type"
+  value       = module.bedrock_knowledge_base.storage_type
+}
+
+output "chunking_strategy" {
+  description = "Document chunking strategy"
+  value       = module.bedrock_knowledge_base.chunking_strategy
+}
+
+output "opensearch_collection_endpoint" {
+  description = "OpenSearch Serverless collection endpoint"
+  value       = aws_opensearchserverless_collection.kb.collection_endpoint
+}
+
+output "opensearch_dashboard_endpoint" {
+  description = "OpenSearch Serverless dashboard endpoint"
+  value       = aws_opensearchserverless_collection.kb.dashboard_endpoint
+}
+
+output "start_ingestion_cli" {
+  description = "CLI command to start ingestion"
+  value       = module.bedrock_knowledge_base.start_ingestion_job_cli_example
+}
+
+output "boto3_ingestion_example" {
+  description = "Python boto3 code to start ingestion"
+  value       = module.bedrock_knowledge_base.boto3_ingestion_example
+}
+
+output "retrieve_query_cli" {
+  description = "CLI command to query the knowledge base"
+  value       = module.bedrock_knowledge_base.retrieve_query_cli_example
+}
+
+output "boto3_retrieve_example" {
+  description = "Python boto3 code to query the knowledge base"
+  value       = module.bedrock_knowledge_base.boto3_retrieve_example
+}
+
+output "s3_bucket_name" {
+  description = "Name of the S3 bucket for documents"
+  value       = aws_s3_bucket.documents.id
+}
+
+output "embedding_models" {
+  description = "Available embedding model ARNs"
+  value = {
+    titan_v1            = module.bedrock_knowledge_base.titan_embed_text_v1_arn
+    titan_v2            = module.bedrock_knowledge_base.titan_embed_text_v2_arn
+    cohere_english      = module.bedrock_knowledge_base.cohere_embed_english_v3_arn
+    cohere_multilingual = module.bedrock_knowledge_base.cohere_embed_multilingual_v3_arn
+  }
+}
diff --git a/terraform/bedrock-knowledge-base/tests/basic/main.tf b/terraform/bedrock-knowledge-base/tests/basic/main.tf
new file mode 100644
index 0000000..09ee95e
--- /dev/null
+++ b/terraform/bedrock-knowledge-base/tests/basic/main.tf
@@ -0,0 +1,113 @@
+terraform {
+  required_version = ">= 1.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Basic Bedrock Knowledge Base Test
+# Tests minimal knowledge base configuration with OpenSearch Serverless
+# Creates a knowledge base with S3 data source
+# -----------------------------------------------------------------------------
+
+provider "aws" {
+  region = "us-east-1"
+
+  # Mock configuration for testing - no real AWS credentials needed for plan
+  skip_credentials_validation = true
+  skip_metadata_api_check     = true
+  skip_requesting_account_id  = true
+
+  endpoints {
+    bedrock      = "http://localhost:4566"
+    bedrockagent = "http://localhost:4566"
+    iam          = "http://localhost:4566"
+    sts          = "http://localhost:4566"
+    aoss         = "http://localhost:4566"
+    s3           = "http://localhost:4566"
+  }
+}
+
+# Mock S3 bucket for documents
+resource "aws_s3_bucket" "documents" {
+  bucket = "basic-kb-documents-bucket"
+}
+
+# Mock OpenSearch Serverless collection
+resource "aws_opensearchserverless_collection" "kb" {
+  name = "basic-kb-collection"
+  type = "VECTORSEARCH"
+}
+
+# Create basic knowledge base
+module "bedrock_knowledge_base" {
+  source = "../.."
+
+  # Knowledge base configuration
+  name        = "basic-product-docs-kb"
+  description = "Basic knowledge base for product documentation"
+
+  # Embedding model - Amazon Titan
+  embedding_model_arn = "arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-embed-text-v1"
+
+  # OpenSearch Serverless storage
+  opensearch_serverless_configuration = {
+    collection_arn    = aws_opensearchserverless_collection.kb.arn
+    vector_index_name = "bedrock-knowledge-base-default-index"
+    metadata_field    = "AMAZON_BEDROCK_METADATA"
+    text_field        = "AMAZON_BEDROCK_TEXT_CHUNK"
+    vector_field      = "bedrock-knowledge-base-default-vector"
+  }
+
+  # S3 data source
+  s3_data_source_bucket_arn = aws_s3_bucket.documents.arn
+
+  # Use default chunking configuration
+  # - chunking_strategy: FIXED_SIZE
+  # - fixed_size_max_tokens: 300
+  # - fixed_size_overlap_percentage: 20
+
+  tags = {
+    Environment = "test"
+    Purpose     = "basic-kb-test"
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Outputs
+# -----------------------------------------------------------------------------
+
+output "knowledge_base_id" {
+  description = "ID of the knowledge base"
+  value       = module.bedrock_knowledge_base.knowledge_base_id
+}
+
+output "knowledge_base_arn" {
+  description = "ARN of the knowledge base"
+  value       = module.bedrock_knowledge_base.knowledge_base_arn
+}
+
+output "data_source_id" {
+  description = "ID of the data source"
+  value       = module.bedrock_knowledge_base.data_source_id
+}
+
+output "kb_role_arn" {
+  description = "ARN of the knowledge base IAM role"
+  value       = module.bedrock_knowledge_base.kb_role_arn
+}
+
+output "start_ingestion_cli" {
+  description = "CLI command to start ingestion"
+  value       = module.bedrock_knowledge_base.start_ingestion_job_cli_example
+}
+
+output "retrieve_query_cli" {
+  description = "CLI command to query the knowledge base"
+  value       = module.bedrock_knowledge_base.retrieve_query_cli_example
+}
diff --git a/terraform/bedrock-knowledge-base/variables.tf b/terraform/bedrock-knowledge-base/variables.tf
new file mode 100644
index 0000000..e38d00f
--- /dev/null
+++ b/terraform/bedrock-knowledge-base/variables.tf
@@ -0,0 +1,180 @@
+# -----------------------------------------------------------------------------
+# Required Variables
+# -----------------------------------------------------------------------------
+
+variable "name" {
+  description = "Name of the Bedrock knowledge base. This will be displayed in the AWS console and used in resource naming."
+  type        = string
+
+  validation {
+    condition     = length(var.name) > 0 && length(var.name) <= 100
+    error_message = "Knowledge base name must be between 1 and 100 characters."
+  }
+
+  validation {
+    condition     = can(regex("^[a-zA-Z0-9_-]+$", var.name))
+    error_message = "Knowledge base name must contain only alphanumeric characters, hyphens, and underscores."
+  }
+}
+
+variable "embedding_model_arn" {
+  description = "ARN of the foundation model to use for generating embeddings. Common models: 'arn:aws:bedrock:{region}::foundation-model/amazon.titan-embed-text-v1', 'arn:aws:bedrock:{region}::foundation-model/cohere.embed-english-v3', 'arn:aws:bedrock:{region}::foundation-model/cohere.embed-multilingual-v3'."
+  type        = string
+
+  validation {
+    condition     = length(var.embedding_model_arn) > 0
+    error_message = "Embedding model ARN must not be empty."
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Knowledge Base Configuration
+# -----------------------------------------------------------------------------
+
+variable "description" {
+  description = "Description of the knowledge base. Helps document the purpose and contents."
+  type        = string
+  default     = null
+}
+
+variable "kb_role_name" {
+  description = "Name of the IAM role for the knowledge base. If not specified, defaults to '{name}-kb-role'."
+  type        = string
+  default     = null
+}
+
+# -----------------------------------------------------------------------------
+# Storage Configuration Variables
+# Only one of these should be specified
+# -----------------------------------------------------------------------------
+
+variable "opensearch_serverless_configuration" {
+  description = "Configuration for OpenSearch Serverless as the vector database. This is the recommended option for most use cases."
+  type = object({
+    collection_arn    = string
+    vector_index_name = string
+    metadata_field    = string
+    text_field        = string
+    vector_field      = string
+  })
+  default = null
+}
+
+variable "rds_configuration" {
+  description = "Configuration for RDS (Aurora PostgreSQL with pgvector) as the vector database."
+  type = object({
+    credentials_secret_arn = string
+    database_name          = string
+    resource_arn           = string
+    table_name             = string
+    metadata_field         = string
+    primary_key_field      = string
+    text_field             = string
+    vector_field           = string
+  })
+  default = null
+}
+
+variable "pinecone_configuration" {
+  description = "Configuration for Pinecone as the vector database."
+  type = object({
+    connection_string      = string
+    credentials_secret_arn = string
+    namespace              = string
+    metadata_field         = string
+    text_field             = string
+  })
+  default = null
+}
+
+# -----------------------------------------------------------------------------
+# Data Source Configuration (S3)
+# -----------------------------------------------------------------------------
+
+variable "s3_data_source_bucket_arn" {
+  description = "ARN of the S3 bucket containing documents for the knowledge base. If specified, a data source will be created."
+  type        = string
+  default     = null
+}
+
+variable "data_source_name" {
+  description = "Name of the data source. If not specified, defaults to '{name}-s3-data-source'."
+  type        = string
+  default     = null
+}
+
+variable "data_source_description" {
+  description = "Description of the data source."
+  type        = string
+  default     = null
+}
+
+variable "s3_bucket_owner_account_id" {
+  description = "AWS account ID of the S3 bucket owner. Required if the bucket is in a different account."
+  type        = string
+  default     = null
+}
+
+variable "s3_inclusion_prefixes" {
+  description = "List of S3 prefixes to include when ingesting documents. Use this to limit which files are indexed."
+  type        = list(string)
+  default     = []
+}
+
+variable "data_deletion_policy" {
+  description = "Policy for deleting data from the knowledge base when the data source is deleted. Valid values: 'RETAIN', 'DELETE'."
+  type        = string
+  default     = "RETAIN"
+
+  validation {
+    condition     = contains(["RETAIN", "DELETE"], var.data_deletion_policy)
+    error_message = "Data deletion policy must be either 'RETAIN' or 'DELETE'."
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Chunking Configuration
+# -----------------------------------------------------------------------------
+
+variable "chunking_strategy" {
+  description = "Strategy for chunking documents. Valid values: 'FIXED_SIZE', 'NONE'. FIXED_SIZE splits documents into chunks, NONE keeps documents whole."
+  type        = string
+  default     = "FIXED_SIZE"
+
+  validation {
+    condition     = contains(["FIXED_SIZE", "NONE"], var.chunking_strategy)
+    error_message = "Chunking strategy must be either 'FIXED_SIZE' or 'NONE'."
+  }
+}
+
+variable "fixed_size_max_tokens" {
+  description = "Maximum number of tokens per chunk when using FIXED_SIZE chunking strategy. Valid range: 20-8192."
+  type        = number
+  default     = 300
+
+  validation {
+    condition     = var.fixed_size_max_tokens >= 20 && var.fixed_size_max_tokens <= 8192
+    error_message = "Max tokens must be between 20 and 8192."
+  }
+}
+
+variable "fixed_size_overlap_percentage" {
+  description = "Percentage of overlap between consecutive chunks. Valid range: 1-99. Higher values preserve more context but increase storage."
+  type        = number
+  default     = 20
+
+  validation {
+    condition     = var.fixed_size_overlap_percentage >= 1 && var.fixed_size_overlap_percentage <= 99
+    error_message = "Overlap percentage must be between 1 and 99."
+  }
+}
+
+# -----------------------------------------------------------------------------
+# General Variables
+# -----------------------------------------------------------------------------
+
+variable "tags" {
+  description = "A map of tags to add to all resources. Use this for cost allocation, resource organization, and governance."
+  type        = map(string)
+  default     = {}
+}