Skip to content

Commit

Permalink
fix: fix etl endpoint package
Browse files (browse the repository at this point in the history)
  • Loading branch information
IcyKallen committed Feb 28, 2025
1 parent 1f3d55d commit 96e97b7
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 27 deletions.
2 changes: 1 addition & 1 deletion source/infrastructure/lib/model/model-construct.ts
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ export class ModelConstruct extends NestedStack implements ModelConstructOutputs

private deployKnowledgeBaseEndpoint(props: ModelConstructProps) {
// Deploy Knowledge Base model
let knowledgeBaseModelName = "knowledge-base-model-20250228";
let knowledgeBaseModelName = "knowledge-base-model-0228";
let knowledgeBaseModelEcrRepository = props.config.knowledgeBase.knowledgeBaseType.intelliAgentKb.knowledgeBaseModel.ecrRepository;
let knowledgeBaseModelEcrImageTag = props.config.knowledgeBase.knowledgeBaseType.intelliAgentKb.knowledgeBaseModel.ecrImageTag;
let knowledgeBaseModelImageUrl = this.modelAccount + ".dkr.ecr." + this.modelRegion + this.modelImageUrlDomain + knowledgeBaseModelEcrRepository + ":" + knowledgeBaseModelEcrImageTag;
Expand Down
Binary file modified source/lambda/job/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl
Binary file not shown.
65 changes: 41 additions & 24 deletions source/lambda/job/dep/llm_bot_dep/loaders/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,23 @@ def invoke_etl_model(
portal_bucket_name: str,
mode: str = "ppstructure",
lang: str = "zh",
model_provider: str = "claude",
model_id: str = "anthropic/claude-3-5-sonnet-20240620",
api_secret_name: str = None,
api_url: str = None,
):
# Create a dictionary with all parameters
json_data = {
"s3_bucket": bucket,
"object_key": key,
"destination_bucket": res_bucket,
"portal_bucket": portal_bucket_name,
"mode": mode,
"lang": lang,
"model_provider": model_provider,
"model_id": model_id,
"api_secret_name": api_secret_name,
"api_url": api_url,
}

file_name = f"data_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}_{uuid.uuid4().hex}.json"
Expand Down Expand Up @@ -161,35 +170,43 @@ def process_pdf(s3, pdf: bytes, **kwargs):
doc.metadata["file_path"] = f"s3://{bucket}/{key}"
doc.metadata["file_type"] = "pdf"
else:
# Extract common parameters for ETL model invocation
etl_params = {
"s3_client": s3,
"smr_client": smr_client,
"etl_model_endpoint": etl_model_endpoint,
"bucket": bucket,
"key": key,
"res_bucket": res_bucket,
"portal_bucket_name": portal_bucket_name,
"mode": "ppstructure",
"model_provider": kwargs.get("model_provider"),
"model_id": kwargs.get("model_id"),
"api_secret_name": kwargs.get("api_secret_name"),
"api_url": kwargs.get("api_url"),
}

if document_language == "zh":
logger.info("Detected language is Chinese, using default PDF loader...")
markdown_prefix = invoke_etl_model(
s3,
smr_client,
etl_model_endpoint,
bucket,
key,
res_bucket,
portal_bucket_name,
mode="ppstructure",
lang="zh",
logger.info(
"Detected language is Chinese, using default PDF loader..."
)
# Only specify the language parameter that differs
etl_params["lang"] = "zh"
markdown_prefix = invoke_etl_model(**etl_params)
logger.info(
f"Markdown file path: s3://{res_bucket}/{markdown_prefix}"
)
logger.info(f"Markdown file path: s3://{res_bucket}/{markdown_prefix}")
content = load_content_from_s3(s3, res_bucket, markdown_prefix)
else:
logger.info("Detected language is English, using ETL model endpoint...")
markdown_prefix = invoke_etl_model(
s3,
smr_client,
etl_model_endpoint,
bucket,
key,
res_bucket,
portal_bucket_name,
mode="ppstructure",
lang="en",
logger.info(
"Detected language is English, using ETL model endpoint..."
)
# Only specify the language parameter that differs
etl_params["lang"] = "en"
markdown_prefix = invoke_etl_model(**etl_params)
logger.info(
f"Markdown file path: s3://{res_bucket}/{markdown_prefix}"
)
logger.info(f"Markdown file path: s3://{res_bucket}/{markdown_prefix}")
content = load_content_from_s3(s3, res_bucket, markdown_prefix)

# Remove duplicate sections
Expand Down
2 changes: 1 addition & 1 deletion source/model/etl/code/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def get_api_key(api_secret_name):
# Parse the secret JSON
if "SecretString" in secret_response:
secret_data = json.loads(secret_response["SecretString"])
api_key = secret_data.get("api_key")
api_key = secret_data.get("key")
logger.info(
f"Successfully retrieved API key from secret: {api_secret_name}"
)
Expand Down
3 changes: 2 additions & 1 deletion source/model/etl/code/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ PyMuPDF<1.21.0
markdownify
flask
gevent
GPUtil
GPUtil
requests

0 comments on commit 96e97b7

Please sign in to comment.