@@ -37,7 +37,12 @@


 def text_manipulate(
-    all_document_for_process, file_name, support_type, conn_pool, create_user
+    all_document_for_process,
+    file_name,
+    support_type,
+    conn_pool,
+    create_user,
+    progress=0
 ):
     """Manipulate the text content.
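The added `progress` keyword seeds the running success counter (`text_process_success_num = progress` in the next hunk), so a re-dispatched task can resume its progress accounting instead of restarting at zero. A minimal sketch of such a resumed call, where every argument value is a placeholder and not taken from this patch:

# Illustrative call only; all values below are hypothetical.
result = text_manipulate(
    all_document_for_process=remaining_chunks,  # chunks still to process
    file_name="example.docx",
    support_type=support_type,
    conn_pool=conn_pool,
    create_user="admin",
    progress=42,  # chunks already processed successfully in an earlier run
)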
@@ -63,7 +68,7 @@ def text_manipulate(
             conn_pool=conn_pool,
         )

-        text_process_success_num = 0
+        text_process_success_num = progress
         for document in all_document_for_process:
             document_chunk_id = document.get("id")
             # Clean the data such as removing invisible characters.
@@ -116,11 +121,6 @@ def text_manipulate(
             if qa_response.get("status") != 200:
                 return qa_response

-            # The file was processed successfully; update its status in data_process_task_document
-            _updata_document_status_and_end_time(
-                id=document_id, status="success", conn_pool=conn_pool
-            )
-
         if support_type_map.get("qa_split"):
             # QA split was selected
             qa_list_dict = support_type_map.get("qa_split")
@@ -196,6 +196,13 @@ def text_manipulate(
                 file_name=file_name_csv, phase_value="final", data=qa_data_dict
             )

+            _update_document_status_and_progress(
+                id=document_id,
+                status="success",
+                progress=100,
+                conn_pool=conn_pool
+            )
+
             logger.debug(f"{log_tag_const.COMMON_HANDLE} Finish manipulating the text")
             return {
                 "status": 200,
@@ -225,13 +232,25 @@ def text_manipulate(
                 file_name=file_name_csv, phase_value="final", data=chunk_data_dict
             )

+            _update_document_status_and_progress(
+                id=document_id,
+                status="success",
+                progress=100,
+                conn_pool=conn_pool
+            )
+
             logger.debug(f"{log_tag_const.COMMON_HANDLE} Finish manipulating the text")
             return {
                 "status": 200,
                 "message": "",
                 "data": "",
             }

+        # The file was processed successfully; update its status in data_process_task_document
+        _updata_document_status_and_end_time(
+            id=document_id, status="success", conn_pool=conn_pool
+        )
+
         return {"status": 200, "message": "", "data": ""}
     except Exception as ex:
         logger.error(
@@ -914,6 +933,7 @@ def _qa_split(
 ):
     qa_list_dict = support_type_map.get("qa_split")
     llm_config = qa_list_dict.get("llm_config")
+    remove_duplicate_config = qa_list_dict.get("remove_duplicate_config")

     # Update the chunk status to "start"
     _update_document_chunk_status_and_start_time(
@@ -937,6 +957,7 @@ def _qa_split(
             id=document_id, status="fail", conn_pool=conn_pool
         )
     else:
+        qa_list = []
         # Store the QA data in the table
         qa_data = qa_response.get("data")
         for _, item in enumerate(qa_data):
@@ -955,6 +976,34 @@ def _qa_split(
                 qa_insert_item, pool=conn_pool
             )

+            qa_list.append(qa_insert_item)
+
+        # Whether deduplication is required
+        if remove_duplicate_config:
+            for qa in qa_list:
+                embedding_response = _embedding_qa(
+                    qa_list=[qa],
+                    remove_duplicate_config=remove_duplicate_config,
+                    conn_pool=conn_pool
+                )
+
+                if embedding_response.get("status") != 200:
+                    # Processing failed
+                    # Update the status in data_process_task_document_chunk
+                    _updata_document_chunk_status_and_end_time(
+                        id=document_chunk_id,
+                        update_user=create_user,
+                        status="fail",
+                        conn_pool=conn_pool,
+                    )
+
+                    # Update the file status in data_process_task_document
+                    _updata_document_status_and_end_time(
+                        id=document_id, status="fail", conn_pool=conn_pool
+                    )
+
+                    return embedding_response
+
         # Update the status in data_process_task_document_chunk
         _updata_document_chunk_status_and_end_time(
             id=document_chunk_id,
@@ -965,6 +1014,9 @@ def _qa_split(

         # Update the file processing progress
         progress = int(text_process_success_num / document_chunk_size * 100)
+        if text_process_success_num == document_chunk_size:
+            progress = 99
+
         _updata_document_progress(
             id=document_id,
             progress=progress,
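The cap above holds the per-chunk progress at 99 even after the last chunk finishes, so 100 is only reported by the new `_update_document_status_and_progress` call once the output file has been written and the document is marked "success". A small sketch of the resulting values, assuming a hypothetical document with four chunks:

# Illustrative only: mirrors the progress arithmetic above with a made-up chunk count.
document_chunk_size = 4
for text_process_success_num in range(1, document_chunk_size + 1):
    progress = int(text_process_success_num / document_chunk_size * 100)
    if text_process_success_num == document_chunk_size:
        progress = 99  # hold at 99; 100 is set together with the "success" status
    print(progress)  # prints 25, 50, 75, 99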
@@ -994,7 +1046,7 @@ def _generate_qa_list(content, llm_config):

     # Generate the QA list.
     qa_list = []
-    if llm_spec_info.get("data").get("provider").get("worker"):
+    if llm_config.get("provider") == "worker":
         # get base url for configmap
         base_url = model_cr.get_worker_base_url_k8s_configmap(
             name=config.k8s_default_config, namespace=config.k8s_pod_namespace
@@ -1190,6 +1242,26 @@ def _updata_document_progress(id, progress, update_user, conn_pool):
         return {"status": 1000, "message": str(ex), "data": traceback.format_exc()}


+def _update_document_status_and_progress(id, status, progress, conn_pool):
+    try:
+        document_update_item = {"id": id, "status": status, "progress": progress}
+        data_process_document_db_operate.update_document_status_and_progress(
+            document_update_item, pool=conn_pool
+        )
+
+        return {"status": 200, "message": "", "data": ""}
+    except Exception as ex:
+        logger.error(
+            "".join(
+                [
+                    f"{log_tag_const.COMMON_HANDLE} update document status ",
+                    f"\n{traceback.format_exc()}",
+                ]
+            )
+        )
+        return {"status": 1000, "message": str(ex), "data": traceback.format_exc()}
+
+
 def _update_document_chunk_status_and_start_time(id, update_user, conn_pool):
     try:
         now = date_time_utils.now_str()
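The new helper delegates to `data_process_document_db_operate.update_document_status_and_progress`, which is not part of this diff. A minimal sketch of what that operate-layer function could look like, assuming a PostgreSQL-style `data_process_task_document` table and a psycopg-style connection pool (table name, columns, and pool API are assumptions, not taken from this patch):

# Hypothetical sketch of the operate layer; the real module may differ.
def update_document_status_and_progress(req_json, pool):
    """Update status and progress of one document row, keyed by id."""
    sql = (
        "UPDATE data_process_task_document "
        "SET status = %(status)s, progress = %(progress)s "
        "WHERE id = %(id)s"
    )
    params = {
        "id": req_json["id"],
        "status": req_json["status"],
        "progress": req_json["progress"],
    }
    # Assumes a psycopg_pool.ConnectionPool-like object.
    with pool.connection() as conn:
        with conn.cursor() as cur:
            cur.execute(sql, params)
    return {"status": 200, "message": "", "data": ""}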
@@ -1292,8 +1364,8 @@ def _qa_remove_duplicate(qa_list, remove_duplicate_config, conn_pool):
     provider = remove_duplicate_config.get("embedding_provider")
     similarity = float(remove_duplicate_config.get("similarity"))

-    # Model information from the llms CR
-    llm_spec_info = model_cr.get_spec_for_embedding_k8s_cr(name=name, namespace=namespace)
+    # Model information from the embedding CR
+    embedding_spec_info = model_cr.get_spec_for_embedding_k8s_cr(name=name, namespace=namespace)

     if provider == "worker":
         # get base url for configmap
@@ -1319,11 +1391,11 @@ def _qa_remove_duplicate(qa_list, remove_duplicate_config, conn_pool):
         )

         remove_duplicate_loader = QARemoveDuplicate(embeddings=qa_embeddings, pool=conn_pool)
-        return remove_duplicate_loader.qa_remove_duplicate(qa_list, similarity)
+        return remove_duplicate_loader.remove_duplicate_qa_data(qa_list, similarity)
     else:
-        endpoint = llm_spec_info.get("data").get("provider").get("endpoint")
+        endpoint = embedding_spec_info.get("data").get("provider").get("endpoint")
         base_url = endpoint.get("url")
-        llm_type = llm_spec_info.get("data").get("type")
+        embedding_type = embedding_spec_info.get("data").get("type")

         logger.debug(
             "".join(
@@ -1332,19 +1404,83 @@ def _qa_remove_duplicate(qa_list, remove_duplicate_config, conn_pool):
                     f"name: {name}\n",
                     f"namespace: {namespace}\n",
                     f"model: {model}\n",
-                    f"llm_type: {llm_type}\n",
+                    f"embedding_type: {embedding_type}\n",
+                ]
+            )
+        )
+
+        if embedding_type == "openai":
+            qa_embeddings = OpenAIEmbeddings(
+                api_key="fake",
+                base_url=base_url,
+                model=model,
+            )
+
+            remove_duplicate_loader = QARemoveDuplicate(embeddings=qa_embeddings, pool=conn_pool)
+            return remove_duplicate_loader.remove_duplicate_qa_data(qa_list, similarity)
+        else:
+            return {"status": 1000, "message": f"Embedding models of type {embedding_type} are not supported yet", "data": ""}
+
+
+def _embedding_qa(qa_list, remove_duplicate_config, conn_pool):
+    name = remove_duplicate_config.get("embedding_name")
+    namespace = remove_duplicate_config.get("embedding_namespace")
+    model = remove_duplicate_config.get("embedding_model")
+    provider = remove_duplicate_config.get("embedding_provider")
+
+    # Model information from the embeddings CR
+    embedding_spec_info = model_cr.get_spec_for_embedding_k8s_cr(name=name, namespace=namespace)
+
+    if provider == "worker":
+        # get base url for configmap
+        base_url = model_cr.get_worker_base_url_k8s_configmap(
+            name=config.k8s_default_config, namespace=config.k8s_pod_namespace
+        )
+        logger.debug(
+            "".join(
+                [
+                    f"worker embedding\n",
+                    f"name: {name}\n",
+                    f"namespace: {namespace}\n",
+                    f"model: {model}\n",
+                    f"base_url: {base_url}\n",
+                ]
+            )
+        )
+
+        qa_embeddings = OpenAIEmbeddings(
+            api_key="fake",
+            base_url=base_url,
+            model=model,
+        )
+
+        remove_duplicate_loader = QARemoveDuplicate(embeddings=qa_embeddings, pool=conn_pool)
+        return remove_duplicate_loader.embedding_qa_data(qa_list)
+    else:
+        endpoint = embedding_spec_info.get("data").get("provider").get("endpoint")
+        base_url = endpoint.get("url")
+        embedding_type = embedding_spec_info.get("data").get("type")
+
+        logger.debug(
+            "".join(
+                [
+                    f"3rd_party embedding\n",
+                    f"name: {name}\n",
+                    f"namespace: {namespace}\n",
+                    f"model: {model}\n",
+                    f"embedding_type: {embedding_type}\n",
                 ]
             )
         )

-        if llm_type == "openai":
+        if embedding_type == "openai":
             qa_embeddings = OpenAIEmbeddings(
                 api_key="fake",
                 base_url=base_url,
                 model=model,
             )

             remove_duplicate_loader = QARemoveDuplicate(embeddings=qa_embeddings, pool=conn_pool)
-            return remove_duplicate_loader.qa_remove_duplicate(qa_list, similarity)
+            return remove_duplicate_loader.embedding_qa_data(qa_list)
         else:
-            return {"status": 1000, "message": f"Embedding models of type {llm_type} are not supported yet", "data": ""}
+            return {"status": 1000, "message": f"Embedding models of type {embedding_type} are not supported yet", "data": ""}