@@ -343,6 +343,13 @@ class CLIPTokenizer {
         }
     }
 
+    std::string clean_up_tokenization(std::string& text) {
+        std::regex pattern(R"( ,)");
+        // Replace " ," with ","
+        std::string result = std::regex_replace(text, pattern, ",");
+        return result;
+    }
+
     std::string decode(const std::vector<int>& tokens) {
         std::string text = "";
         for (int t : tokens) {
@@ -351,8 +358,12 @@ class CLIPTokenizer {
             std::u32string ts = decoder[t];
             // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
             std::string s = utf32_to_utf8(ts);
-            if (s.length() >= 4 && ends_with(s, "</w>")) {
-                text += " " + s.replace(s.length() - 4, s.length() - 1, "");
+            if (s.length() >= 4) {
+                if (ends_with(s, "</w>")) {
+                    text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
+                } else {
+                    text += s;
+                }
             } else {
                 text += " " + s;
             }
@@ -364,6 +375,7 @@ class CLIPTokenizer {
 
         // std::string s((char *)bytes.data());
         // std::string s = "";
+        text = clean_up_tokenization(text);
         return trim(text);
     }
 
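To make the change concrete, here is a standalone sketch of the new decode path (token pieces are illustrative, and substr stands in for the in-place replace call). Sub-word pieces are now glued together and the word-boundary space goes after the word; the old branch space-prefixed every piece, turning the same input into " toke nizer , rocks". clean_up_tokenization then deletes the space this scheme leaves before commas.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Standalone sketch of the patched decode loop; token pieces are
// illustrative. "</w>" marks the end of a word: strip the marker and
// append a space after the word; pieces without it are glued on.
static bool ends_with(const std::string& s, const std::string& suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
    std::vector<std::string> pieces = {"toke", "nizer</w>", ",</w>", "rocks</w>"};
    std::string text;
    for (const std::string& s : pieces) {
        if (s.length() >= 4) {
            if (ends_with(s, "</w>")) {
                text += s.substr(0, s.length() - 4) + " ";  // word boundary
            } else {
                text += s;  // sub-word piece: glue to the previous piece
            }
        } else {
            text += " " + s;
        }
    }
    // "tokenizer , rocks "  ->  "tokenizer, rocks "
    text = std::regex_replace(text, std::regex(R"( ,)"), ",");
    std::cout << "'" << text << "'" << std::endl;  // trim() would drop the trailing space
    return 0;
}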
@@ -533,9 +545,12 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t vocab_size;
     int64_t num_positions;
 
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, wtype, embed_dim, vocab_size);
-        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, num_positions);
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+        enum ggml_type token_wtype    = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
+        enum ggml_type position_wtype = GGML_TYPE_F32;  // (tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
+
+        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
+        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }
 
 public:
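The pattern introduced here recurs in every init_params overload in this patch: look the tensor's fully qualified name up in the loader's tensor_types map, and fall back to a default when it is absent. A hypothetical helper (not part of the patch) makes the convention explicit:

#include <map>
#include <string>
#include "ggml.h"

// Hypothetical helper, not in the patch: return the per-tensor type the
// loader recorded for prefix + name, or a caller-supplied fallback when
// the tensor is absent from the map (e.g. a freshly created model).
static enum ggml_type get_wtype(const std::map<std::string, enum ggml_type>& tensor_types,
                                const std::string& prefix,
                                const std::string& name,
                                enum ggml_type fallback = GGML_TYPE_F32) {
    auto it = tensor_types.find(prefix + name);
    return it != tensor_types.end() ? it->second : fallback;
}

Note that position_embedding.weight stays pinned to GGML_TYPE_F32 (its map lookup is left commented out), presumably so the small, precision-sensitive position table is never quantized.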
@@ -579,11 +594,14 @@ class CLIPVisionEmbeddings : public GGMLBlock {
     int64_t image_size;
     int64_t num_patches;
     int64_t num_positions;
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+        enum ggml_type patch_wtype    = GGML_TYPE_F16;  // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
+        enum ggml_type class_wtype    = GGML_TYPE_F32;  // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
+        enum ggml_type position_wtype = GGML_TYPE_F32;  // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
 
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, patch_size, patch_size, num_channels, embed_dim);
-        params["class_embedding"]           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim);
-        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, num_positions);
+        params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
+        params["class_embedding"]           = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
+        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }
 
 public:
@@ -639,9 +657,10 @@ enum CLIPVersion {
 
 class CLIPTextModel : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
         if (version == OPEN_CLIP_VIT_BIGG_14) {
-            params["text_projection"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, projection_dim, hidden_size);
+            enum ggml_type wtype = GGML_TYPE_F32;  // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
+            params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
         }
     }
@@ -711,8 +730,12 @@ class CLIPTextModel : public GGMLBlock {
         if (return_pooled) {
             auto text_projection = params["text_projection"];
             ggml_tensor* pooled  = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
-            pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
-            return pooled;
+            if (text_projection != NULL) {
+                pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
+            } else {
+                LOG_DEBUG("Missing text_projection matrix, assuming identity...");
+            }
+            return pooled;  // [hidden_size, 1, 1]
         }
 
         return x;  // [N, n_token, hidden_size]
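ggml_nn_linear comes from this repo's ggml_extend.hpp. Paraphrased below for reference (check the header for the authoritative version), it is a thin wrapper around ggml_mul_mat plus an optional bias add, so with a NULL bias the pooled projection reduces to ggml_mul_mat(ctx, text_projection, pooled). The NULL guard additionally lets checkpoints that ship no text_projection fall back to the un-projected pooled vector instead of crashing.

// Paraphrase of ggml_nn_linear from ggml_extend.hpp:
// y = w * x, plus b when a bias is supplied.
static struct ggml_tensor* nn_linear_sketch(struct ggml_context* ctx,
                                            struct ggml_tensor* x,
                                            struct ggml_tensor* w,
                                            struct ggml_tensor* b) {
    x = ggml_mul_mat(ctx, w, x);  // result ne0 == w->ne[1]
    if (b != NULL) {
        x = ggml_add(ctx, x, b);
    }
    return x;
}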
@@ -761,14 +784,17 @@ class CLIPVisionModel : public GGMLBlock {
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x      = pre_layernorm->forward(ctx, x);
         x      = encoder->forward(ctx, x, -1, false);
-        x      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
+        // print_ggml_tensor(x, true, "ClipVisionModel x: ");
+        auto last_hidden_state = x;
+        x                      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
 
         GGML_ASSERT(x->ne[3] == 1);
         if (return_pooled) {
             ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
             return pooled;  // [N, hidden_size]
         } else {
-            return x;  // [N, n_token, hidden_size]
+            // return x;  // [N, n_token, hidden_size]
+            return last_hidden_state;  // [N, n_token, hidden_size]
         }
     }
 };
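Note the behavioral change in the non-pooled path: callers now receive the encoder output captured before post_layernorm (the last hidden state), while the pooled path still reads from the normalized tensor. Presumably this serves consumers that expect raw hidden states rather than the layernormed output; the commented-out return keeps the old behavior visible for reference.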
@@ -779,9 +805,9 @@ class CLIPProjection : public UnaryBlock {
     int64_t out_features;
     bool transpose_weight;
 
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+        enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
         if (transpose_weight) {
-            LOG_ERROR("transpose_weight");
             params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
         } else {
             params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
@@ -842,12 +868,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
    CLIPTextModel model;
 
     CLIPTextModelRunner(ggml_backend_t backend,
-                        ggml_type wtype,
+                        std::map<std::string, enum ggml_type>& tensor_types,
+                        const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                         int clip_skip_value = 1,
                         bool with_final_ln  = true)
-        : GGMLRunner(backend, wtype), model(version, clip_skip_value, with_final_ln) {
-        model.init(params_ctx, wtype);
+        : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
+        model.init(params_ctx, tensor_types, prefix);
     }
 
     std::string get_desc() {
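A hypothetical call site for the new constructor (the backend setup, map contents, and prefix string are illustrative, not taken from the patch): instead of one wtype for the whole runner, the caller passes the loader's per-tensor type map plus the prefix that scopes lookups to this text model's tensors.

#include <map>
#include <string>
#include "clip.hpp"

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();

    // Illustrative map: in practice the model loader fills this with one
    // entry per tensor found in the checkpoint, quantized or not.
    std::map<std::string, enum ggml_type> tensor_types;
    tensor_types["te.text_model.embeddings.token_embedding.weight"] = GGML_TYPE_Q8_0;

    CLIPTextModelRunner runner(backend, tensor_types, "te.text_model",
                               OPENAI_CLIP_VIT_L_14);
    // ... load weights and run compute as before ...

    ggml_backend_free(backend);
    return 0;
}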
@@ -889,13 +916,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
         struct ggml_tensor* embeddings = NULL;
 
         if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
-            auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
-                                                        wtype,
-                                                        model.hidden_size,
-                                                        num_custom_embeddings);
+            auto token_embed_weight = model.get_token_embed_weight();
+            auto custom_embeddings  = ggml_new_tensor_2d(compute_ctx,
+                                                         token_embed_weight->type,
+                                                         model.hidden_size,
+                                                         num_custom_embeddings);
             set_backend_tensor_data(custom_embeddings, custom_embeddings_data);
 
-            auto token_embed_weight = model.get_token_embed_weight();
             // concatenate custom embeddings
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
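The reordering matters: the runner-wide wtype no longer exists, so the custom table is created with token_embed_weight->type, which also keeps the two ggml_concat operands type-consistent. A runnable sketch of the shape arithmetic with illustrative sizes; appending rows along dim 1 means custom token ids simply start at vocab_size:

#include <cstdio>
#include "ggml.h"

int main() {
    struct ggml_init_params ip = {16 * 1024 * 1024, NULL, false};
    struct ggml_context* ctx   = ggml_init(ip);

    const int64_t hidden_size = 8, vocab_size = 16, n_custom = 2;
    // [hidden_size, vocab_size] token table plus [hidden_size, n_custom]
    // custom rows, concatenated along dim 1 as in the patch above.
    struct ggml_tensor* tok = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, vocab_size);
    struct ggml_tensor* cus = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, n_custom);
    struct ggml_tensor* all = ggml_concat(ctx, tok, cus, 1);

    printf("embeddings: %lld x %lld\n", (long long)all->ne[0], (long long)all->ne[1]);
    // prints: embeddings: 8 x 18
    ggml_free(ctx);
    return 0;
}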