11#ifndef __ANIMA_HPP__
22#define __ANIMA_HPP__
33
4+ #include < algorithm>
45#include < cmath>
56#include < memory>
67#include < utility>
1415namespace Anima {
1516 constexpr int ANIMA_GRAPH_SIZE = 65536 ;
1617
18+ struct AnimaConfig {
19+ int64_t in_channels = 16 ;
20+ int64_t out_channels = 16 ;
21+ int64_t hidden_size = 2048 ;
22+ int64_t text_embed_dim = 1024 ;
23+ int64_t num_heads = 16 ;
24+ int64_t head_dim = 128 ;
25+ int patch_size = 2 ;
26+ int64_t num_layers = 28 ;
27+ std::vector<int > axes_dim = {44 , 42 , 42 };
28+ int theta = 10000 ;
29+
30+ static AnimaConfig detect_from_weights (const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
31+ AnimaConfig config;
32+ int64_t detected_layers = 0 ;
33+ std::string layer_tag = prefix.empty () ? " blocks." : prefix + " .blocks." ;
34+ for (const auto & [name, _] : tensor_storage_map) {
35+ size_t pos = name.find (layer_tag);
36+ if (pos == std::string::npos) {
37+ continue ;
38+ }
39+ size_t start = pos + layer_tag.size ();
40+ size_t end = name.find (' .' , start);
41+ if (end == std::string::npos) {
42+ continue ;
43+ }
44+ int64_t layer_id = atoll (name.substr (start, end - start).c_str ());
45+ detected_layers = std::max (detected_layers, layer_id + 1 );
46+ }
47+ if (detected_layers > 0 ) {
48+ config.num_layers = detected_layers;
49+ LOG_DEBUG (" anima: num_layers = %" PRId64 " , hidden_size = %" PRId64 " , num_heads = %" PRId64 " , head_dim = %" PRId64,
50+ config.num_layers ,
51+ config.hidden_size ,
52+ config.num_heads ,
53+ config.head_dim );
54+ }
55+ return config;
56+ }
57+ };
58+
1759 __STATIC_INLINE__ ggml_tensor* apply_gate (ggml_context* ctx,
1860 ggml_tensor* x,
1961 ggml_tensor* gate) {
@@ -418,31 +460,22 @@ namespace Anima {
418460
419461 struct AnimaNet : public GGMLBlock {
420462 public:
421- int64_t in_channels = 16 ;
422- int64_t out_channels = 16 ;
423- int64_t hidden_size = 2048 ;
424- int64_t text_embed_dim = 1024 ;
425- int64_t num_heads = 16 ;
426- int64_t head_dim = 128 ;
427- int patch_size = 2 ;
428- int64_t num_layers = 28 ;
429- std::vector<int > axes_dim = {44 , 42 , 42 };
430- int theta = 10000 ;
463+ AnimaConfig config;
431464
432465 public:
433466 AnimaNet () = default ;
434- explicit AnimaNet (int64_t num_layers )
435- : num_layers(num_layers ) {
436- blocks[" x_embedder" ] = std::make_shared<XEmbedder>((in_channels + 1 ) * patch_size * patch_size, hidden_size);
437- blocks[" t_embedder" ] = std::make_shared<TimestepEmbedder>(hidden_size, hidden_size * 3 );
438- blocks[" t_embedding_norm" ] = std::make_shared<RMSNorm>(hidden_size, 1e-6f );
439- for (int i = 0 ; i < num_layers; i++) {
440- blocks[" blocks." + std::to_string (i)] = std::make_shared<TransformerBlock>(hidden_size,
441- text_embed_dim,
442- num_heads,
443- head_dim);
467+ explicit AnimaNet (AnimaConfig config )
468+ : config(config ) {
469+ blocks[" x_embedder" ] = std::make_shared<XEmbedder>((config. in_channels + 1 ) * config. patch_size * config. patch_size , config. hidden_size );
470+ blocks[" t_embedder" ] = std::make_shared<TimestepEmbedder>(config. hidden_size , config. hidden_size * 3 );
471+ blocks[" t_embedding_norm" ] = std::make_shared<RMSNorm>(config. hidden_size , 1e-6f );
472+ for (int i = 0 ; i < config. num_layers ; i++) {
473+ blocks[" blocks." + std::to_string (i)] = std::make_shared<TransformerBlock>(config. hidden_size ,
474+ config. text_embed_dim ,
475+ config. num_heads ,
476+ config. head_dim );
444477 }
445- blocks[" final_layer" ] = std::make_shared<FinalLayer>(hidden_size, patch_size, out_channels);
478+ blocks[" final_layer" ] = std::make_shared<FinalLayer>(config. hidden_size , config. patch_size , config. out_channels );
446479 blocks[" llm_adapter" ] = std::make_shared<LLMAdapter>(1024 , 1024 , 1024 , 6 , 16 );
447480 }
448481
@@ -469,11 +502,11 @@ namespace Anima {
469502 auto padding_mask = ggml_ext_zeros (ctx->ggml_ctx , x->ne [0 ], x->ne [1 ], 1 , x->ne [3 ]);
470503 x = ggml_concat (ctx->ggml_ctx , x, padding_mask, 2 ); // [N, C + 1, H, W]
471504
472- x = DiT::pad_and_patchify (ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw]
505+ x = DiT::pad_and_patchify (ctx, x, config. patch_size , config. patch_size ); // [N, h*w, (C+1)*ph*pw]
473506
474507 x = x_embedder->forward (ctx, x);
475508
476- auto timestep_proj = ggml_ext_timestep_embedding (ctx->ggml_ctx , timestep, static_cast <int >(hidden_size));
509+ auto timestep_proj = ggml_ext_timestep_embedding (ctx->ggml_ctx , timestep, static_cast <int >(config. hidden_size ));
477510 auto temb = t_embedder->forward (ctx, timestep_proj);
478511 auto embedded_timestep = t_embedding_norm->forward (ctx, timestep_proj);
479512
@@ -505,15 +538,15 @@ namespace Anima {
505538 sd::ggml_graph_cut::mark_graph_cut (temb, " anima.prelude" , " temb" );
506539 sd::ggml_graph_cut::mark_graph_cut (encoder_hidden_states, " anima.prelude" , " context" );
507540
508- for (int i = 0 ; i < num_layers; i++) {
541+ for (int i = 0 ; i < config. num_layers ; i++) {
509542 auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks[" blocks." + std::to_string (i)]);
510543 x = block->forward (ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
511544 sd::ggml_graph_cut::mark_graph_cut (x, " anima.blocks." + std::to_string (i), " x" );
512545 }
513546
514547 x = final_layer->forward (ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]
515548
516- x = DiT::unpatchify_and_crop (ctx->ggml_ctx , x, H, W, patch_size, patch_size, false ); // [N, C, H, W]
549+ x = DiT::unpatchify_and_crop (ctx->ggml_ctx , x, H, W, config. patch_size , config. patch_size , false ); // [N, C, H, W]
517550
518551 return x;
519552 }
@@ -524,35 +557,16 @@ namespace Anima {
524557 std::vector<float > image_pe_vec;
525558 std::vector<float > adapter_q_pe_vec;
526559 std::vector<float > adapter_k_pe_vec;
560+ AnimaConfig config;
527561 AnimaNet net;
528562
529563 AnimaRunner (ggml_backend_t backend,
530564 ggml_backend_t params_backend,
531565 const String2TensorStorage& tensor_storage_map = {},
532566 const std::string prefix = " model.diffusion_model" )
533- : DiffusionModelRunner(backend, params_backend, prefix) {
534- int64_t num_layers = 0 ;
535- std::string layer_tag = prefix + " .net.blocks." ;
536- for (const auto & kv : tensor_storage_map) {
537- const std::string& tensor_name = kv.first ;
538- size_t pos = tensor_name.find (layer_tag);
539- if (pos == std::string::npos) {
540- continue ;
541- }
542- size_t start = pos + layer_tag.size ();
543- size_t end = tensor_name.find (' .' , start);
544- if (end == std::string::npos) {
545- continue ;
546- }
547- int64_t layer_id = atoll (tensor_name.substr (start, end - start).c_str ());
548- num_layers = std::max (num_layers, layer_id + 1 );
549- }
550- if (num_layers <= 0 ) {
551- num_layers = 28 ;
552- }
553- LOG_INFO (" anima net layers: %" PRId64, num_layers);
554-
555- net = AnimaNet (num_layers);
567+ : DiffusionModelRunner(backend, params_backend, prefix),
568+ config (AnimaConfig::detect_from_weights(tensor_storage_map, prefix + " .net" )) {
569+ net = AnimaNet (config);
556570 net.init (params_ctx, tensor_storage_map, prefix + " .net" );
557571 }
558572
@@ -623,22 +637,22 @@ namespace Anima {
623637 GGML_ASSERT (x->ne [3 ] == 1 );
624638 ggml_cgraph* gf = new_graph_custom (ANIMA_GRAPH_SIZE );
625639
626- int64_t pad_h = (net .patch_size - x->ne [1 ] % net .patch_size ) % net .patch_size ;
627- int64_t pad_w = (net .patch_size - x->ne [0 ] % net .patch_size ) % net .patch_size ;
640+ int64_t pad_h = (config .patch_size - x->ne [1 ] % config .patch_size ) % config .patch_size ;
641+ int64_t pad_w = (config .patch_size - x->ne [0 ] % config .patch_size ) % config .patch_size ;
628642 int64_t h_pad = x->ne [1 ] + pad_h;
629643 int64_t w_pad = x->ne [0 ] + pad_w;
630644
631645 image_pe_vec = gen_anima_image_pe_vec (1 ,
632646 static_cast <int >(h_pad),
633647 static_cast <int >(w_pad),
634- static_cast <int >(net .patch_size ),
635- net .theta ,
636- net .axes_dim ,
648+ static_cast <int >(config .patch_size ),
649+ config .theta ,
650+ config .axes_dim ,
637651 4 .0f ,
638652 4 .0f ,
639653 1 .0f );
640- int64_t image_pos_len = static_cast <int64_t >(image_pe_vec.size ()) / (2 * 2 * (net .head_dim / 2 ));
641- auto image_pe = ggml_new_tensor_4d (compute_ctx, GGML_TYPE_F32 , 2 , 2 , net .head_dim / 2 , image_pos_len);
654+ int64_t image_pos_len = static_cast <int64_t >(image_pe_vec.size ()) / (2 * 2 * (config .head_dim / 2 ));
655+ auto image_pe = ggml_new_tensor_4d (compute_ctx, GGML_TYPE_F32 , 2 , 2 , config .head_dim / 2 , image_pos_len);
642656 set_backend_tensor_data (image_pe, image_pe_vec.data ());
643657
644658 ggml_tensor* adapter_q_pe = nullptr ;
0 commit comments