[update] release infinity-8b

hanjian.thu123 · hanjian.thu123 · commit 1b78f1b1cbe8 · 2025-02-18T17:13:46.000+08:00
diff --git a/README.md b/README.md
@@ -20,6 +20,7 @@
 <p>
 
 ## 🔥 Updates!!
+* Feb 18, 2025: 🔥 Infinity-8B Weights & Code are released!
 * Feb 7, 2025: 🌺 Infinity-8B Demo is released! Check [demo](https://opensource.bytedance.com/gmpt/t2i/invite).
 * Dec 24, 2024: 🔥 Training and Testing Codes && Checkpoints && Demo released!
 * Dec 12, 2024: 💻 Add Project Page
@@ -30,10 +31,11 @@
 
 We provide a [demo website](https://opensource.bytedance.com/gmpt/t2i/invite) for you to play with Infinity and generate images interactively. Enjoy the fun of bitwise autoregressive modeling!
 
-We also provide [interactive_infer.ipynb](tools/interactive_infer.ipynb) for you to see more technical details about Infinity.
+We also provide [interactive_infer.ipynb](tools/interactive_infer.ipynb) and [interactive_infer_8b.ipynb](tools/interactive_infer_8b.ipynb) for you to see more technical details about Infinity-2B & Infinity-8B.
 
 ## 📑 Open-Source Plan
   - [ ] Infinity-20B Checkpoints
+  - [x] Infinity-8B Checkpoints
   - [x] Training Code 
   - [x] Web Demo 
   - [x] Inference Code
@@ -86,16 +88,24 @@ We provide Infinity models for you to play with, which are on <a href='https://h
 |   model    | Resolution |   GenEval    | DPG | HPSv2.1 | HF weights🤗                                                                        |
 |:----------:|:-----:|:--------:|:---------:|:-------:|:------------------------------------------------------------------------------------|
 |  Infinity-2B   |  1024  |   0.69 / 0.73 $^{\dagger}$   |    83.5    |  32.2   | [infinity_2b_reg.pth](https://huggingface.co/FoundationVision/infinity/blob/main/infinity_2b_reg.pth) |
+|  Infinity-8B   |  1024  |  -  |    -    |  -   | [infinity_8b_weights](https://huggingface.co/FoundationVision/Infinity/tree/main/infinity_8b_weights) |
 |  Infinity-20B   |  1024  |  -  |    -    |  -   | [Coming Soon](TBD) |
 
 ${\dagger}$ result is tested with a [prompt rewriter](tools/prompt_rewriter.py). 
 
-You can load these models to generate images via the codes in [interactive_infer.ipynb](tools/interactive_infer.ipynb). Note: you need to download [infinity_vae_d32reg.pth](https://huggingface.co/FoundationVision/Infinity/blob/main/infinity_vae_d32reg.pth) and [flan-t5-xl](https://huggingface.co/google/flan-t5-xl) first.
+You can load these models to generate images via the code in [interactive_infer.ipynb](tools/interactive_infer.ipynb) and [interactive_infer_8b.ipynb](tools/interactive_infer_8b.ipynb).
 
 
 ## ⚽️ Installation
 1. We use FlexAttention to speedup training, which requires `torch>=2.5.1`.
 2. Install other pip packages via `pip3 install -r requirements.txt`.
+3. Download weights from Hugging Face. Besides the VAE & transformer weights on <a href='https://huggingface.co/FoundationVision/infinity'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20weights-FoundationVision/Infinity-yellow'></a>, you should also download [flan-t5-xl](https://huggingface.co/google/flan-t5-xl).
+```
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl")
+```
+These three lines will download flan-t5-xl to your ~/.cache/huggingface directory.
 
 ## 🎨 Data Preparation
 The structure of the training dataset is listed below. The training dataset contains a list of json files with name "[h_div_w_template1]_[num_examples].jsonl". Here [h_div_w_template] is a float number, which is the template ratio of height to width of the image. [num_examples] is the number of examples where $h/w$ is around h_div_w_template. [dataset_t2i_iterable.py](infinity/dataset/dataset_t2i_iterable.py) supports training with >100M examples. But we have to specify the number of examples for each h/w template ratio in the filename.
@@ -201,10 +211,6 @@ Infinity shows strong scaling capabilities as illustrated before. Thus we are en
 | a Chinese model is sitting on a train, magazine cover, clothes made of plastic, photorealistic, futuristic style, gray and green light, movie lighting, 32K HD      | ![](assets/2b_8b/3l.webp) | ![](assets/2b_8b/3r.webp) |
 | A  group of students in a class    | ![](assets/2b_20b/4l.jpg) | ![](assets/2b_8b/4r.webp) |
 
-
-
-Currently, Infinity-20B is still on the training phrase. We will release Infinity-20B once the training is completed.
-
 ## 📖 Citation
 If our work assists your research, feel free to give us a star ⭐ or cite us using:
 
diff --git a/tools/interactive_infer_8b.ipynb b/tools/interactive_infer_8b.ipynb
@@ -0,0 +1,205 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "import torch\n",
+    "torch.cuda.set_device(2)\n",
+    "import cv2\n",
+    "import numpy as np\n",
+    "from tools.run_infinity import *\n",
+    "\n",
+    "model_path='weights/infinity_8b_weights'\n",
+    "vae_path='weights/infinity_vae_d56_f8_14_patchify.pth'\n",
+    "text_encoder_ckpt = 'weights/flan-t5-xl-official'\n",
+    "args=argparse.Namespace(\n",
+    "    pn='1M',\n",
+    "    model_path=model_path,\n",
+    "    cfg_insertion_layer=0,\n",
+    "    vae_type=14,\n",
+    "    vae_path=vae_path,\n",
+    "    add_lvl_embeding_only_first_block=1,\n",
+    "    use_bit_label=1,\n",
+    "    model_type='infinity_8b',\n",
+    "    rope2d_each_sa_layer=1,\n",
+    "    rope2d_normalized_by_hw=2,\n",
+    "    use_scale_schedule_embedding=0,\n",
+    "    sampling_per_bits=1,\n",
+    "    text_encoder_ckpt=text_encoder_ckpt,\n",
+    "    text_channels=2048,\n",
+    "    apply_spatial_patchify=1,\n",
+    "    h_div_w_template=1.000,\n",
+    "    use_flex_attn=0,\n",
+    "    cache_dir='/dev/shm',\n",
+    "    checkpoint_type='torch_shard',\n",
+    "    seed=0,\n",
+    "    bf16=1,\n",
+    "    save_file='tmp.jpg'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Loading tokenizer and text encoder]\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3f68ce998b1546f185e6263884b382ef",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Loading Infinity]\n",
+      "self.codebook_dim: 56, self.add_lvl_embeding_only_first_block: 1,             self.use_bit_label: 1, self.rope2d_each_sa_layer: 1, self.rope2d_normalized_by_hw: 2\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/mnt/bn/foundation-vision/hanjian.thu123/infinity/pub_release/Infinity/tools/run_infinity.py:179: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
+      "  with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True), torch.no_grad():\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "self.num_blocks_in_a_chunk=5, depth=40, block_chunks=8\n",
+      "\n",
+      "[constructor]  ==== customized_flash_attn=False (using_flash=0/40), fused_mlp=False (fused_mlp=0/40) ==== \n",
+      "    [Infinity config ] embed_dim=3584, num_heads=28, depth=40, mlp_ratio=4, swiglu=False num_blocks_in_a_chunk=5\n",
+      "    [drop ratios] drop_rate=0.0, drop_path_rate=0.1 (tensor([0.0000, 0.0026, 0.0051, 0.0077, 0.0103, 0.0128, 0.0154, 0.0179, 0.0205,\n",
+      "        0.0231, 0.0256, 0.0282, 0.0308, 0.0333, 0.0359, 0.0385, 0.0410, 0.0436,\n",
+      "        0.0462, 0.0487, 0.0513, 0.0538, 0.0564, 0.0590, 0.0615, 0.0641, 0.0667,\n",
+      "        0.0692, 0.0718, 0.0744, 0.0769, 0.0795, 0.0821, 0.0846, 0.0872, 0.0897,\n",
+      "        0.0923, 0.0949, 0.0974, 0.1000]))\n",
+      "\n",
+      "[you selected Infinity with model_kwargs={'depth': 40, 'embed_dim': 3584, 'num_heads': 28, 'drop_path_rate': 0.1, 'mlp_ratio': 4, 'block_chunks': 8}] model size: 8.38B, bf16=1\n",
+      "[Load Infinity weights]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# load text encoder\n",
+    "text_tokenizer, text_encoder = load_tokenizer(t5_path=args.text_encoder_ckpt)\n",
+    "# load vae\n",
+    "vae = load_visual_tokenizer(args)\n",
+    "# load infinity\n",
+    "infinity = load_transformer(vae, args)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "prompt=a cat holds a board with the text 'diffusion is dead'\n",
+      "cfg: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], tau: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/mnt/bn/foundation-vision/hanjian.thu123/infinity/pub_release/Infinity/tools/run_infinity.py:112: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
+      "  with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True):\n",
+      "/mnt/bn/foundation-vision/hanjian.thu123/infinity/pub_release/Infinity/infinity/models/basic.py:495: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
+      "  with torch.cuda.amp.autocast(enabled=False):    # disable half precision\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "cost: 1.7465496063232422, infinity cost=1.7265434265136719\n",
+      "Save to /mnt/bn/foundation-vision/hanjian.thu123/infinity/pub_release/Infinity/tools/ipynb_tmp.jpg\n"
+     ]
+    }
+   ],
+   "source": [
+    "prompt = \"\"\"a cat holds a board with the text 'diffusion is dead'\"\"\"\n",
+    "cfg = 3\n",
+    "tau = 1.0\n",
+    "h_div_w = 1/1 # aspect ratio, height:width\n",
+    "seed = random.randint(0, 10000)\n",
+    "enable_positive_prompt=0\n",
+    "\n",
+    "h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates-h_div_w))]\n",
+    "scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['scales']\n",
+    "scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]\n",
+    "generated_image = gen_one_img(\n",
+    "    infinity,\n",
+    "    vae,\n",
+    "    text_tokenizer,\n",
+    "    text_encoder,\n",
+    "    prompt,\n",
+    "    g_seed=seed,\n",
+    "    gt_leak=0,\n",
+    "    gt_ls_Bl=None,\n",
+    "    cfg_list=cfg,\n",
+    "    tau_list=tau,\n",
+    "    scale_schedule=scale_schedule,\n",
+    "    cfg_insertion_layer=[args.cfg_insertion_layer],\n",
+    "    vae_type=args.vae_type,\n",
+    "    sampling_per_bits=args.sampling_per_bits,\n",
+    "    enable_positive_prompt=enable_positive_prompt,\n",
+    ")\n",
+    "args.save_file = 'ipynb_tmp.jpg'\n",
+    "os.makedirs(osp.dirname(osp.abspath(args.save_file)), exist_ok=True)\n",
+    "cv2.imwrite(args.save_file, generated_image.cpu().numpy())\n",
+    "print(f'Save to {osp.abspath(args.save_file)}')"
+   ]
+  }
+ ],
+ "metadata": {
+  "fileId": "8ac263ab-b18c-41dc-b409-0fb0f32525f0",
+  "filePath": "/mnt/bn/foundation-vision/hanjian.thu123/infinity/infinity/tools/interactive_infer.ipynb",
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tools/run_infinity.py b/tools/run_infinity.py
@@ -172,6 +172,7 @@ def load_infinity(
     apply_spatial_patchify=0,
     use_flex_attn=False,
     bf16=False,
+    checkpoint_type='torch',
 ):
     print(f'[Loading Infinity]')
     text_maxlen = 512
@@ -207,8 +208,12 @@ def load_infinity(
         torch.cuda.empty_cache()
 
         print(f'[Load Infinity weights]')
-        state_dict = torch.load(model_path, map_location=device)
-        print(infinity_test.load_state_dict(state_dict))
+        if checkpoint_type == 'torch':
+            state_dict = torch.load(model_path, map_location=device)
+            print(infinity_test.load_state_dict(state_dict))
+        elif checkpoint_type == 'torch_shard':
+            from transformers.modeling_utils import load_sharded_checkpoint
+            load_sharded_checkpoint(infinity_test, model_path, strict=False)
         infinity_test.rng = torch.Generator(device=device)
         return infinity_test
 
@@ -252,7 +257,7 @@ def joint_vi_vae_encode_decode(vae, image_path, scale_schedule, device, tgt_h, t
 def load_visual_tokenizer(args):
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     # load vae
-    if args.vae_type in [16,18,20,24,32,64]:
+    if args.vae_type in [14,16,18,20,24,32,64]:
         from infinity.models.bsq_vae.vae import vae_model
         schedule_mode = "dynamic"
         codebook_dim = args.vae_type
@@ -304,9 +309,13 @@ def load_transformer(vae, args):
         else:
             slim_model_path = model_path
         print(f'load checkpoint from {slim_model_path}')
+    elif args.checkpoint_type == 'torch_shard':
+        slim_model_path = model_path
 
     if args.model_type == 'infinity_2b':
         kwargs_model = dict(depth=32, embed_dim=2048, num_heads=2048//128, drop_path_rate=0.1, mlp_ratio=4, block_chunks=8) # 2b model
+    elif args.model_type == 'infinity_8b':
+        kwargs_model = dict(depth=40, embed_dim=3584, num_heads=28, drop_path_rate=0.1, mlp_ratio=4, block_chunks=8)
     elif args.model_type == 'infinity_layer12':
         kwargs_model = dict(depth=12, embed_dim=768, num_heads=8, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
     elif args.model_type == 'infinity_layer16':
@@ -335,6 +344,7 @@ def load_transformer(vae, args):
         apply_spatial_patchify=args.apply_spatial_patchify,
         use_flex_attn=args.use_flex_attn,
         bf16=args.bf16,
+        checkpoint_type=args.checkpoint_type,
     )
     return infinity