diff --git a/packages/jaaz_ui/package.json b/packages/jaaz_ui/package.json
new file mode 100644
index 000000000..e69de29bb
diff --git a/react/src/api/upload.ts b/react/src/api/upload.ts
index d414fd5bf..da94c7494 100644
--- a/react/src/api/upload.ts
+++ b/react/src/api/upload.ts
@@ -1,4 +1,6 @@
-import { compressImageFile } from '@/utils/imageUtils'
+import { compressImageFile, fileToBase64 } from '@/utils/imageUtils'
+import { BASE_API_URL } from '../constants'
+import { authenticatedFetch } from './auth'

 export async function uploadImage(
   file: File
@@ -14,3 +16,55 @@ export async function uploadImage(
   })
   return await response.json()
 }
+
+/**
+ * Upload an image to the Jaaz server
+ * @param file - Image file to upload
+ * @returns Promise with the uploaded image URL
+ */
+export async function uploadImageToJaaz(file: File): Promise<string> {
+  try {
+    // Compress the image before upload
+    const compressedFile = await compressImageFile(file)
+
+    // Convert the file to base64
+    const base64Data = await fileToBase64(compressedFile)
+
+    // Prepare the request body
+    const requestBody = {
+      base64Data: base64Data.split(',')[1], // Remove the "data:image/jpeg;base64," prefix
+      fileName: compressedFile.name,
+      contentType: compressedFile.type,
+    }
+
+    // Make an authenticated request to the Jaaz cloud API
+    const response = await authenticatedFetch(
+      `${BASE_API_URL}/api/v1/image/upload`,
+      {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+        },
+        body: JSON.stringify(requestBody),
+      }
+    )
+
+    if (!response.ok) {
+      const errorData = await response.json().catch(() => ({}))
+      throw new Error(
+        errorData.error || `Upload failed with status ${response.status}`
+      )
+    }
+
+    const result = await response.json()
+
+    if (!result.success || !result.data?.s3Url) {
+      throw new Error(result.error || 'Upload failed - no URL returned')
+    }
+
+    return result.data.s3Url
+  } catch (error) {
+    console.error('Failed to upload image to Jaaz:', error)
+    throw error
+  }
+}
diff --git a/react/src/components/chat/ChatTextarea.tsx b/react/src/components/chat/ChatTextarea.tsx
index d796508bc..df8a5b3bd 100644
--- a/react/src/components/chat/ChatTextarea.tsx
+++ b/react/src/components/chat/ChatTextarea.tsx
@@ -1,6 +1,6 @@
 import { cancelChat } from '@/api/chat'
 import { cancelMagicGenerate } from '@/api/magic'
-import { uploadImage } from '@/api/upload'
+import { uploadImage, uploadImageToJaaz } from '@/api/upload'
 import { Button } from '@/components/ui/button'
 import { useConfigs } from '@/contexts/configs'
 import {
@@ -20,7 +20,6 @@ import Textarea, { TextAreaRef } from 'rc-textarea'
 import { useCallback, useEffect, useRef, useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import { toast } from 'sonner'
-import ModelSelector from './ModelSelector'
 import ModelSelectorV2 from './ModelSelectorV2'
 import { useAuth } from '@/contexts/AuthContext'

@@ -54,33 +53,67 @@ const ChatTextarea: React.FC<ChatTextareaProps> = ({
   const textareaRef = useRef<TextAreaRef>(null)
   const [images, setImages] = useState<
     {
-      file_id: string
+      file_id?: string
       width: number
       height: number
+      url?: string // S3 URL if uploaded to Jaaz
     }[]
   >([])
+  const [uploadingImages, setUploadingImages] = useState<
+    {
+      id: string
+      file: File
+      previewUrl: string
+    }[]
+  >([])
   const [isFocused, setIsFocused] = useState(false)
   const imageInputRef = useRef<HTMLInputElement>(null)

+  // New mutation that handles both local and Jaaz uploads based on login status
   const { mutate: uploadImageMutation } = useMutation({
-    mutationFn: (file: File) => uploadImage(file),
-    onSuccess: (data) => {
+    mutationFn: async (file: File) => {
+      // Upload to the local server
+      const result = await uploadImage(file)
+      return { ...result, url: undefined, uploadId: file.name + Date.now() }
+    },
+    onMutate: (file: File) => {
+      // Add to uploading images immediately
+      const uploadId = file.name + Date.now()
+      const previewUrl = URL.createObjectURL(file)
+      setUploadingImages((prev) => [
+        ...prev,
+        { id: uploadId, file, previewUrl },
+      ])
+      return { uploadId }
+    },
+    onSuccess: (data, file, context) => {
       console.log('🦄uploadImageMutation onSuccess', data)
+      // Remove from uploading images
+      setUploadingImages((prev) =>
+        prev.filter((img) => img.id !== context?.uploadId)
+      )
+
+      // Add to completed images
       setImages((prev) => [
         ...prev,
         {
           file_id: data.file_id,
           width: data.width,
           height: data.height,
+          url: data.url,
         },
       ])
     },
-    onError: (error) => {
+    onError: (error, file, context) => {
       console.error('🦄uploadImageMutation onError', error)
       toast.error('Failed to upload image', {
         description: error.toString(),
       })
+      // Remove from uploading images on error
+      setUploadingImages((prev) =>
+        prev.filter((img) => img.id !== context?.uploadId)
+      )
     },
   })
@@ -119,19 +152,28 @@ const ChatTextarea: React.FC<ChatTextareaProps> = ({
       toast.warning(t('chat:textarea.selectTool'))
     }

-    let value: MessageContent[] | string = prompt
+    let text_content: MessageContent[] | string = prompt

     if (prompt.length === 0 || prompt.trim() === '') {
       toast.error(t('chat:textarea.enterPrompt'))
       return
     }

+    // Use an XML block so the LLM can recognize the attached images more easily
     if (images.length > 0) {
-      images.forEach((image) => {
-        value += `\n\n ![Attached image - width: ${image.width} height: ${image.height} filename: ${image.file_id}](/api/file/${image.file_id})`
+      text_content += `\n\n<input_images>`
+      images.forEach((image, index) => {
+        const imageId = image.file_id || `image-${index}`
+        text_content += `\n  <image file_id="${imageId}" width="${image.width}" height="${image.height}" />`
       })
+      text_content += `\n</input_images>`
+      text_content += `\n\nPlease use the input_images as input for image generation or editing.`
+    }

-    // Fetch images as base64
-    const imagePromises = images.map(async (image) => {
+    // Get the images as base64
+    const imagePromises = images.map(async (image) => {
+      // console.log('🦄imagePromises', image)
+      if (image.file_id) {
+        // Get the local URL and convert it to base64
         const response = await fetch(`/api/file/${image.file_id}`)
         const blob = await response.blob()
         return new Promise<string>((resolve) => {
@@ -139,28 +181,30 @@ const ChatTextarea: React.FC<ChatTextareaProps> = ({
           reader.onloadend = () => resolve(reader.result as string)
           reader.readAsDataURL(blob)
         })
-      })
+      } else {
+        throw new Error('Invalid image data')
+      }
+    })

-      const base64Images = await Promise.all(imagePromises)
+    const imageUrlList = await Promise.all(imagePromises)

-      value = [
-        {
-          type: 'text',
-          text: value,
+    const final_content = [
+      {
+        type: 'text',
+        text: text_content,
+      },
+      ...images.map((image, index) => ({
+        type: 'image_url',
+        image_url: {
+          url: imageUrlList[index],
         },
-        ...images.map((image, index) => ({
-          type: 'image_url',
-          image_url: {
-            url: base64Images[index],
-          },
-        })),
-      ] as MessageContent[]
-    }
+      })),
+    ] as MessageContent[]

     const newMessage = messages.concat([
       {
         role: 'user',
-        content: value,
+        content: final_content,
       },
     ])
@@ -262,6 +306,15 @@ const ChatTextarea: React.FC<ChatTextareaProps> = ({
     }
   }, [uploadImageMutation])

+  // Clean up object URLs to prevent memory leaks
+  useEffect(() => {
+    return () => {
+      uploadingImages.forEach((img) => {
+        URL.revokeObjectURL(img.previewUrl)
+      })
+    }
+  }, [uploadingImages])
+
   return (
@@ ... @@ const ChatTextarea: React.FC<ChatTextareaProps> = ({
-        {images.length > 0 && (
+        {(images.length > 0 || uploadingImages.length > 0) && (
@@ ... @@ const ChatTextarea: React.FC<ChatTextareaProps> = ({
             exit={{ opacity: 0, height: 0 }}
             transition={{ duration: 0.2, ease: 'easeInOut' }}
           >
-            {images.map((image) => (
+            {/* Show uploading images first */}
+            {uploadingImages.map((uploadingImage) => (
+              <motion.div key={uploadingImage.id} className="relative">
+                <img src={uploadingImage.previewUrl} alt="Uploading image" />
+                {/* Upload spinner */}
+                <div className="absolute inset-0 flex items-center justify-center">
+                  <div className="size-4 animate-spin rounded-full border-2 border-current border-t-transparent" />
+                </div>
+              </motion.div>
+            ))}
+
+            {/* Show completed images */}
+            {images.map((image, index) => (
@@ ... @@ const ChatTextarea: React.FC<ChatTextareaProps> = ({
                 transition={{ duration: 0.2, ease: 'easeInOut' }}
               >
                 <img src={`/api/file/${image.file_id}`} alt="Uploaded image" />
                 <Button
                   size="icon"
                   className="absolute -top-1 -right-1 size-4"
                   onClick={() =>
-                    setImages((prev) =>
-                      prev.filter((i) => i.file_id !== image.file_id)
-                    )
+                    setImages((prev) => prev.filter((_, i) => i !== index))
                   }
                 >
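Note on the flow above: the comment on the new mutation mentions handling both local and Jaaz uploads, but `mutationFn` only calls the local `uploadImage`. A minimal sketch of how the Jaaz branch could look, assuming a hypothetical `isLoggedIn` flag from `useAuth()` and the `uploadImageToJaaz` return value shown in `upload.ts`:

```typescript
// Illustrative sketch only, not part of this patch. `isLoggedIn` is an
// assumed name; uploadImageToJaaz resolving to the S3 URL matches upload.ts.
import { uploadImage, uploadImageToJaaz } from '@/api/upload'

async function uploadForChat(file: File, isLoggedIn: boolean) {
  if (isLoggedIn) {
    // Cloud path: the S3 URL would populate the new optional `url` field
    const url = await uploadImageToJaaz(file)
    return { url }
  }
  // Local path: the server returns file_id plus dimensions, as used in onSuccess
  const { file_id, width, height } = await uploadImage(file)
  return { file_id, width, height }
}
```

Returning `{ url }` for the cloud path lines up with the optional `url` field added to the `images` state above, while the local path keeps the existing `file_id` shape.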
diff --git a/react/src/components/chat/Message/Image.tsx b/react/src/components/chat/Message/Image.tsx
index dd1f0a72e..caf2c6cf2 100644
--- a/react/src/components/chat/Message/Image.tsx
+++ b/react/src/components/chat/Message/Image.tsx
@@ -26,16 +26,16 @@ const MessageImage = ({ content }: MessageImageProps) => {
     excalidrawAPI?.scrollToContent(id, { animate: true })
   }
   const id = filesArray.find((file) =>
-    content.image_url.url?.includes(file.url)
+    content.image_url?.url?.includes(file.url)
   )?.id
   return (
     <div>
-      <img src={content.image_url.url} alt="Image" />
+      <img src={content.image_url?.url} alt="Image" />
diff --git a/react/src/utils/imageUtils.ts b/react/src/utils/imageUtils.ts
index 8a901a236..ca200fb2e 100644
--- a/react/src/utils/imageUtils.ts
+++ b/react/src/utils/imageUtils.ts
@@ -12,7 +12,7 @@ interface ProcessedImage {
 /**
  * Convert file to base64 data URL
  */
-function fileToBase64(file: File): Promise<string> {
+export function fileToBase64(file: File): Promise<string> {
   return new Promise<string>((resolve, reject) => {
     const reader = new FileReader()
     reader.onload = () => resolve(reader.result as string)
diff --git a/server/services/langgraph_service/configs/image_vide_creator_config.py b/server/services/langgraph_service/configs/image_vide_creator_config.py
index 1a14a39ea..ee0045b8c 100644
--- a/server/services/langgraph_service/configs/image_vide_creator_config.py
+++ b/server/services/langgraph_service/configs/image_vide_creator_config.py
@@ -43,8 +43,23 @@
 3. If it is a video generation task, use video generation tools to generate the video. You can choose to generate the necessary images first, and then use the images to generate the video, or directly generate the video using text prompt.
 """
+
 class ImageVideoCreatorAgentConfig(BaseAgentConfig):
     def __init__(self, tool_list: List[ToolInfoJson]) -> None:
+
+        image_input_detection_prompt = """
+IMAGE INPUT DETECTION:
+When the user's message contains input images in XML format like:
+<input_images>
+  <image file_id="im_abc123.png" width="1024" height="768" />
+</input_images>
+
+You MUST:
+1. Parse the XML to extract the file_id attribute from each <image> tag
+2. Use tools that support the input_images parameter when images are present
+3. Pass the extracted file_id(s) in the input_images parameter as a list
+4. If the input_images count is > 1, only use generate_image_by_gpt_image_1_jaaz (it supports multiple images)
+5. For video generation → use video tools with input_images if images are present
+"""
+
         batch_generation_prompt = """
 BATCH GENERATION RULES:
@@ -74,6 +89,7 @@
 """
         full_system_prompt = system_prompt + \
+            image_input_detection_prompt + \
             batch_generation_prompt + error_handling_prompt

         # The image design agent does not need to switch to other agents
diff --git a/server/services/langgraph_service/configs/planner_config.py b/server/services/langgraph_service/configs/planner_config.py
index 0825095a5..de93f1fd2 100644
--- a/server/services/langgraph_service/configs/planner_config.py
+++ b/server/services/langgraph_service/configs/planner_config.py
@@ -9,8 +9,9 @@ class PlannerAgentConfig(BaseAgentConfig):
     def __init__(self) -> None:
         system_prompt = """
 You are a design planning writing agent. Answer and write plan in the SAME LANGUAGE as the user's prompt. You should do:
-    - Step 1. If it is a complex task requiring multiple steps, write a execution plan for the user's request using the SAME LANGUAGE AS THE USER'S PROMPT. You should breakdown the task into high level steps for the other agents to execute.
-    - Step 2. If it is a image/video generation or editing task, transfer the task to image_video_creator agent to generate the image based on the plan IMMEDIATELY, no need to ask for user's approval.
+    - Step 1. If the user provides input images, first analyze the images to understand their content, style, and elements before planning.
+    - Step 2. If it is a complex task requiring multiple steps, write an execution plan for the user's request using the SAME LANGUAGE AS THE USER'S PROMPT. You should break down the task into high-level steps for the other agents to execute.
+    - Step 3. If it is an image/video generation or editing task, transfer the task to the image_video_creator agent to generate the image based on the plan IMMEDIATELY, no need to ask for the user's approval.

 IMPORTANT RULES:
 1. You MUST complete the write_plan tool call and wait for its result BEFORE attempting to transfer to another agent
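The IMAGE INPUT DETECTION rules above ask the model, not server code, to pull `file_id`s out of the XML block built by `ChatTextarea.tsx`. As a sketch of what that extraction amounts to (an illustrative helper, not part of this patch; the `<image>` attribute names follow the format shown above):

```typescript
// Hypothetical helper: the real parsing is performed by the LLM per the prompt.
function extractInputImages(message: string): string[] {
  const block = message.match(/<input_images>([\s\S]*?)<\/input_images>/)
  if (!block) return []
  // Collect the file_id attribute of each <image ... /> entry
  return [...block[1].matchAll(/<image\b[^>]*?file_id="([^"]+)"/g)].map(
    (m) => m[1]
  )
}

// extractInputImages('<input_images>\n  <image file_id="im_a.png" width="512" height="512" />\n</input_images>')
// → ['im_a.png']
```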
diff --git a/server/tools/generate_image_by_flux_kontext_max_jaaz.py b/server/tools/generate_image_by_flux_kontext_max_jaaz.py
index 050e7a153..5c84c1f3b 100644
--- a/server/tools/generate_image_by_flux_kontext_max_jaaz.py
+++ b/server/tools/generate_image_by_flux_kontext_max_jaaz.py
@@ -11,9 +11,9 @@ class GenerateImageByFluxKontextMaxInputSchema(BaseModel):
     aspect_ratio: str = Field(
         description="Required. Aspect ratio of the image, only these values are allowed: 1:1, 16:9, 4:3, 3:4, 9:16. Choose the best fitting aspect ratio according to the prompt. Best ratio for posters is 3:4"
     )
-    input_image: str | None = Field(
+    input_images: list[str] | None = Field(
         default=None,
-        description="Optional; Image to use as reference. Pass an image_id here, e.g. 'im_jurheut7.png'. Best for image editing cases like: Editing specific parts of the image, Removing specific objects, Maintaining visual elements across scenes (character/object consistency), Generating new content in the style of the reference (style transfer), etc."
+        description="Optional; Image to use as reference. Only one image is allowed, e.g. ['im_jurheut7.png']. Best for image editing cases like: Editing specific parts of the image, Removing specific objects, Maintaining visual elements across scenes (character/object consistency), Generating new content in the style of the reference (style transfer), etc."
     )
     tool_call_id: Annotated[str, InjectedToolCallId]

@@ -26,7 +26,7 @@ async def generate_image_by_flux_kontext_max(
     aspect_ratio: str,
     config: RunnableConfig,
     tool_call_id: Annotated[str, InjectedToolCallId],
-    input_image: str | None = None,
+    input_images: list[str] | None = None,
 ) -> str:
     """
     Generate an image using Flux Kontext Max model via the provider framework
@@ -42,7 +42,7 @@
         model="black-forest-labs/flux-kontext-max",
         prompt=prompt,
         aspect_ratio=aspect_ratio,
-        input_images=[input_image] if input_image else None,
+        input_images=input_images,
     )

 # Export the tool for easy import
diff --git a/server/tools/generate_image_by_flux_kontext_pro_jaaz.py b/server/tools/generate_image_by_flux_kontext_pro_jaaz.py
index b80232478..3d926e147 100644
--- a/server/tools/generate_image_by_flux_kontext_pro_jaaz.py
+++ b/server/tools/generate_image_by_flux_kontext_pro_jaaz.py
@@ -12,9 +12,9 @@ class GenerateImageByFluxKontextProInputSchema(BaseModel):
     aspect_ratio: str = Field(
         description="Required. Aspect ratio of the image, only these values are allowed: 1:1, 16:9, 4:3, 3:4, 9:16. Choose the best fitting aspect ratio according to the prompt. Best ratio for posters is 3:4"
     )
-    input_image: str | None = Field(
+    input_images: list[str] | None = Field(
         default=None,
-        description="Optional; Image to use as reference. Pass an image_id here, e.g. 'im_jurheut7.png'. Best for image editing cases like: Editing specific parts of the image, Removing specific objects, Maintaining visual elements across scenes (character/object consistency), Generating new content in the style of the reference (style transfer), etc."
+        description="Optional; Image to use as reference. Only one image is allowed, e.g. ['im_jurheut7.png']. Best for image editing cases like: Editing specific parts of the image, Removing specific objects, Maintaining visual elements across scenes (character/object consistency), Generating new content in the style of the reference (style transfer), etc."
     )
     tool_call_id: Annotated[str, InjectedToolCallId]

@@ -27,7 +27,7 @@ async def generate_image_by_flux_kontext_pro_jaaz(
     aspect_ratio: str,
     config: RunnableConfig,
     tool_call_id: Annotated[str, InjectedToolCallId],
-    input_image: str | None = None,
+    input_images: list[str] | None = None,
 ) -> str:
     ctx = config.get('configurable', {})
     canvas_id = ctx.get('canvas_id', '')
@@ -39,7 +39,7 @@
         model='black-forest-labs/flux-kontext-pro',
         prompt=prompt,
         aspect_ratio=aspect_ratio,
-        input_images=[input_image] if input_image else None,
+        input_images=input_images,
     )

 # Export the tool for easy import
diff --git a/server/tools/generate_image_by_gpt_image_1_jaaz.py b/server/tools/generate_image_by_gpt_image_1_jaaz.py
index b28d8a08d..7fdb86a46 100644
--- a/server/tools/generate_image_by_gpt_image_1_jaaz.py
+++ b/server/tools/generate_image_by_gpt_image_1_jaaz.py
@@ -14,7 +14,7 @@ class GenerateImageByGptImage1InputSchema(BaseModel):
     )
     input_images: list[str] | None = Field(
         default=None,
-        description="Optional; One or multiple images to use as reference. Pass a list of image_id here, e.g. ['im_jurheut7.png', 'im_hfuiut78.png']. Best for image editing cases like: Editing specific parts of the image, Removing specific objects, Maintaining visual elements across scenes (character/object consistency), Generating new content in the style of the reference (style transfer), etc."
+        description="Optional; Multiple images are allowed. Pass a list of image_id here, e.g. ['im_jurheut7.png', 'im_hfuiut78.png']. Best for image editing cases like: Editing specific parts of the image, Removing specific objects, Maintaining visual elements across scenes (character/object consistency), Generating new content in the style of the reference (style transfer), etc."
     )
     tool_call_id: Annotated[str, InjectedToolCallId]
diff --git a/server/tools/generate_video_by_seedance_v1_jaaz.py b/server/tools/generate_video_by_seedance_v1_jaaz.py
index c9c51d79c..12543ab11 100644
--- a/server/tools/generate_video_by_seedance_v1_jaaz.py
+++ b/server/tools/generate_video_by_seedance_v1_jaaz.py
@@ -24,7 +24,7 @@ class GenerateVideoBySeedanceV1InputSchema(BaseModel):
     )
     input_images: list[str] | None = Field(
         default=None,
-        description="Optional. Images to use as reference or first frame. Pass a list of image_id here, e.g. ['im_jurheut7.png']."
+        description="Optional. Images to use as reference or first frame. Only one image is allowed, e.g. ['im_jurheut7.png']."
     )
     camera_fixed: bool = Field(
         default=True,
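The schema changes above leave the tools asymmetric: flux-kontext max/pro and seedance accept a single-element `input_images` list, while gpt-image-1 accepts several. An illustrative helper mirroring rule 4 of the detection prompt (the tool names are real; the single-image fallback choice is an assumption, since the prompt does not mandate one):

```typescript
// Illustrative only, not part of this patch.
type JaazImageTool =
  | 'generate_image_by_gpt_image_1_jaaz'
  | 'generate_image_by_flux_kontext_max_jaaz'
  | 'generate_image_by_flux_kontext_pro_jaaz'

function pickImageTool(inputImages: string[]): JaazImageTool {
  // Only gpt-image-1 accepts more than one reference image
  if (inputImages.length > 1) return 'generate_image_by_gpt_image_1_jaaz'
  // Any single-image tool works here; flux-kontext-pro chosen arbitrarily
  return 'generate_image_by_flux_kontext_pro_jaaz'
}
```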