Skip to content

Commit c3c6313

Browse files
authored
Added "system_prompt" input to Gemini nodes (Comfy-Org#11177)
1 parent 85c4b4a commit c3c6313

File tree

2 files changed

+51
-11
lines changed

2 files changed

+51
-11
lines changed

comfy_api_nodes/apis/gemini_api.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,15 +84,7 @@ class GeminiSystemInstructionContent(BaseModel):
8484
description="A list of ordered parts that make up a single message. "
8585
"Different parts may have different IANA MIME types.",
8686
)
87-
role: GeminiRole = Field(
88-
...,
89-
description="The identity of the entity that creates the message. "
90-
"The following values are supported: "
91-
"user: This indicates that the message is sent by a real person, typically a user-generated message. "
92-
"model: This indicates that the message is generated by the model. "
93-
"The model value is used to insert messages from model into the conversation during multi-turn conversations. "
94-
"For non-multi-turn conversations, this field can be left blank or unset.",
95-
)
87+
role: GeminiRole | None = Field(..., description="The role field of systemInstruction may be ignored.")
9688

9789

9890
class GeminiFunctionDeclaration(BaseModel):

comfy_api_nodes/nodes_gemini.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
GeminiMimeType,
2727
GeminiPart,
2828
GeminiRole,
29+
GeminiSystemInstructionContent,
30+
GeminiTextPart,
2931
Modality,
3032
)
3133
from comfy_api_nodes.util import (
@@ -42,6 +44,14 @@
4244

4345
GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini"
4446
GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024 # 20 MB
47+
GEMINI_IMAGE_SYS_PROMPT = (
48+
"You are an expert image-generation engine. You must ALWAYS produce an image.\n"
49+
"Interpret all user input—regardless of "
50+
"format, intent, or abstraction—as literal visual directives for image composition.\n"
51+
"If a prompt is conversational or lacks specific visual details, "
52+
"you must creatively invent a concrete visual scenario that depicts the concept.\n"
53+
"Prioritize generating the visual representation above any text, formatting, or conversational requests."
54+
)
4555

4656

4757
class GeminiModel(str, Enum):
@@ -276,6 +286,13 @@ def define_schema(cls):
276286
tooltip="Optional file(s) to use as context for the model. "
277287
"Accepts inputs from the Gemini Generate Content Input Files node.",
278288
),
289+
IO.String.Input(
290+
"system_prompt",
291+
multiline=True,
292+
default="",
293+
optional=True,
294+
tooltip="Foundational instructions that dictate an AI's behavior.",
295+
),
279296
],
280297
outputs=[
281298
IO.String.Output(),
@@ -348,6 +365,7 @@ async def execute(
348365
audio: Input.Audio | None = None,
349366
video: Input.Video | None = None,
350367
files: list[GeminiPart] | None = None,
368+
system_prompt: str = "",
351369
) -> IO.NodeOutput:
352370
validate_string(prompt, strip_whitespace=False)
353371

@@ -364,7 +382,10 @@ async def execute(
364382
if files is not None:
365383
parts.extend(files)
366384

367-
# Create response
385+
gemini_system_prompt = None
386+
if system_prompt:
387+
gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
388+
368389
response = await sync_op(
369390
cls,
370391
endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
@@ -374,7 +395,8 @@ async def execute(
374395
role=GeminiRole.user,
375396
parts=parts,
376397
)
377-
]
398+
],
399+
systemInstruction=gemini_system_prompt,
378400
),
379401
response_model=GeminiGenerateContentResponse,
380402
price_extractor=calculate_tokens_price,
@@ -524,6 +546,13 @@ def define_schema(cls):
524546
"'IMAGE+TEXT' to return both the generated image and a text response.",
525547
optional=True,
526548
),
549+
IO.String.Input(
550+
"system_prompt",
551+
multiline=True,
552+
default=GEMINI_IMAGE_SYS_PROMPT,
553+
optional=True,
554+
tooltip="Foundational instructions that dictate an AI's behavior.",
555+
),
527556
],
528557
outputs=[
529558
IO.Image.Output(),
@@ -547,6 +576,7 @@ async def execute(
547576
files: list[GeminiPart] | None = None,
548577
aspect_ratio: str = "auto",
549578
response_modalities: str = "IMAGE+TEXT",
579+
system_prompt: str = "",
550580
) -> IO.NodeOutput:
551581
validate_string(prompt, strip_whitespace=True, min_length=1)
552582
parts: list[GeminiPart] = [GeminiPart(text=prompt)]
@@ -560,6 +590,10 @@ async def execute(
560590
if files is not None:
561591
parts.extend(files)
562592

593+
gemini_system_prompt = None
594+
if system_prompt:
595+
gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
596+
563597
response = await sync_op(
564598
cls,
565599
endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
@@ -571,6 +605,7 @@ async def execute(
571605
responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
572606
imageConfig=None if aspect_ratio == "auto" else image_config,
573607
),
608+
systemInstruction=gemini_system_prompt,
574609
),
575610
response_model=GeminiGenerateContentResponse,
576611
price_extractor=calculate_tokens_price,
@@ -641,6 +676,13 @@ def define_schema(cls):
641676
tooltip="Optional file(s) to use as context for the model. "
642677
"Accepts inputs from the Gemini Generate Content Input Files node.",
643678
),
679+
IO.String.Input(
680+
"system_prompt",
681+
multiline=True,
682+
default=GEMINI_IMAGE_SYS_PROMPT,
683+
optional=True,
684+
tooltip="Foundational instructions that dictate an AI's behavior.",
685+
),
644686
],
645687
outputs=[
646688
IO.Image.Output(),
@@ -665,6 +707,7 @@ async def execute(
665707
response_modalities: str,
666708
images: Input.Image | None = None,
667709
files: list[GeminiPart] | None = None,
710+
system_prompt: str = "",
668711
) -> IO.NodeOutput:
669712
validate_string(prompt, strip_whitespace=True, min_length=1)
670713

@@ -680,6 +723,10 @@ async def execute(
680723
if aspect_ratio != "auto":
681724
image_config.aspectRatio = aspect_ratio
682725

726+
gemini_system_prompt = None
727+
if system_prompt:
728+
gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
729+
683730
response = await sync_op(
684731
cls,
685732
ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
@@ -691,6 +738,7 @@ async def execute(
691738
responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
692739
imageConfig=image_config,
693740
),
741+
systemInstruction=gemini_system_prompt,
694742
),
695743
response_model=GeminiGenerateContentResponse,
696744
price_extractor=calculate_tokens_price,

0 commit comments

Comments (0)