MigoXLab · seancoding-day · Oct 15, 2025 · Oct 17, 2025 · Oct 21, 2025 · Oct 21, 2025
diff --git a/webqa_agent/actions/action_executor.py b/webqa_agent/actions/action_executor.py
diff --git a/webqa_agent/actions/action_handler.py b/webqa_agent/actions/action_handler.py
diff --git a/webqa_agent/llm/prompt.py b/webqa_agent/llm/prompt.py
@@ -14,12 +14,11 @@ class LLMPrompt:
 
     ## Context Provided
     - **`pageDescription (interactive elements)`**: A map of all interactive elements on the page, each with a unique ID. Use these IDs for actions.
-    - **`page_structure (full text content)`**: The complete text content of the page, including non-interactive elements.
     - **`Screenshot`**: A visual capture of the current page state.
 
     ## Objective
     - Decompose the user's instruction into a **series of actionable steps**, each representing a single UI interaction.
-    - **Unified Context Analysis**: You MUST analyze BOTH `pageDescription` and `page_structure` together. Use `page_structure` to understand the meaning and context of the interactive elements in `pageDescription` (e.g., matching a label to a nearby input field). This unified view is critical for making correct decisions.
+    - **Unified Context Analysis**: Analyze the `pageDescription` together with the visual `Screenshot`. Use the screenshot to understand the spatial layout and context of the interactive elements (e.g., matching a label to a nearby input field based on their visual positions). This unified view is critical for making correct decisions.
     - Identify and locate the target element if applicable.
     - Validate if the planned target matches the user's intent, especially in cases of **duplicate or ambiguous elements**.
     - Avoid redundant operations such as repeated scrolling or re-executing completed steps.
@@ -187,8 +186,8 @@ class LLMPrompt:
     - Example: if you see element '1' with internal id 917, use "id": "1" in your action
 
     ### Contextual Decision Making:
-    - **Crucially, use the `page_structure` (full text content) to understand the context of the interactive elements from `pageDescription`**. For example, if `page_structure` shows "Username:" next to an input field, you know that input field is for the username.
-    - If you see error text like "Invalid email format" in `page_structure`, use this information to correct your next action.
+    - **Crucially, use the `Screenshot` to understand the context of the interactive elements from `pageDescription`**. For example, if the screenshot shows "Username:" next to an input field, you know that input field is for the username.
+    - If you see error text like "Invalid email format" in the screenshot, use this information to correct your next action.
 
     ### Supported Actions:
     - Tap: Click on a specified page element (such as a button or link). Typically used to trigger a click event.

diff --git a/webqa_agent/testers/case_gen/graph.py b/webqa_agent/testers/case_gen/graph.py
@@ -85,30 +85,25 @@ async def plan_test_cases(state: MainGraphState) -> Dict[str, List[Dict[str, Any
     logging.info(f"Deep crawling page structure and elements for initial test plan...")
     page = await ui_tester.get_current_page()
     dp = DeepCrawler(page)
-    await dp.crawl(highlight=True, viewport_only=True)
+    await dp.crawl(highlight=True, viewport_only=False)
     screenshot = await ui_tester._actions.b64_page_screenshot(
-        file_name="plan_or_replan", save_to_log=False, full_page=False
+        file_name="plan_or_replan", save_to_log=False, full_page=True
     )
     await dp.remove_marker()
-    await dp.crawl(highlight=False, filter_text=True, viewport_only=True)
+    await dp.crawl(highlight=False, filter_text=True, viewport_only=False)
     page_structure = dp.get_text()
     logging.debug(f"----- plan cases ---- Page structure: {page_structure}")
 
     business_objectives = state.get("business_objectives", "No specific business objectives provided.")
-    completed_cases = state.get("completed_cases")
 
     language = state.get('language', 'zh-CN')
     system_prompt = get_test_case_planning_system_prompt(
         business_objectives=business_objectives,
-        completed_cases=completed_cases,
         language=language,
     )
 
     user_prompt = get_test_case_planning_user_prompt(
         state_url=state["url"],
-        completed_cases=completed_cases,
-        reflection_history=state.get("reflection_history"),
-        remaining_objectives=state.get("remaining_objectives"),
     )
 
     logging.info("Generating initial test plan - Sending request to LLM...")
@@ -283,7 +278,7 @@ async def reflect_and_replan(state: MainGraphState) -> dict:
     # Use DeepCrawler to get interactive elements mapping and highlighted screenshot
     logging.info(f"Deep crawling page structure and elements for reflection and replanning analysis...")
     dp = DeepCrawler(page)
-    curr = await dp.crawl(highlight=True, viewport_only=True)
+    curr = await dp.crawl(highlight=True, viewport_only=False)
     # Include position information for better replanning decisions
     reflect_template = [
         str(ElementKey.TAG_NAME),
@@ -294,9 +289,9 @@ async def reflect_and_replan(state: MainGraphState) -> dict:
     ]
     page_content_summary = curr.clean_dict(reflect_template)
     logging.debug(f"current page crawled result: {page_content_summary}")
-    screenshot = await ui_tester._actions.b64_page_screenshot(file_name="reflection", save_to_log=False, full_page=False)
+    screenshot = await ui_tester._actions.b64_page_screenshot(file_name="reflection", save_to_log=False, full_page=True)
     await dp.remove_marker()
-    await dp.crawl(highlight=False, filter_text=True, viewport_only=True)
+    await dp.crawl(highlight=False, filter_text=True, viewport_only=False)
     page_structure = dp.get_text()
     logging.debug(f"----- reflection ---- Page structure: {page_structure}")
 

diff --git a/webqa_agent/testers/case_gen/prompts/agent_prompts.py b/webqa_agent/testers/case_gen/prompts/agent_prompts.py
@@ -27,6 +27,12 @@ def get_execute_system_prompt(case: dict) -> str:
 - **Layout Comprehension**: Analyze the layout to understand the spatial relationship between elements, which is crucial for complex interactions.
 - **Anomaly Detection**: Identify unexpected visual states like error pop-ups, unloaded content, or graphical glitches that may not be present in the text structure.
 
+**IMPORTANT - Automatic Viewport Management**:
+The system automatically handles element visibility through intelligent scrolling. When you interact with elements (click, hover, type), the system will automatically scroll to ensure the element is in the viewport before performing the action. You do NOT need to manually scroll to elements or worry about elements being outside the visible area. Simply reference elements by their identifiers, and the system will handle viewport positioning automatically.
+
+**IMPORTANT - Screenshot Context**:
+The screenshots you receive during test execution show ONLY the current viewport (visible portion of the page), not the entire webpage. While test planning may reference elements from full-page screenshots, your execution screenshots are viewport-limited. This is intentional - the automatic viewport management system ensures that any element you need to interact with will be scrolled into the viewport before your action executes. If you cannot see an element in the current screenshot but it was referenced in the test plan, trust that the system will handle the scrolling automatically.
+
 ## Available Tools
 You have access to two specialized testing tools:
 
@@ -281,6 +287,43 @@ def get_execute_system_prompt(case: dict) -> str:
 2. Check for dynamic content appearance
 3. Retry interaction after content stabilization
 
+### Pattern 5: Automatic Scroll Management Failures
+**Scenario**: Element interaction fails due to scroll or viewport positioning issues
+**Recognition Signals**:
+- Error messages containing "element not in viewport", "not visible", "not clickable", or "scroll failed"
+- Element was referenced in test plan from full-page screenshot but not visible in current viewport
+- Interaction timeout errors for elements that should exist
+
+**Understanding the Issue**:
+The system uses automatic viewport management with intelligent scrolling. When you interact with elements (click, hover, type), the system automatically scrolls to ensure the element is in the viewport BEFORE executing your action. This process:
+1. Detects if the target element is outside the viewport
+2. Attempts scroll using CSS selector → XPath → coordinate-based fallback
+3. Implements retry logic for lazy-loaded content (up to 3 attempts)
+4. Waits for page stability after scroll (handles infinite scroll and dynamic loading)
+
+**Recovery Solution**:
+If automatic scroll fails, the error will indicate the specific issue:
+1. **Element Not Found**: Element may not exist yet due to lazy loading
+   - Use `execute_ui_action(action='Sleep', value='2000')` to wait for content to load
+   - Verify element identifier is correct by checking page structure
+   - Consider that element may appear conditionally based on previous actions
+
+2. **Scroll Timeout**: Page is loading slowly or has infinite scroll
+   - Increase wait time: `execute_ui_action(action='Sleep', value='3000')`
+   - Manually trigger scroll if needed: `execute_ui_action(action='Scroll', value='down')`
+   - Check for loading spinners or progress indicators
+
+3. **Element Obscured**: Element exists but is covered by another element (modal, overlay, popup)
+   - Close the obscuring element first (dismiss modal, close popup)
+   - Use `execute_ui_action(action='KeyboardPress', value='Escape')` to dismiss overlays
+   - Verify no sticky headers or floating elements are blocking the target
+
+**Important Notes**:
+- You do NOT need to manually scroll in normal circumstances - the system handles this automatically
+- Only use manual scroll actions when automatic scroll explicitly fails with error messages
+- If a scroll failure persists after the recovery steps above, report the error as-is - such failures are rare and indicate system issues
+- Trust the automatic viewport management for elements referenced from full-page planning screenshots
+
 ## Test Execution Examples
 
 ### Example 1: Form Field Validation Recovery
@@ -330,6 +373,29 @@ def get_execute_system_prompt(case: dict) -> str:
 **Tool Response**: `[SUCCESS] Action 'Input' on 'username field' completed successfully`
 **Agent Reporting**: Report completion of the single action and allow framework to proceed to next step
 
+### Example 8: Mouse Action - Cursor Positioning
+**Context**: Drawing canvas requiring precise cursor positioning
+**Action**: `execute_ui_action(action='Mouse', target='canvas drawing area', value='move:250,150', description='Position cursor at specific canvas coordinates for drawing')`
+**Tool Response**: `[SUCCESS] Action 'Mouse' on 'canvas drawing area' completed successfully. Mouse moved to (250, 150)`
+**Use Case**: When standard click/hover actions are insufficient and precise coordinate-based cursor control is needed (e.g., drawing tools, custom interactive visualizations, coordinate-based maps)
+
+### Example 9: Mouse Action - Wheel Scrolling
+**Context**: Custom scrollable container with horizontal scroll
+**Action**: `execute_ui_action(action='Mouse', target='horizontal gallery container', value='wheel:100,0', description='Scroll gallery horizontally to the right')`
+**Tool Response**: `[SUCCESS] Action 'Mouse' on 'horizontal gallery container' completed successfully. Mouse wheel scrolled (deltaX: 100, deltaY: 0)`
+**Use Case**: When the standard Scroll action doesn't support custom scroll directions, or when precise delta control is needed (e.g., horizontal scrolling, custom scroll containers)
+
+### Example 10: Page Navigation Actions
+**Context 1 - Direct Navigation**: Navigate to specific URL for cross-site testing
+**Action**: `execute_ui_action(action='GoToPage', target='https://example.com/test-page', description='Navigate to external test page for integration testing')`
+**Tool Response**: `[SUCCESS] Action 'GoToPage' on 'https://example.com/test-page' completed successfully. Navigated to page`
+**Use Case**: Direct URL navigation for multi-site workflows, external authentication redirects, or testing cross-domain functionality
+
+**Context 2 - Browser Back**: Return to previous page after completing action
+**Action**: `execute_ui_action(action='GoBack', target='', description='Navigate back to main product listing page')`
+**Tool Response**: `[SUCCESS] Action 'GoBack' completed successfully. Successfully navigated back to previous page`
+**Use Case**: Test browser back button functionality, validate state preservation after navigation, or reset to previous page state
+
 ## Test Completion Protocol
 When all test steps are completed or an unrecoverable error occurs: