diff --git a/src/LlmContentEditor/Domain/Agent/ContentEditorAgent.php b/src/LlmContentEditor/Domain/Agent/ContentEditorAgent.php index 468552a4..78da06a0 100644 --- a/src/LlmContentEditor/Domain/Agent/ContentEditorAgent.php +++ b/src/LlmContentEditor/Domain/Agent/ContentEditorAgent.php @@ -200,6 +200,18 @@ protected function tools(): array ) )->setCallable(fn (string $url): string => $this->sitebuilderFacade->getRemoteAssetInfo($url)), + Tool::make( + 'fetch_remote_web_page', + 'Fetch textual content from a remote web page via cURL. Use this when the user asks to inspect, summarize, adapt, or copy content from an external URL. Returns JSON with response metadata and page content; on failure returns JSON with an "error" key.' + )->addProperty( + new ToolProperty( + 'url', + PropertyType::STRING, + 'The absolute URL to fetch (http or https).', + true + ) + )->setCallable(fn (string $url): string => $this->sitebuilderFacade->fetchRemoteWebPage($url)), + Tool::make( 'get_workspace_rules', 'Get project-specific rules from .sitebuilder/rules/ folders. Returns a JSON object where keys are rule names (filename without .md extension) and values are the rule contents (Markdown text). IMPORTANT: You must call this tool at least once at the start of every session to understand project-specific conventions and requirements.' diff --git a/src/LlmContentEditor/Infrastructure/ProgressMessageResolver.php b/src/LlmContentEditor/Infrastructure/ProgressMessageResolver.php index 4840259d..94368a92 100644 --- a/src/LlmContentEditor/Infrastructure/ProgressMessageResolver.php +++ b/src/LlmContentEditor/Infrastructure/ProgressMessageResolver.php @@ -52,6 +52,7 @@ private function messageForToolCalling(AgentEventDto $event, string $locale): ?s 'list_remote_content_asset_urls' => 'fetching_remote_asset_urls', 'search_remote_content_asset_urls' => 'searching_remote_assets', 'get_remote_asset_info' => 'getting_remote_asset_info', + 'fetch_remote_web_page' => 'fetching_remote_web_page', 'suggest_commit_message' => 'suggesting_commit_message', 'get_preview_url' => $label !== null ? 'getting_preview_url' : 'getting_preview_url_only', default => $label !== null ? 'running_tool_on' : null, diff --git a/src/ProjectMgmt/Domain/ValueObject/AgentConfigTemplate.php b/src/ProjectMgmt/Domain/ValueObject/AgentConfigTemplate.php index 69db55a1..a4321d3c 100644 --- a/src/ProjectMgmt/Domain/ValueObject/AgentConfigTemplate.php +++ b/src/ProjectMgmt/Domain/ValueObject/AgentConfigTemplate.php @@ -70,6 +70,11 @@ private static function defaultTemplate(): self - Call list_remote_content_asset_urls to get a JSON array of all remote asset URLs configured for this project. Use these URLs directly (e.g. in img src). If the tool returns an empty array, no remote manifests are configured. - Call get_remote_asset_info with a URL to retrieve metadata (width, height, mimeType, sizeInBytes) for a remote image without downloading it. Use this when you need dimensions or format for embedding. +REMOTE WEB PAGES: +- If the user asks you to inspect, summarize, adapt, or copy content from an external page, call fetch_remote_web_page with the page URL. +- fetch_remote_web_page returns JSON with fields like statusCode, contentType, finalUrl, content, and truncated. +- Only use this for http/https URLs. If the tool returns an error JSON object, explain the issue and ask for another URL. + WORKSPACE RULES: - Projects may define custom rules in .sitebuilder/rules/ folders (Markdown files) - You MUST call get_workspace_rules whenever you start working on a task diff --git a/src/WorkspaceTooling/Facade/WorkspaceToolingFacade.php b/src/WorkspaceTooling/Facade/WorkspaceToolingFacade.php index cb41129a..ba720182 100644 --- a/src/WorkspaceTooling/Facade/WorkspaceToolingFacade.php +++ b/src/WorkspaceTooling/Facade/WorkspaceToolingFacade.php @@ -16,6 +16,12 @@ final class WorkspaceToolingFacade extends BaseWorkspaceToolingFacade implements WorkspaceToolingServiceInterface { + private const string WORKSPACE_MOUNT_POINT = '/workspace'; + private const string CURL_META_MARKER = '__PB_CURL_META__'; + private const int REMOTE_WEB_PAGE_MAX_BYTES = 50_000; + private const int REMOTE_WEB_PAGE_TIMEOUT_SECONDS = 20; + private const int REMOTE_WEB_PAGE_CONNECT_TIMEOUT_SECONDS = 10; + public function __construct( FileOperationsServiceInterface $fileOperationsService, TextOperationsService $textOperationsService, @@ -228,6 +234,134 @@ public function getWorkspaceRules(): string return json_encode($rules, JSON_THROW_ON_ERROR); } + public function fetchRemoteWebPage(string $url): string + { + $normalizedUrl = trim($url); + if (!$this->isAllowedRemoteWebPageUrl($normalizedUrl)) { + return $this->encodeRemoteWebPageError( + 'Invalid URL. Only absolute http/https URLs are supported.', + $normalizedUrl + ); + } + + $workspacePath = $this->executionContext->getWorkspacePath(); + if ($workspacePath === null || $workspacePath === '' || !is_dir($workspacePath)) { + return $this->encodeRemoteWebPageError( + 'Execution context not set. Cannot resolve workspace path for cURL fetch.', + $normalizedUrl + ); + } + + try { + $output = $this->shellOperationsService->runCommand( + self::WORKSPACE_MOUNT_POINT, + $this->buildFetchRemoteWebPageCommand($normalizedUrl) + ); + } catch (Throwable $throwable) { + return $this->encodeRemoteWebPageError( + 'Failed to fetch remote page: ' . $throwable->getMessage(), + $normalizedUrl + ); + } + + $markerPos = strrpos($output, self::CURL_META_MARKER); + if ($markerPos === false) { + return $this->encodeRemoteWebPageError( + 'cURL output did not contain expected metadata.', + $normalizedUrl + ); + } + + $contentEnd = $markerPos; + if ($contentEnd > 0 && $output[$contentEnd - 1] === "\n") { + --$contentEnd; + } + + $content = substr($output, 0, $contentEnd); + $metaRaw = trim(substr($output, $markerPos + strlen(self::CURL_META_MARKER))); + + if (!preg_match('/^(\d{3})\t([^\t]*)\t(\S+)/', $metaRaw, $matches)) { + return $this->encodeRemoteWebPageError( + 'Failed to parse metadata from cURL output.', + $normalizedUrl + ); + } + + $statusCode = (int) $matches[1]; + $contentType = $matches[2]; + $finalUrl = $matches[3]; + + $truncated = false; + if (strlen($content) > self::REMOTE_WEB_PAGE_MAX_BYTES) { + $content = substr($content, 0, self::REMOTE_WEB_PAGE_MAX_BYTES); + $truncated = true; + } + + return $this->encodeJsonSafe([ + 'url' => $normalizedUrl, + 'finalUrl' => $finalUrl, + 'statusCode' => $statusCode, + 'contentType' => $contentType, + 'content' => $content, + 'truncated' => $truncated, + ]); + } + + private function buildFetchRemoteWebPageCommand(string $url): string + { + $escapedUrl = escapeshellarg($url); + $writeOutFormat = escapeshellarg('\n' . self::CURL_META_MARKER . '%{http_code}\t%{content_type}\t%{url_effective}'); + + return sprintf( + 'curl -L -sS --max-time %d --connect-timeout %d --output - --write-out %s %s', + self::REMOTE_WEB_PAGE_TIMEOUT_SECONDS, + self::REMOTE_WEB_PAGE_CONNECT_TIMEOUT_SECONDS, + $writeOutFormat, + $escapedUrl + ); + } + + private function isAllowedRemoteWebPageUrl(string $url): bool + { + $parsed = parse_url($url); + if (!is_array($parsed)) { + return false; + } + + $scheme = $parsed['scheme'] ?? null; + $host = $parsed['host'] ?? null; + if (!is_string($scheme) || !is_string($host)) { + return false; + } + + if (!in_array(strtolower($scheme), ['http', 'https'], true)) { + return false; + } + + return $host !== ''; + } + + private function encodeRemoteWebPageError(string $error, string $url): string + { + return $this->encodeJsonSafe([ + 'error' => $error, + 'url' => $url, + ]); + } + + /** + * @param array $payload + */ + private function encodeJsonSafe(array $payload): string + { + $json = json_encode($payload, JSON_INVALID_UTF8_SUBSTITUTE); + if (is_string($json)) { + return $json; + } + + return '{"error":"Unable to encode JSON response.","url":""}'; + } + public function runBuildInWorkspace(string $workspacePath, string $agentImage): string { return $this->dockerExecutor->run( diff --git a/src/WorkspaceTooling/Facade/WorkspaceToolingServiceInterface.php b/src/WorkspaceTooling/Facade/WorkspaceToolingServiceInterface.php index f2da7bd4..e10afa51 100644 --- a/src/WorkspaceTooling/Facade/WorkspaceToolingServiceInterface.php +++ b/src/WorkspaceTooling/Facade/WorkspaceToolingServiceInterface.php @@ -74,6 +74,13 @@ public function getRemoteAssetInfo(string $url): string; */ public function getWorkspaceRules(): string; + /** + * Fetch the textual content of a remote web page via cURL. + * Returns JSON with keys: url, finalUrl, statusCode, contentType, content, truncated. + * On failure, returns JSON with keys: error, url. Never throws. + */ + public function fetchRemoteWebPage(string $url): string; + /** * Run build (npm run build) in the specified workspace. * diff --git a/tests/Unit/LlmContentEditor/ContentEditorAgentTest.php b/tests/Unit/LlmContentEditor/ContentEditorAgentTest.php index 1240882e..39847015 100644 --- a/tests/Unit/LlmContentEditor/ContentEditorAgentTest.php +++ b/tests/Unit/LlmContentEditor/ContentEditorAgentTest.php @@ -15,6 +15,7 @@ use NeuronAI\Chat\Messages\ToolCallResultMessage; use NeuronAI\Chat\Messages\UserMessage; use NeuronAI\Tools\Tool; +use NeuronAI\Tools\ToolInterface; use PHPUnit\Framework\TestCase; use ReflectionMethod; @@ -183,6 +184,28 @@ public function testAgentUsesProvidedConfig(): void self::assertSame(['Custom output'], $outputInstructions); } + public function testToolsContainFetchRemoteWebPageTool(): void + { + $agent = new ContentEditorAgent( + $this->createMockWorkspaceTooling(), + LlmModelName::defaultForContentEditor(), + 'sk-test-key', + $this->createDefaultAgentConfig() + ); + $ref = new ReflectionMethod(ContentEditorAgent::class, 'tools'); + $ref->setAccessible(true); + + /** @var list $tools */ + $tools = $ref->invoke($agent); + + $toolNames = array_map( + static fn (ToolInterface $tool): string => $tool->getName(), + $tools + ); + + self::assertContains('fetch_remote_web_page', $toolNames); + } + private function createMockWorkspaceTooling(): WorkspaceToolingServiceInterface { return $this->createMock(WorkspaceToolingServiceInterface::class); diff --git a/tests/Unit/LlmContentEditor/ProgressMessageResolverTest.php b/tests/Unit/LlmContentEditor/ProgressMessageResolverTest.php index b3a27c24..37d1b7a0 100644 --- a/tests/Unit/LlmContentEditor/ProgressMessageResolverTest.php +++ b/tests/Unit/LlmContentEditor/ProgressMessageResolverTest.php @@ -103,6 +103,19 @@ public function testToolCallingGetWorkspaceRulesTranslates(): void self::assertSame('Loading workspace rules', $resolver->messageForEvent($event, 'en')); } + public function testToolCallingFetchRemoteWebPageTranslates(): void + { + $translator = $this->createMock(TranslatorInterface::class); + $translator->method('trans') + ->with('fetching_remote_web_page', [], 'progress', 'en') + ->willReturn('Fetching remote web page'); + + $resolver = $this->createResolver($translator); + $event = new AgentEventDto('tool_calling', 'fetch_remote_web_page'); + + self::assertSame('Fetching remote web page', $resolver->messageForEvent($event, 'en')); + } + public function testInferenceStopReturnsNull(): void { $translator = $this->createMock(TranslatorInterface::class); diff --git a/tests/Unit/ProjectMgmt/AgentConfigTemplateTest.php b/tests/Unit/ProjectMgmt/AgentConfigTemplateTest.php index 17493364..0d9fcda4 100644 --- a/tests/Unit/ProjectMgmt/AgentConfigTemplateTest.php +++ b/tests/Unit/ProjectMgmt/AgentConfigTemplateTest.php @@ -45,6 +45,15 @@ public function testDefaultTemplateContainsRemoteContentAssetsSection(): void self::assertStringContainsString('get_remote_asset_info', $template->backgroundInstructions); } + public function testDefaultTemplateContainsRemoteWebPagesSection(): void + { + $template = AgentConfigTemplate::forProjectType(ProjectType::DEFAULT); + + self::assertStringContainsString('REMOTE WEB PAGES', $template->backgroundInstructions); + self::assertStringContainsString('fetch_remote_web_page', $template->backgroundInstructions); + self::assertStringContainsString('http/https', $template->backgroundInstructions); + } + public function testDefaultTemplateContainsWorkspaceRulesSection(): void { $template = AgentConfigTemplate::forProjectType(ProjectType::DEFAULT); diff --git a/tests/Unit/WorkspaceTooling/WorkspaceToolingFacadeTest.php b/tests/Unit/WorkspaceTooling/WorkspaceToolingFacadeTest.php index 29f7f2c1..8e1b84bb 100644 --- a/tests/Unit/WorkspaceTooling/WorkspaceToolingFacadeTest.php +++ b/tests/Unit/WorkspaceTooling/WorkspaceToolingFacadeTest.php @@ -403,17 +403,109 @@ public function testSearchRemoteContentAssetUrlsMatchesFullUrlPathAndDomain(): v self::assertContains('https://uploads.com/bar.png', $decoded); } + public function testFetchRemoteWebPageReturnsParsedJsonForValidUrl(): void + { + $this->executionContext->setContext( + 'workspace-id', + $this->tempDir, + null, + 'project', + 'image' + ); + $shellOps = $this->createMock(ShellOperationsServiceInterface::class); + $shellOps->expects(self::once()) + ->method('runCommand') + ->with( + '/workspace', + self::callback(static fn (string $command): bool => str_contains($command, 'curl -L -sS')) + ) + ->willReturn("Hello\n__PB_CURL_META__200\ttext/html\thttps://example.com/final"); + $facade = $this->createFacadeWithShellOps($shellOps); + + $result = $facade->fetchRemoteWebPage('https://example.com/start'); + + $decoded = json_decode($result, true, 512, JSON_THROW_ON_ERROR); + self::assertIsArray($decoded); + self::assertSame('https://example.com/start', $decoded['url']); + self::assertSame('https://example.com/final', $decoded['finalUrl']); + self::assertSame(200, $decoded['statusCode']); + self::assertSame('text/html', $decoded['contentType']); + self::assertSame('Hello', $decoded['content']); + self::assertFalse($decoded['truncated']); + } + + public function testFetchRemoteWebPageReturnsErrorForInvalidSchemeWithoutShellCall(): void + { + $shellOps = $this->createMock(ShellOperationsServiceInterface::class); + $shellOps->expects(self::never())->method('runCommand'); + $facade = $this->createFacadeWithShellOps($shellOps); + + $result = $facade->fetchRemoteWebPage('ftp://example.com/file.txt'); + + $decoded = json_decode($result, true, 512, JSON_THROW_ON_ERROR); + self::assertIsArray($decoded); + self::assertSame('ftp://example.com/file.txt', $decoded['url']); + self::assertArrayHasKey('error', $decoded); + self::assertIsString($decoded['error']); + self::assertStringContainsString('Invalid URL', $decoded['error']); + } + + public function testFetchRemoteWebPageReturnsErrorWhenCurlOutputCannotBeParsed(): void + { + $this->executionContext->setContext( + 'workspace-id', + $this->tempDir, + null, + 'project', + 'image' + ); + $shellOps = $this->createMock(ShellOperationsServiceInterface::class); + $shellOps->expects(self::once()) + ->method('runCommand') + ->willReturn('curl: (6) Could not resolve host: no-such-host.invalid'); + $facade = $this->createFacadeWithShellOps($shellOps); + + $result = $facade->fetchRemoteWebPage('https://no-such-host.invalid'); + + $decoded = json_decode($result, true, 512, JSON_THROW_ON_ERROR); + self::assertIsArray($decoded); + self::assertSame('https://no-such-host.invalid', $decoded['url']); + self::assertArrayHasKey('error', $decoded); + self::assertIsString($decoded['error']); + self::assertStringContainsString('metadata', strtolower($decoded['error'])); + } + + public function testFetchRemoteWebPageTruncatesLargeResponses(): void + { + $this->executionContext->setContext( + 'workspace-id', + $this->tempDir, + null, + 'project', + 'image' + ); + $largeBody = str_repeat('A', 60_000); + $shellOps = $this->createMock(ShellOperationsServiceInterface::class); + $shellOps->expects(self::once()) + ->method('runCommand') + ->willReturn($largeBody . "\n__PB_CURL_META__200\ttext/plain\thttps://example.com/long"); + $facade = $this->createFacadeWithShellOps($shellOps); + + $result = $facade->fetchRemoteWebPage('https://example.com/long'); + + $decoded = json_decode($result, true, 512, JSON_THROW_ON_ERROR); + self::assertIsArray($decoded); + self::assertTrue($decoded['truncated']); + self::assertIsString($decoded['content']); + self::assertSame(50_000, strlen($decoded['content'])); + } + private function createFacade(): WorkspaceToolingFacade { - $fileOps = new FileOperationsService(); - $textOps = new TextOperationsService($fileOps); - $shellOps = $this->createMock(ShellOperationsServiceInterface::class); $remoteContentAssetsFacade = $this->createMock(RemoteContentAssetsFacadeInterface::class); - // DockerExecutor is final, so we create a real instance with dummy paths - // The tests don't call runBuildInWorkspace, so this is safe - $dockerExecutor = new DockerExecutor('/tmp', '/tmp'); + $shellOps = $this->createMock(ShellOperationsServiceInterface::class); - return new WorkspaceToolingFacade($fileOps, $textOps, $shellOps, $this->executionContext, $remoteContentAssetsFacade, $dockerExecutor); + return $this->createFacadeWithDependencies($shellOps, $remoteContentAssetsFacade); } public function testGetRemoteAssetInfoReturnsErrorJsonWhenFacadeReturnsNull(): void @@ -454,14 +546,36 @@ public function testGetRemoteAssetInfoReturnsJsonWhenFacadeReturnsDto(): void private function createFacadeWithRemoteContentAssets(RemoteContentAssetsFacadeInterface $remoteContentAssetsFacade): WorkspaceToolingFacade { - $fileOps = new FileOperationsService(); - $textOps = new TextOperationsService($fileOps); $shellOps = $this->createMock(ShellOperationsServiceInterface::class); + + return $this->createFacadeWithDependencies($shellOps, $remoteContentAssetsFacade); + } + + private function createFacadeWithShellOps(ShellOperationsServiceInterface $shellOps): WorkspaceToolingFacade + { + $remoteContentAssetsFacade = $this->createMock(RemoteContentAssetsFacadeInterface::class); + + return $this->createFacadeWithDependencies($shellOps, $remoteContentAssetsFacade); + } + + private function createFacadeWithDependencies( + ShellOperationsServiceInterface $shellOps, + RemoteContentAssetsFacadeInterface $remoteContentAssetsFacade + ): WorkspaceToolingFacade { + $fileOps = new FileOperationsService(); + $textOps = new TextOperationsService($fileOps); // DockerExecutor is final, so we create a real instance with dummy paths // The tests don't call runBuildInWorkspace, so this is safe $dockerExecutor = new DockerExecutor('/tmp', '/tmp'); - return new WorkspaceToolingFacade($fileOps, $textOps, $shellOps, $this->executionContext, $remoteContentAssetsFacade, $dockerExecutor); + return new WorkspaceToolingFacade( + $fileOps, + $textOps, + $shellOps, + $this->executionContext, + $remoteContentAssetsFacade, + $dockerExecutor + ); } private function removeDirectory(string $dir): void diff --git a/translations/progress.de.yaml b/translations/progress.de.yaml index 3df5efd9..84dd8ceb 100644 --- a/translations/progress.de.yaml +++ b/translations/progress.de.yaml @@ -21,6 +21,7 @@ running_build: "Führe Build aus" fetching_remote_asset_urls: "Rufe Remote-Asset-URLs ab" searching_remote_assets: "Suche Remote-Assets" getting_remote_asset_info: "Hole Remote-Asset-Infos" +fetching_remote_web_page: "Rufe Remote-Webseite ab" suggesting_commit_message: "Schlage Commit-Nachricht vor" getting_preview_url: "Hole Vorschau-URL für %label%" getting_preview_url_only: "Vorschau-URL holen" diff --git a/translations/progress.en.yaml b/translations/progress.en.yaml index e3dd7fdf..8e08debe 100644 --- a/translations/progress.en.yaml +++ b/translations/progress.en.yaml @@ -21,6 +21,7 @@ running_build: "Running build" fetching_remote_asset_urls: "Fetching remote asset URLs" searching_remote_assets: "Searching remote assets" getting_remote_asset_info: "Getting remote asset info" +fetching_remote_web_page: "Fetching remote web page" suggesting_commit_message: "Suggesting commit message" getting_preview_url: "Getting preview URL for %label%" getting_preview_url_only: "Getting preview URL"