diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ee73467..6cad2238 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,19 +1,10 @@ -## [1.38.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.38.0...v1.38.1) (2025-02-15) +## [1.39.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.38.1-beta.1...v1.39.0-beta.1) (2025-02-17) -### Bug Fixes - -* filter links ([04b9197](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/04b91972e88b69b722454d54c8635dfb49b38b44)) - - -### Test - -* Add coverage improvement test for tests/test_scrape_do.py ([4ce6d1b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4ce6d1b94306d0ae94a74748726468a5132b7969)) - +### Features -### CI +* add the new handling exception ([5c0bc46](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5c0bc46c6322ea07efa31d95819d7da47462f981)) -* **release:** 1.38.1-beta.1 [skip ci] ([83be82a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/83be82a11e83eb2be60a945deac361c46526c785)) ## [1.38.1-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.38.0...v1.38.1-beta.1) (2025-02-13) diff --git a/docs/assets/api-banner.png b/docs/assets/api-banner.png index 06de517d..15b79c2f 100644 Binary files a/docs/assets/api-banner.png and b/docs/assets/api-banner.png differ diff --git a/pyproject.toml b/pyproject.toml index 07229ce8..3ce4c882 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.38.1" +version = "1.39.0b1" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 63eedd4d..79d3d46f 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -80,6 +80,7 @@ "llama3.2": 128000, "llama3.2:1b": 128000, "llama3.2:3b": 128000, + "llama3.3": 128000, "llama3.3:70b": 128000, "scrapegraph": 8192, "mistral-small": 128000, diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 58ec7772..f7f20cf8 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -3,6 +3,7 @@ """ import time +import json from typing import List, Optional from langchain.prompts import PromptTemplate @@ -120,7 +121,11 @@ def execute(self, state: dict) -> dict: else: if not isinstance(self.llm_model, ChatBedrock): output_parser = JsonOutputParser() - format_instructions = output_parser.get_format_instructions() + format_instructions = ( + "You must respond with a JSON object. Your response should be formatted as a valid JSON " + "with a 'content' field containing your analysis. For example:\n" + '{"content": "your analysis here"}' + ) else: output_parser = None format_instructions = "" @@ -131,13 +136,25 @@ def execute(self, state: dict) -> dict: and not self.script_creator or self.is_md_scraper ): - template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD - template_chunks_prompt = TEMPLATE_CHUNKS_MD - template_merge_prompt = TEMPLATE_MERGE_MD + template_no_chunks_prompt = ( + TEMPLATE_NO_CHUNKS_MD + "\n\nIMPORTANT: " + format_instructions + ) + template_chunks_prompt = ( + TEMPLATE_CHUNKS_MD + "\n\nIMPORTANT: " + format_instructions + ) + template_merge_prompt = ( + TEMPLATE_MERGE_MD + "\n\nIMPORTANT: " + format_instructions + ) else: - template_no_chunks_prompt = TEMPLATE_NO_CHUNKS - template_chunks_prompt = TEMPLATE_CHUNKS - template_merge_prompt = TEMPLATE_MERGE + template_no_chunks_prompt = ( + TEMPLATE_NO_CHUNKS + "\n\nIMPORTANT: " + format_instructions + ) + template_chunks_prompt = ( + TEMPLATE_CHUNKS + "\n\nIMPORTANT: " + format_instructions + ) + template_merge_prompt = ( + TEMPLATE_MERGE + "\n\nIMPORTANT: " + format_instructions + ) if self.additional_info is not None: template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt @@ -161,8 +178,9 @@ def execute(self, state: dict) -> dict: answer = self.invoke_with_timeout( chain, {"question": user_prompt}, self.timeout ) - except Timeout: - state.update({self.output[0]: {"error": "Response timeout exceeded"}}) + except (Timeout, json.JSONDecodeError) as e: + error_msg = "Response timeout exceeded" if isinstance(e, Timeout) else "Invalid JSON response format" + state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}}) return state state.update({self.output[0]: answer}) @@ -191,14 +209,9 @@ def execute(self, state: dict) -> dict: batch_results = self.invoke_with_timeout( async_runner, {"question": user_prompt}, self.timeout ) - except Timeout: - state.update( - { - self.output[0]: { - "error": "Response timeout exceeded during chunk processing" - } - } - ) + except (Timeout, json.JSONDecodeError) as e: + error_msg = "Response timeout exceeded during chunk processing" if isinstance(e, Timeout) else "Invalid JSON response format in chunk processing" + state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}}) return state merge_prompt = PromptTemplate( @@ -216,10 +229,9 @@ def execute(self, state: dict) -> dict: {"context": batch_results, "question": user_prompt}, self.timeout, ) - except Timeout: - state.update( - {self.output[0]: {"error": "Response timeout exceeded during merge"}} - ) + except (Timeout, json.JSONDecodeError) as e: + error_msg = "Response timeout exceeded during merge" if isinstance(e, Timeout) else "Invalid JSON response format during merge" + state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}}) return state state.update({self.output[0]: answer})