Skip to content

Pre/beta #943

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
## [1.41.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.40.1...v1.41.0-beta.1) (2025-03-07)


### Features

* add CLoD integration ([4e0e785](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4e0e78582c3a75e64c5eba26ce40b5ffbf05d58e))


### Test

* Add coverage improvement test for tests/test_generate_answer_node.py ([6769c0d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6769c0d43ab72f1c8b520dd28d19f747b22f9b7c))

## [1.40.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.40.0...v1.40.1) (2025-02-27)


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "scrapegraphai"

version = "1.40.1"
version = "1.41.0b1"



Expand Down
7 changes: 6 additions & 1 deletion scrapegraphai/graphs/abstract_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pydantic import BaseModel

from ..helpers import models_tokens
from ..models import DeepSeek, OneApi
from ..models import CLoD, DeepSeek, OneApi
from ..utils.logging import set_verbosity_info, set_verbosity_warning


Expand Down Expand Up @@ -164,6 +164,7 @@ def _create_llm(self, llm_config: dict) -> object:
"deepseek",
"ernie",
"fireworks",
"clod",
"togetherai",
}

Expand Down Expand Up @@ -218,6 +219,7 @@ def _create_llm(self, llm_config: dict) -> object:
"ernie",
"deepseek",
"togetherai",
"clod",
}:
if llm_params["model_provider"] == "bedrock":
llm_params["model_kwargs"] = {
Expand All @@ -229,6 +231,9 @@ def _create_llm(self, llm_config: dict) -> object:
else:
model_provider = llm_params.pop("model_provider")

if model_provider == "clod":
return CLoD(**llm_params)

if model_provider == "deepseek":
return DeepSeek(**llm_params)

Expand Down
32 changes: 32 additions & 0 deletions scrapegraphai/helpers/models_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,5 +261,37 @@
"mixtral-moe-8x22B-instruct": 65536,
"mixtral-moe-8x7B-instruct": 65536,
},
"clod": {
"open-mistral-7b": 32000,
"Llama-3.1-70b": 128000,
"Llama-3.1-405b": 128000,
"Llama-3.3-70b": 128000,
"Llama-3.1-8b": 128000,
"gpt-4o": 128000,
"gpt-4o-mini": 128000,
"gpt-4-turbo": 128000,
"claude-3-opus-latest": 200000,
"gemini-1.5-flash-8b": 128000,
"gemini-1.5-flash": 128000,
"open-mixtral-8x7b": 32000,
"open-mixtral-8x22b": 64000,
"claude-3-5-sonnet-latest": 200000,
"claude-3-haiku-20240307": 200000,
"Qwen-2.5-Coder-32B": 32000,
"Deepseek-R1-Distill-Llama-70B": 131072,
"Deepseek-V3": 128000,
"Qwen-2-VL-72B": 128000,
"Deepseek-R1-Distill-Qwen-14B": 131072,
"Deepseek-R1-Distill-Qwen-1.5B": 131072,
"Deepseek-R1": 128000,
"Deepseek-Llm-Chat-67B": 4096,
"Qwen-2.5-7B": 132072,
"Qwen-2.5-72B": 132072,
"Qwen-2-72B": 128000,
"o1": 200000,
"gemini-2.0-flash-exp": 1000000,
"grok-beta": 128000,
"grok-2-latest": 128000,
},
"togetherai": {"Meta-Llama-3.1-70B-Instruct-Turbo": 128000},
}
8 changes: 2 additions & 6 deletions scrapegraphai/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,10 @@
This module contains the model definitions used in the ScrapeGraphAI application.
"""

from .clod import CLoD
from .deepseek import DeepSeek
from .oneapi import OneApi
from .openai_itt import OpenAIImageToText
from .openai_tts import OpenAITextToSpeech

__all__ = [
"DeepSeek",
"OneApi",
"OpenAIImageToText",
"OpenAITextToSpeech",
]
__all__ = ["DeepSeek", "OneApi", "OpenAIImageToText", "OpenAITextToSpeech", "CLoD"]
23 changes: 23 additions & 0 deletions scrapegraphai/models/clod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
CLōD Module
"""

from langchain_openai import ChatOpenAI


class CLoD(ChatOpenAI):
    """
    A wrapper for the ChatOpenAI class (CLōD uses an OpenAI-like API) that
    provides default configuration and could be extended with additional methods
    if needed.

    Args:
        llm_config (dict): Configuration parameters for the language model.
            An ``api_key`` entry is renamed to ``openai_api_key`` as expected
            by ChatOpenAI. The endpoint defaults to the public CLōD API, but
            an explicit ``openai_api_base`` supplied by the caller is kept.
    """

    def __init__(self, **llm_config):
        # Translate the generic "api_key" used by scrapegraphai configs into
        # the keyword ChatOpenAI expects.
        if "api_key" in llm_config:
            llm_config["openai_api_key"] = llm_config.pop("api_key")
        # Default to the CLōD endpoint without clobbering an explicit
        # override (the previous code always overwrote any caller value).
        llm_config.setdefault("openai_api_base", "https://api.clod.io/v1")

        super().__init__(**llm_config)
1 change: 1 addition & 0 deletions tests/graphs/.env.example
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
OPENAI_API_KEY="YOUR OPENAI API KEY"
FIREWORKS_APIKEY="YOUR FIREWORKS KEY"
CLOD_API_KEY="YOUR CLOD API KEY"
35 changes: 34 additions & 1 deletion tests/graphs/abstract_graph_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,37 @@ def test_set_common_params(self):
test_params = {"param1": "value1", "param2": "value2"}
graph.set_common_params(test_params)

# Assert that update_config was called on each node with the correct parameters
# Assert that update_config was called on each node with the correct parameters

def test_get_state(self):
    """Verify get_state: whole state without a key, one value with a key, KeyError for a missing key."""
    config = {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}}
    graph = TestGraph("dummy", config)
    # Install a dummy final state to read back.
    graph.final_state = {"answer": "42", "other": "value"}
    # No key -> the entire final state is returned.
    assert graph.get_state() == {"answer": "42", "other": "value"}
    # A present key -> just that entry.
    assert graph.get_state("answer") == "42"
    # An absent key -> KeyError bubbles up.
    with pytest.raises(KeyError):
        graph.get_state("nonexistent")

def test_append_node(self):
    """Check that append_node forwards the node to the underlying graph object."""
    config = {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}}
    graph = TestGraph("dummy", config)
    # Swap in a mock graph so the delegation can be observed.
    fake_graph = Mock()
    graph.graph = fake_graph
    node = Mock()
    graph.append_node(node)
    fake_graph.append_node.assert_called_once_with(node)

def test_get_execution_info(self):
    """Ensure get_execution_info simply returns the execution info stored on the graph."""
    config = {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}}
    graph = TestGraph("dummy", config)
    stored = {"execution": "info", "status": "ok"}
    graph.execution_info = stored
    assert graph.get_execution_info() == stored
55 changes: 55 additions & 0 deletions tests/graphs/smart_scraper_clod_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
Module for testing the smart scraper class
"""

import os

import pytest
from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()


@pytest.fixture
def graph_config():
    """Configuration of the graph"""
    # Credentials come from the environment (see tests/graphs/.env.example).
    llm_settings = {
        "api_key": os.getenv("CLOD_API_KEY"),
        "model": "clod/claude-3-5-sonnet-latest",
    }
    return {
        "llm": llm_settings,
        "verbose": True,
        "headless": False,
    }


def test_scraping_pipeline(graph_config):
    """Start of the scraping pipeline"""
    # Build the graph against a small, stable demo page.
    pipeline = SmartScraperGraph(
        prompt="List me all the projects with their description.",
        source="https://perinim.github.io/projects/",
        config=graph_config,
    )

    output = pipeline.run()

    # The pipeline must yield a non-empty dict-shaped answer.
    assert output is not None
    assert isinstance(output, dict)


def test_get_execution_info(graph_config):
    """Get the execution info"""
    pipeline = SmartScraperGraph(
        prompt="List me all the projects with their description.",
        source="https://perinim.github.io/projects/",
        config=graph_config,
    )
    pipeline.run()

    # After a run, execution metadata must be available.
    assert pipeline.get_execution_info() is not None
148 changes: 148 additions & 0 deletions tests/test_models_tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import pytest
from scrapegraphai.helpers.models_tokens import models_tokens

class TestModelsTokens:
    """Test suite for verifying the models_tokens dictionary content and structure."""

    def test_openai_tokens(self):
        """Test that the 'openai' provider exists and its tokens are valid positive integers."""
        openai_models = models_tokens.get("openai")
        assert openai_models is not None, "'openai' key should be present in models_tokens"
        for model, token in openai_models.items():
            assert isinstance(model, str), "Model name should be a string"
            assert isinstance(token, int), "Token limit should be an integer"
            assert token > 0, "Token limit should be positive"

    def test_azure_openai_tokens(self):
        """Test that the 'azure_openai' provider exists and its tokens are valid."""
        azure_models = models_tokens.get("azure_openai")
        assert azure_models is not None, "'azure_openai' key should be present"
        for model, token in azure_models.items():
            assert isinstance(model, str), "Model name should be a string"
            assert isinstance(token, int), "Token limit should be an integer"

    def test_google_providers(self):
        """Test that Google provider dictionaries ('google_genai' and 'google_vertexai') contain expected entries."""
        google_genai = models_tokens.get("google_genai")
        google_vertexai = models_tokens.get("google_vertexai")
        assert google_genai is not None, "'google_genai' key should be present"
        assert google_vertexai is not None, "'google_vertexai' key should be present"
        # Check a specific key from google_genai
        assert "gemini-pro" in google_genai, "'gemini-pro' should be in google_genai models"
        # Validate token values types
        for provider in [google_genai, google_vertexai]:
            for token in provider.values():
                assert isinstance(token, int), "Token limit must be an integer"

    def test_non_existent_provider(self):
        """Test that a non-existent provider returns None."""
        assert models_tokens.get("non_existent") is None, "Non-existent provider should return None"

    def test_total_model_keys(self):
        """Test that the total number of models across all providers is above an expected count."""
        total_keys = sum(len(details) for details in models_tokens.values())
        assert total_keys > 20, "Expected more than 20 total model tokens defined"

    def test_specific_token_value(self):
        """Test specific expected token value for a known model."""
        openai = models_tokens.get("openai")
        # Verify that the token limit for "gpt-4" is 8192 as defined
        assert openai.get("gpt-4") == 8192, "Expected token limit for gpt-4 to be 8192"

    def test_non_empty_model_keys(self):
        """Ensure that model token names are non-empty strings."""
        for provider, model_dict in models_tokens.items():
            for model in model_dict.keys():
                assert model != "", f"Model name in provider '{provider}' should not be empty."

    def test_token_limits_range(self):
        """Test that token limits for all models fall within a plausible range (1 to 1,100,000)."""
        # Upper bound accommodates the 1M-context Gemini entries.
        for provider, model_dict in models_tokens.items():
            for model, token in model_dict.items():
                assert 1 <= token <= 1100000, f"Token limit for {model} in provider {provider} is out of plausible range."

    def test_provider_structure(self):
        """Test that every provider in models_tokens has a dictionary as its value."""
        for provider, models in models_tokens.items():
            assert isinstance(models, dict), f"Provider {provider} should map to a dictionary, got {type(models).__name__}"

    def test_non_empty_provider(self):
        """Test that each provider dictionary is not empty."""
        for provider, models in models_tokens.items():
            assert len(models) > 0, f"Provider {provider} should contain at least one model."

    def test_specific_model_token_values(self):
        """Test specific expected token values for selected models from various providers."""
        # Verify a token for a selected model from the 'openai' provider
        openai = models_tokens.get("openai")
        assert openai.get("gpt-3.5-turbo-0125") == 16385, "Expected token limit for gpt-3.5-turbo-0125 in openai to be 16385"

        # Verify a token for a selected model from the 'azure_openai' provider
        azure = models_tokens.get("azure_openai")
        assert azure.get("gpt-3.5") == 4096, "Expected token limit for gpt-3.5 in azure_openai to be 4096"

        # Verify a token for a selected model from the 'anthropic' provider
        anthropic = models_tokens.get("anthropic")
        assert anthropic.get("claude_instant") == 100000, "Expected token limit for claude_instant in anthropic to be 100000"

    def test_providers_count(self):
        """Test that the total number of providers is as expected (at least 15)."""
        assert len(models_tokens) >= 15, "Expected at least 15 providers in models_tokens"

    def test_non_existent_model(self):
        """Test that a non-existent model within a valid provider returns None."""
        openai = models_tokens.get("openai")
        assert openai.get("non_existent_model") is None, "Non-existent model should return None from a valid provider."

    def test_no_whitespace_in_model_names(self):
        """Test that model names do not contain leading or trailing whitespace."""
        for provider, model_dict in models_tokens.items():
            for model in model_dict.keys():
                # Assert that stripping whitespace does not change the model name
                assert model == model.strip(), f"Model name '{model}' in provider '{provider}' contains leading or trailing whitespace."

    def test_specific_models_additional(self):
        """Test specific token values for additional models across various providers."""
        # Check some models in the 'ollama' provider
        ollama = models_tokens.get("ollama")
        assert ollama.get("llama2") == 4096, "Expected token limit for 'llama2' in ollama to be 4096"
        assert ollama.get("llama2:70b") == 4096, "Expected token limit for 'llama2:70b' in ollama to be 4096"

        # Check a specific model from the 'mistralai' provider
        mistralai = models_tokens.get("mistralai")
        assert mistralai.get("open-codestral-mamba") == 256000, "Expected token limit for 'open-codestral-mamba' in mistralai to be 256000"

        # Check a specific model from the 'deepseek' provider
        deepseek = models_tokens.get("deepseek")
        assert deepseek.get("deepseek-chat") == 28672, "Expected token limit for 'deepseek-chat' in deepseek to be 28672"

        # Check a model from the 'ernie' provider
        ernie = models_tokens.get("ernie")
        assert ernie.get("ernie-bot") == 4096, "Expected token limit for 'ernie-bot' in ernie to be 4096"

    def test_nvidia_specific(self):
        """Test specific token value for 'meta/codellama-70b' in the nvidia provider."""
        nvidia = models_tokens.get("nvidia")
        assert nvidia is not None, "'nvidia' provider should exist"
        # Verify token for 'meta/codellama-70b' equals 16384 as defined in the nvidia dictionary
        assert nvidia.get("meta/codellama-70b") == 16384, "Expected token limit for 'meta/codellama-70b' in nvidia to be 16384"

    def test_groq_specific(self):
        """Test the token value for the groq model name that ends with an apostrophe."""
        groq = models_tokens.get("groq")
        assert groq is not None, "'groq' provider should exist"
        # Note: The model name has an embedded apostrophe at the end in its name
        # (the key is spelled that way in the data itself).
        assert groq.get("claude-3-haiku-20240307'") == 8192, "Expected token limit for 'claude-3-haiku-20240307\\'' in groq to be 8192"

    def test_togetherai_specific(self):
        """Test specific token value for 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo' in the toghetherai provider."""
        # NOTE(review): the provider key is misspelled "toghetherai" in the
        # data; this test intentionally matches that spelling.
        togetherai = models_tokens.get("toghetherai")
        assert togetherai is not None, "'toghetherai' provider should exist"
        expected = 128000
        model_name = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
        assert togetherai.get(model_name) == expected, f"Expected token limit for '{model_name}' in toghetherai to be {expected}"

    def test_ernie_all_values(self):
        """Test that all models in the 'ernie' provider have token values exactly 4096."""
        ernie = models_tokens.get("ernie")
        assert ernie is not None, "'ernie' provider should exist"
        for model, token in ernie.items():
            assert token == 4096, f"Expected token limit for '{model}' in ernie to be 4096, got {token}"
Loading