|
2 | 2 |
|
3 | 3 | [](https://opensource.org/licenses/MIT)
|
4 | 4 | [](https://pypi.org/project/langchain-scrapegraph/)
|
5 |
| -[](https://scrapegraphai.com/documentation) |
| 5 | +[](https://docs.scrapegraphai.com/integrations/langchain) |
6 | 6 |
|
7 | 7 | Supercharge your LangChain agents with AI-powered web scraping capabilities. LangChain-ScrapeGraph provides a seamless integration between [LangChain](https://github.com/langchain-ai/langchain) and [ScrapeGraph AI](https://scrapegraphai.com), enabling your agents to extract structured data from websites using natural language.
|
8 | 8 |
|
@@ -58,98 +58,76 @@ result = tool.invoke({
|
58 | 58 | print(result)
|
59 | 59 | ```
|
60 | 60 |
|
61 |
| -<details> |
62 |
| -<summary>🔍 Using Output Schemas with SmartscraperTool</summary> |
63 |
| - |
64 |
| -You can define the structure of the output using Pydantic models: |
| 61 | +### 🌐 SearchscraperTool |
| 62 | +Search and extract structured information from the web using natural language prompts. |
65 | 63 |
|
66 | 64 | ```python
|
67 |
| -from typing import List |
68 |
| -from pydantic import BaseModel, Field |
69 |
| -from langchain_scrapegraph.tools import SmartScraperTool |
| 65 | +from langchain_scrapegraph.tools import SearchScraperTool |
70 | 66 |
|
71 |
| -class WebsiteInfo(BaseModel): |
72 |
| - title: str = Field(description="The main title of the webpage") |
73 |
| - description: str = Field(description="The main description or first paragraph") |
74 |
| - urls: List[str] = Field(description="The URLs inside the webpage") |
75 |
| - |
76 |
| -# Initialize with schema |
77 |
| -tool = SmartScraperTool(llm_output_schema=WebsiteInfo) |
| 67 | +# Initialize the tool (uses SGAI_API_KEY from environment) |
| 68 | +tool = SearchScraperTool() |
78 | 69 |
|
79 |
| -# The output will conform to the WebsiteInfo schema |
| 70 | +# Search and extract information using natural language |
80 | 71 | result = tool.invoke({
|
81 |
| - "website_url": "https://www.example.com", |
82 |
| - "user_prompt": "Extract the website information" |
| 72 | + "user_prompt": "What are the key features and pricing of ChatGPT Plus?" |
83 | 73 | })
|
84 | 74 |
|
85 | 75 | print(result)
|
86 | 76 | # {
|
87 |
| -# "title": "Example Domain", |
88 |
| -# "description": "This domain is for use in illustrative examples...", |
89 |
| -# "urls": ["https://www.iana.org/domains/example"] |
| 77 | +# "product": { |
| 78 | +# "name": "ChatGPT Plus", |
| 79 | +# "description": "Premium version of ChatGPT..." |
| 80 | +# }, |
| 81 | +# "features": [...], |
| 82 | +# "pricing": {...}, |
| 83 | +# "reference_urls": [ |
| 84 | +# "https://openai.com/chatgpt", |
| 85 | +# ... |
| 86 | +# ] |
90 | 87 | # }
|
91 | 88 | ```
|
92 |
| -</details> |
93 |
| - |
94 |
| -### 💻 LocalscraperTool |
95 |
| -Extract information from HTML content using AI. |
96 |
| - |
97 |
| -```python |
98 |
| -from langchain_scrapegraph.tools import LocalScraperTool |
99 |
| - |
100 |
| -tool = LocalScraperTool() |
101 |
| -result = tool.invoke({ |
102 |
| - "user_prompt": "Extract all contact information", |
103 |
| - "website_html": "<html>...</html>" |
104 |
| -}) |
105 |
| - |
106 |
| -print(result) |
107 |
| -``` |
108 | 89 |
|
109 | 90 | <details>
|
110 |
| -<summary>🔍 Using Output Schemas with LocalscraperTool</summary> |
| 91 | +<summary>🔍 Using Output Schemas with SearchscraperTool</summary> |
111 | 92 |
|
112 | 93 | You can define the structure of the output using Pydantic models:
|
113 | 94 |
|
114 | 95 | ```python
|
115 |
| -from typing import Optional |
| 96 | +from typing import List, Dict, Any
116 | 97 | from pydantic import BaseModel, Field
|
117 |
| -from langchain_scrapegraph.tools import LocalScraperTool |
| 98 | +from langchain_scrapegraph.tools import SearchScraperTool |
118 | 99 |
|
119 |
| -class CompanyInfo(BaseModel): |
120 |
| - name: str = Field(description="The company name") |
121 |
| - description: str = Field(description="The company description") |
122 |
| - email: Optional[str] = Field(description="Contact email if available") |
123 |
| - phone: Optional[str] = Field(description="Contact phone if available") |
| 100 | +class ProductInfo(BaseModel): |
| 101 | + name: str = Field(description="Product name") |
| 102 | + features: List[str] = Field(description="List of product features") |
| 103 | + pricing: Dict[str, Any] = Field(description="Pricing information") |
| 104 | + reference_urls: List[str] = Field(description="Source URLs for the information") |
124 | 105 |
|
125 | 106 | # Initialize with schema
|
126 |
| -tool = LocalScraperTool(llm_output_schema=CompanyInfo) |
127 |
| - |
128 |
| -html_content = """ |
129 |
| -<html> |
130 |
| - <body> |
131 |
| - <h1>TechCorp Solutions</h1> |
132 |
| - <p>We are a leading AI technology company.</p> |
133 |
| - <div class="contact"> |
134 |
| - |
135 |
| - <p>Phone: (555) 123-4567</p> |
136 |
| - </div> |
137 |
| - </body> |
138 |
| -</html> |
139 |
| -""" |
140 |
| - |
141 |
| -# The output will conform to the CompanyInfo schema |
| 107 | +tool = SearchScraperTool(llm_output_schema=ProductInfo) |
| 108 | + |
| 109 | +# The output will conform to the ProductInfo schema |
142 | 110 | result = tool.invoke({
|
143 |
| - "website_html": html_content, |
144 |
| - "user_prompt": "Extract the company information" |
| 111 | + "user_prompt": "What are the key features and pricing of ChatGPT Plus?" |
145 | 112 | })
|
146 | 113 |
|
147 | 114 | print(result)
|
148 | 115 | # {
|
149 |
| -# "name": "TechCorp Solutions", |
150 |
| -# "description": "We are a leading AI technology company.", |
151 |
| - |
152 |
| -# "phone": "(555) 123-4567" |
| 116 | +# "name": "ChatGPT Plus", |
| 117 | +# "features": [ |
| 118 | +# "GPT-4 access", |
| 119 | +# "Faster response speed", |
| 120 | +# ... |
| 121 | +# ], |
| 122 | +# "pricing": { |
| 123 | +# "amount": 20, |
| 124 | +# "currency": "USD", |
| 125 | +# "period": "monthly" |
| 126 | +# }, |
| 127 | +# "reference_urls": [ |
| 128 | +# "https://openai.com/chatgpt", |
| 129 | +# ... |
| 130 | +# ] |
153 | 131 | # }
|
154 | 132 | ```
|
155 | 133 | </details>
|
|
0 commit comments