-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from ScrapeGraphAI/pre/beta
Pre/beta
- Loading branch information
Showing
7 changed files
with
247 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,39 @@ result = tool.invoke({ | |
print(result) | ||
``` | ||
|
||
<details> | ||
<summary>🔍 Using Output Schemas with SmartscraperTool</summary> | ||
|
||
You can define the structure of the output using Pydantic models: | ||
|
||
```python | ||
from typing import List | ||
from pydantic import BaseModel, Field | ||
from langchain_scrapegraph.tools import SmartScraperTool | ||
|
||
class WebsiteInfo(BaseModel): | ||
title: str = Field(description="The main title of the webpage") | ||
description: str = Field(description="The main description or first paragraph") | ||
urls: List[str] = Field(description="The URLs inside the webpage") | ||
|
||
# Initialize with schema | ||
tool = SmartScraperTool(llm_output_schema=WebsiteInfo) | ||
|
||
# The output will conform to the WebsiteInfo schema | ||
result = tool.invoke({ | ||
"website_url": "https://www.example.com", | ||
"user_prompt": "Extract the website information" | ||
}) | ||
|
||
print(result) | ||
# { | ||
# "title": "Example Domain", | ||
# "description": "This domain is for use in illustrative examples...", | ||
# "urls": ["https://www.iana.org/domains/example"] | ||
# } | ||
``` | ||
</details> | ||
|
||
### 💻 LocalscraperTool | ||
Extract information from HTML content using AI. | ||
|
||
|
@@ -73,6 +106,54 @@ result = tool.invoke({ | |
print(result) | ||
``` | ||
|
||
<details> | ||
<summary>🔍 Using Output Schemas with LocalscraperTool</summary> | ||
|
||
You can define the structure of the output using Pydantic models: | ||
|
||
```python | ||
from typing import Optional | ||
from pydantic import BaseModel, Field | ||
from langchain_scrapegraph.tools import LocalScraperTool | ||
|
||
class CompanyInfo(BaseModel): | ||
name: str = Field(description="The company name") | ||
description: str = Field(description="The company description") | ||
email: Optional[str] = Field(description="Contact email if available") | ||
phone: Optional[str] = Field(description="Contact phone if available") | ||
|
||
# Initialize with schema | ||
tool = LocalScraperTool(llm_output_schema=CompanyInfo) | ||
|
||
html_content = """ | ||
<html> | ||
<body> | ||
<h1>TechCorp Solutions</h1> | ||
<p>We are a leading AI technology company.</p> | ||
<div class="contact"> | ||
<p>Email: [email protected]</p> | ||
<p>Phone: (555) 123-4567</p> | ||
</div> | ||
</body> | ||
</html> | ||
""" | ||
|
||
# The output will conform to the CompanyInfo schema | ||
result = tool.invoke({ | ||
"website_html": html_content, | ||
"user_prompt": "Extract the company information" | ||
}) | ||
|
||
print(result) | ||
# { | ||
# "name": "TechCorp Solutions", | ||
# "description": "We are a leading AI technology company.", | ||
# "email": "[email protected]", | ||
# "phone": "(555) 123-4567" | ||
# } | ||
``` | ||
</details> | ||
|
||
## 🌟 Key Features | ||
|
||
- 🐦 **LangChain Integration**: Seamlessly works with LangChain agents and chains | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from typing import List | ||
|
||
from pydantic import BaseModel, Field | ||
from scrapegraph_py.logger import sgai_logger | ||
|
||
from langchain_scrapegraph.tools import LocalScraperTool | ||
|
||
|
||
class WebsiteInfo(BaseModel): | ||
title: str = Field(description="The main title of the webpage") | ||
description: str = Field(description="The main description or first paragraph") | ||
urls: List[str] = Field(description="The URLs inside the webpage") | ||
|
||
|
||
sgai_logger.set_logging(level="INFO") | ||
|
||
# Initialize with Pydantic model class | ||
tool = LocalScraperTool(llm_output_schema=WebsiteInfo) | ||
|
||
# Example HTML content and prompt | ||
html_content = """ | ||
<html> | ||
<body> | ||
<h1>Company Name</h1> | ||
<p>We are a technology company focused on AI solutions.</p> | ||
<div class="contact"> | ||
<p>Email: [email protected]</p> | ||
<p>Phone: (555) 123-4567</p> | ||
</div> | ||
</body> | ||
</html> | ||
""" | ||
user_prompt = "Make a summary of the webpage and extract the email and phone number" | ||
|
||
# Use the tool | ||
result = tool.invoke({"website_html": html_content, "user_prompt": user_prompt}) | ||
|
||
print(result) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from typing import List | ||
|
||
from pydantic import BaseModel, Field | ||
from scrapegraph_py.logger import sgai_logger | ||
|
||
from langchain_scrapegraph.tools import SmartScraperTool | ||
|
||
|
||
class WebsiteInfo(BaseModel): | ||
title: str = Field(description="The main title of the webpage") | ||
description: str = Field(description="The main description or first paragraph") | ||
urls: List[str] = Field(description="The URLs inside the webpage") | ||
|
||
|
||
sgai_logger.set_logging(level="INFO") | ||
|
||
# Initialize with Pydantic model class | ||
tool = SmartScraperTool(llm_output_schema=WebsiteInfo) | ||
|
||
# Example website and prompt | ||
website_url = "https://www.example.com" | ||
user_prompt = "Extract info about the website" | ||
|
||
# Use the tool - output will conform to WebsiteInfo schema | ||
result = tool.invoke({"website_url": website_url, "user_prompt": user_prompt}) | ||
print(result) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,8 @@ class LocalScraperTool(BaseTool): | |
Key init args: | ||
api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var. | ||
client: Optional pre-configured ScrapeGraph client instance. | ||
llm_output_schema: Optional Pydantic model class (a BaseModel subclass) used to structure the output. | ||
If provided, it is passed to the ScrapeGraph client as the output schema; any other value raises ValueError. | ||
Instantiate: | ||
.. code-block:: python | ||
|
@@ -49,6 +51,16 @@ class LocalScraperTool(BaseTool): | |
# Or provide API key directly | ||
tool = LocalScraperTool(api_key="your-api-key") | ||
# Optionally, you can provide an output schema: | ||
from pydantic import BaseModel, Field | ||
class CompanyInfo(BaseModel): | ||
name: str = Field(description="Company name") | ||
description: str = Field(description="Company description") | ||
email: str = Field(description="Contact email") | ||
tool_with_schema = LocalScraperTool(llm_output_schema=CompanyInfo) | ||
Use the tool: | ||
.. code-block:: python | ||
|
@@ -71,21 +83,21 @@ class LocalScraperTool(BaseTool): | |
}) | ||
print(result) | ||
# Without schema: | ||
# { | ||
# "description": "We are a technology company focused on AI solutions", | ||
# "contact": { | ||
# "email": "[email protected]", | ||
# "phone": "(555) 123-4567" | ||
# } | ||
# } | ||
Async usage: | ||
.. code-block:: python | ||
result = await tool.ainvoke({ | ||
"user_prompt": "Extract contact information", | ||
"website_html": html_content | ||
}) | ||
# | ||
# With CompanyInfo schema: | ||
# { | ||
# "name": "Company Name", | ||
# "description": "We are a technology company focused on AI solutions", | ||
# "email": "[email protected]" | ||
# } | ||
""" | ||
|
||
name: str = "LocalScraper" | ||
|
@@ -96,6 +108,7 @@ class LocalScraperTool(BaseTool): | |
return_direct: bool = True | ||
client: Optional[Client] = None | ||
api_key: str | ||
llm_output_schema: Optional[Type[BaseModel]] = None | ||
|
||
@model_validator(mode="before") | ||
@classmethod | ||
|
@@ -117,10 +130,23 @@ def _run( | |
"""Use the tool to extract data from a website.""" | ||
if not self.client: | ||
raise ValueError("Client not initialized") | ||
response = self.client.localscraper( | ||
website_html=website_html, | ||
user_prompt=user_prompt, | ||
) | ||
|
||
if self.llm_output_schema is None: | ||
response = self.client.localscraper( | ||
website_html=website_html, | ||
user_prompt=user_prompt, | ||
) | ||
elif isinstance(self.llm_output_schema, type) and issubclass( | ||
self.llm_output_schema, BaseModel | ||
): | ||
response = self.client.localscraper( | ||
website_html=website_html, | ||
user_prompt=user_prompt, | ||
output_schema=self.llm_output_schema, | ||
) | ||
else: | ||
raise ValueError("llm_output_schema must be a Pydantic model class") | ||
|
||
return response["result"] | ||
|
||
async def _arun( | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "langchain-scrapegraph" | ||
version = "1.1.0" | ||
version = "1.2.0b1" | ||
description = "Library for extracting structured data from websites using ScrapeGraphAI" | ||
authors = ["Marco Perini <[email protected]>", "Marco Vinciguerra <[email protected]>", "Lorenzo Padoan <[email protected]>"] | ||
license = "MIT" | ||
|