Skip to content

Commit

Permalink
Merge pull request #3 from ScrapeGraphAI/pre/beta
Browse files Browse the repository at this point in the history
Pre/beta
  • Loading branch information
PeriniM authored Dec 18, 2024
2 parents d3ce047 + 9da0f95 commit 43793f1
Show file tree
Hide file tree
Showing 7 changed files with 247 additions and 17 deletions.
27 changes: 27 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,30 @@
## [1.2.0-beta.1](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.1.0...v1.2.0-beta.1) (2024-12-18)


### Features

* added pydantic output schema 🔍 ([34b5f10](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/34b5f1089059daa25c756b44da593a7c0db97aa9))


### Docs

* added API reference ([d3ce047](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/d3ce0470f5c89da910540e42d71afdddd80e8c15))


### CI

* **release:** 1.1.0-beta.1 [skip ci] ([6222a16](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/6222a16a2fec477e7a6e610e0fdd4960e7ccd1b5))
* **release:** 1.1.0-beta.2 [skip ci] ([d5dae57](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/d5dae575921cfa14daa4ceb887b0d7d037d3773d))

## [1.1.0-beta.2](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.1.0-beta.1...v1.1.0-beta.2) (2024-12-18)


### Features

* added pydantic output schema 🔍 ([34b5f10](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/34b5f1089059daa25c756b44da593a7c0db97aa9))

## [1.1.0-beta.1](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.0.0...v1.1.0-beta.1) (2024-12-05)

## [1.1.0](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.0.0...v1.1.0) (2024-12-05)


Expand Down
81 changes: 81 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,39 @@ result = tool.invoke({
print(result)
```

<details>
<summary>🔍 Using Output Schemas with SmartScraperTool</summary>

You can define the structure of the output using Pydantic models:

```python
from typing import List
from pydantic import BaseModel, Field
from langchain_scrapegraph.tools import SmartScraperTool

class WebsiteInfo(BaseModel):
title: str = Field(description="The main title of the webpage")
description: str = Field(description="The main description or first paragraph")
urls: List[str] = Field(description="The URLs inside the webpage")

# Initialize with schema
tool = SmartScraperTool(llm_output_schema=WebsiteInfo)

# The output will conform to the WebsiteInfo schema
result = tool.invoke({
"website_url": "https://www.example.com",
"user_prompt": "Extract the website information"
})

print(result)
# {
# "title": "Example Domain",
# "description": "This domain is for use in illustrative examples...",
# "urls": ["https://www.iana.org/domains/example"]
# }
```
</details>

### 💻 LocalScraperTool
Extract information from HTML content using AI.

Expand All @@ -73,6 +106,54 @@ result = tool.invoke({
print(result)
```

<details>
<summary>🔍 Using Output Schemas with LocalScraperTool</summary>

You can define the structure of the output using Pydantic models:

```python
from typing import Optional
from pydantic import BaseModel, Field
from langchain_scrapegraph.tools import LocalScraperTool

class CompanyInfo(BaseModel):
name: str = Field(description="The company name")
description: str = Field(description="The company description")
email: Optional[str] = Field(description="Contact email if available")
phone: Optional[str] = Field(description="Contact phone if available")

# Initialize with schema
tool = LocalScraperTool(llm_output_schema=CompanyInfo)

html_content = """
<html>
<body>
<h1>TechCorp Solutions</h1>
<p>We are a leading AI technology company.</p>
<div class="contact">
<p>Email: [email protected]</p>
<p>Phone: (555) 123-4567</p>
</div>
</body>
</html>
"""

# The output will conform to the CompanyInfo schema
result = tool.invoke({
"website_html": html_content,
"user_prompt": "Extract the company information"
})

print(result)
# {
# "name": "TechCorp Solutions",
# "description": "We are a leading AI technology company.",
# "email": "[email protected]",
# "phone": "(555) 123-4567"
# }
```
</details>

## 🌟 Key Features

- 🐦 **LangChain Integration**: Seamlessly works with LangChain agents and chains
Expand Down
38 changes: 38 additions & 0 deletions examples/localscraper_tool_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from typing import Optional

from pydantic import BaseModel, Field
from scrapegraph_py.logger import sgai_logger

from langchain_scrapegraph.tools import LocalScraperTool


# Output schema for the extraction. The fields mirror what the prompt below
# actually asks for (a summary plus contact details); the original example
# declared title/description/urls, which could never match the prompt.
class CompanyInfo(BaseModel):
    # Short summary of the page content
    summary: str = Field(description="A short summary of the webpage content")
    # Contact details are Optional: not every page exposes them
    email: Optional[str] = Field(description="Contact email found on the page, if any")
    phone: Optional[str] = Field(description="Contact phone number found on the page, if any")


sgai_logger.set_logging(level="INFO")

# Initialize with a Pydantic model class; the tool guarantees the API result
# conforms to this schema.
tool = LocalScraperTool(llm_output_schema=CompanyInfo)

# Example website and prompt
html_content = """
<html>
<body>
<h1>Company Name</h1>
<p>We are a technology company focused on AI solutions.</p>
<div class="contact">
<p>Email: [email protected]</p>
<p>Phone: (555) 123-4567</p>
</div>
</body>
</html>
"""
user_prompt = "Make a summary of the webpage and extract the email and phone number"

# Use the tool — result is a dict shaped like CompanyInfo
result = tool.invoke({"website_html": html_content, "user_prompt": user_prompt})

print(result)
26 changes: 26 additions & 0 deletions examples/smartscraper_tool_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from typing import List

from pydantic import BaseModel, Field
from scrapegraph_py.logger import sgai_logger

from langchain_scrapegraph.tools import SmartScraperTool


# Schema describing the structure the scraper output must conform to.
class WebsiteInfo(BaseModel):
    title: str = Field(description="The main title of the webpage")
    description: str = Field(description="The main description or first paragraph")
    urls: List[str] = Field(description="The URLs inside the webpage")


sgai_logger.set_logging(level="INFO")

# Passing a Pydantic model class makes the tool return data matching it.
tool = SmartScraperTool(llm_output_schema=WebsiteInfo)

# Run the scrape; the result dict will carry WebsiteInfo's fields.
result = tool.invoke(
    {
        "website_url": "https://www.example.com",
        "user_prompt": "Extract info about the website",
    }
)
print(result)
50 changes: 38 additions & 12 deletions langchain_scrapegraph/tools/localscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class LocalScraperTool(BaseTool):
Key init args:
api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
client: Optional pre-configured ScrapeGraph client instance.
llm_output_schema: Optional Pydantic model class to structure the output.
If provided, the tool will ensure the output conforms to this schema; any other value raises ValueError.
Instantiate:
.. code-block:: python
Expand All @@ -49,6 +51,16 @@ class LocalScraperTool(BaseTool):
# Or provide API key directly
tool = LocalScraperTool(api_key="your-api-key")
# Optionally, you can provide an output schema:
from pydantic import BaseModel, Field
class CompanyInfo(BaseModel):
name: str = Field(description="Company name")
description: str = Field(description="Company description")
email: str = Field(description="Contact email")
tool_with_schema = LocalScraperTool(llm_output_schema=CompanyInfo)
Use the tool:
.. code-block:: python
Expand All @@ -71,21 +83,21 @@ class LocalScraperTool(BaseTool):
})
print(result)
# Without schema:
# {
# "description": "We are a technology company focused on AI solutions",
# "contact": {
# "email": "[email protected]",
# "phone": "(555) 123-4567"
# }
# }
Async usage:
.. code-block:: python
result = await tool.ainvoke({
"user_prompt": "Extract contact information",
"website_html": html_content
})
#
# With CompanyInfo schema:
# {
# "name": "Company Name",
# "description": "We are a technology company focused on AI solutions",
# "email": "[email protected]"
# }
"""

name: str = "LocalScraper"
Expand All @@ -96,6 +108,7 @@ class LocalScraperTool(BaseTool):
return_direct: bool = True
client: Optional[Client] = None
api_key: str
llm_output_schema: Optional[Type[BaseModel]] = None

@model_validator(mode="before")
@classmethod
Expand All @@ -117,10 +130,23 @@ def _run(
"""Use the tool to extract data from a website."""
if not self.client:
raise ValueError("Client not initialized")
response = self.client.localscraper(
website_html=website_html,
user_prompt=user_prompt,
)

if self.llm_output_schema is None:
response = self.client.localscraper(
website_html=website_html,
user_prompt=user_prompt,
)
elif isinstance(self.llm_output_schema, type) and issubclass(
self.llm_output_schema, BaseModel
):
response = self.client.localscraper(
website_html=website_html,
user_prompt=user_prompt,
output_schema=self.llm_output_schema,
)
else:
raise ValueError("llm_output_schema must be a Pydantic model class")

return response["result"]

async def _arun(
Expand Down
40 changes: 36 additions & 4 deletions langchain_scrapegraph/tools/smartscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class SmartScraperTool(BaseTool):
Key init args:
api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
client: Optional pre-configured ScrapeGraph client instance.
llm_output_schema: Optional Pydantic model class to structure the output.
If provided, the tool will ensure the output conforms to this schema; any other value raises ValueError.
Instantiate:
.. code-block:: python
Expand All @@ -49,6 +51,15 @@ class SmartScraperTool(BaseTool):
# Or provide API key directly
tool = SmartScraperTool(api_key="your-api-key")
# Optionally, you can provide an output schema:
from pydantic import BaseModel, Field
class WebsiteInfo(BaseModel):
title: str = Field(description="The main title")
description: str = Field(description="The main description")
tool_with_schema = SmartScraperTool(llm_output_schema=WebsiteInfo)
Use the tool:
.. code-block:: python
Expand All @@ -58,10 +69,17 @@ class SmartScraperTool(BaseTool):
})
print(result)
# Without schema:
# {
# "main_heading": "Example Domain",
# "first_paragraph": "This domain is for use in illustrative examples..."
# }
#
# With WebsiteInfo schema:
# {
# "title": "Example Domain",
# "description": "This domain is for use in illustrative examples..."
# }
Async usage:
.. code-block:: python
Expand All @@ -80,6 +98,7 @@ class SmartScraperTool(BaseTool):
return_direct: bool = True
client: Optional[Client] = None
api_key: str
llm_output_schema: Optional[Type[BaseModel]] = None

@model_validator(mode="before")
@classmethod
Expand All @@ -101,10 +120,23 @@ def _run(
"""Use the tool to extract data from a website."""
if not self.client:
raise ValueError("Client not initialized")
response = self.client.smartscraper(
website_url=website_url,
user_prompt=user_prompt,
)

if self.llm_output_schema is None:
response = self.client.smartscraper(
website_url=website_url,
user_prompt=user_prompt,
)
elif isinstance(self.llm_output_schema, type) and issubclass(
self.llm_output_schema, BaseModel
):
response = self.client.smartscraper(
website_url=website_url,
user_prompt=user_prompt,
output_schema=self.llm_output_schema,
)
else:
raise ValueError("llm_output_schema must be a Pydantic model class")

return response["result"]

async def _arun(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-scrapegraph"
version = "1.1.0"
version = "1.2.0b1"
description = "Library for extracting structured data from websites using ScrapeGraphAI"
authors = ["Marco Perini <[email protected]>", "Marco Vinciguerra <[email protected]>", "Lorenzo Padoan <[email protected]>"]
license = "MIT"
Expand Down

0 comments on commit 43793f1

Please sign in to comment.