From 6222a16a2fec477e7a6e610e0fdd4960e7ccd1b5 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 5 Dec 2024 18:35:41 +0000 Subject: [PATCH 1/4] ci(release): 1.1.0-beta.1 [skip ci] ## [1.1.0-beta.1](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.0.0...v1.1.0-beta.1) (2024-12-05) ### Features * added markdownify and localscraper tools ([03e49dc](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/03e49dce84ef5a1b7a59b6dfd046eb563c14d283)) ### CI * **release:** 1.0.0-beta.1 [skip ci] ([fc06960](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/fc06960340c68ff325656074d47b0b793a3b25fe)) --- CHANGELOG.md | 12 ++++++++++++ pyproject.toml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14d999c..aa5cdbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +## [1.1.0-beta.1](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.0.0...v1.1.0-beta.1) (2024-12-05) + + +### Features + +* added markdownify and localscraper tools ([03e49dc](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/03e49dce84ef5a1b7a59b6dfd046eb563c14d283)) + + +### CI + +* **release:** 1.0.0-beta.1 [skip ci] ([fc06960](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/fc06960340c68ff325656074d47b0b793a3b25fe)) + ## 1.0.0 (2024-12-05) ## 1.0.0-beta.1 (2024-12-05) diff --git a/pyproject.toml b/pyproject.toml index 46b49c3..4ca4f7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain-scrapegraph" -version = "1.0.0b1" +version = "1.1.0b1" description = "Library for extracting structured data from websites using ScrapeGraphAI" authors = ["Marco Perini ", "Marco Vinciguerra ", "Lorenzo Padoan "] license = "MIT" From 34b5f1089059daa25c756b44da593a7c0db97aa9 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Wed, 18 Dec 2024 17:31:53 +0100 Subject: [PATCH 2/4] =?UTF-8?q?feat:=20added=20pydantic=20output=20schema?= 
=?UTF-8?q?=20=F0=9F=94=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 81 +++++++++++++++++++++ examples/localscraper_tool_schema.py | 38 ++++++++++ examples/smartscraper_tool_schema.py | 26 +++++++ langchain_scrapegraph/tools/localscraper.py | 50 ++++++++++--- langchain_scrapegraph/tools/smartscraper.py | 40 +++++++++- 5 files changed, 219 insertions(+), 16 deletions(-) create mode 100644 examples/localscraper_tool_schema.py create mode 100644 examples/smartscraper_tool_schema.py diff --git a/README.md b/README.md index 510ea82..d672f6e 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,39 @@ result = tool.invoke({ print(result) ``` +
+🔍 Using Output Schemas with SmartscraperTool + +You can define the structure of the output using Pydantic models: + +```python +from typing import List +from pydantic import BaseModel, Field +from langchain_scrapegraph.tools import SmartScraperTool + +class WebsiteInfo(BaseModel): + title: str = Field(description="The main title of the webpage") + description: str = Field(description="The main description or first paragraph") + urls: List[str] = Field(description="The URLs inside the webpage") + +# Initialize with schema +tool = SmartScraperTool(llm_output_schema=WebsiteInfo) + +# The output will conform to the WebsiteInfo schema +result = tool.invoke({ + "website_url": "https://www.example.com", + "user_prompt": "Extract the website information" +}) + +print(result) +# { +# "title": "Example Domain", +# "description": "This domain is for use in illustrative examples...", +# "urls": ["https://www.iana.org/domains/example"] +# } +``` +
+ ### 💻 LocalscraperTool Extract information from HTML content using AI. @@ -59,6 +92,54 @@ result = tool.invoke({ print(result) ``` +
+🔍 Using Output Schemas with LocalscraperTool + +You can define the structure of the output using Pydantic models: + +```python +from typing import Optional +from pydantic import BaseModel, Field +from langchain_scrapegraph.tools import LocalScraperTool + +class CompanyInfo(BaseModel): + name: str = Field(description="The company name") + description: str = Field(description="The company description") + email: Optional[str] = Field(description="Contact email if available") + phone: Optional[str] = Field(description="Contact phone if available") + +# Initialize with schema +tool = LocalScraperTool(llm_output_schema=CompanyInfo) + +html_content = """ + + +

TechCorp Solutions

+

We are a leading AI technology company.

+
+

Email: contact@techcorp.com

+

Phone: (555) 123-4567

+
+ + +""" + +# The output will conform to the CompanyInfo schema +result = tool.invoke({ + "website_html": html_content, + "user_prompt": "Extract the company information" +}) + +print(result) +# { +# "name": "TechCorp Solutions", +# "description": "We are a leading AI technology company.", +# "email": "contact@techcorp.com", +# "phone": "(555) 123-4567" +# } +``` +
+ ## 🌟 Key Features - 🐦 **LangChain Integration**: Seamlessly works with LangChain agents and chains diff --git a/examples/localscraper_tool_schema.py b/examples/localscraper_tool_schema.py new file mode 100644 index 0000000..85f3ab9 --- /dev/null +++ b/examples/localscraper_tool_schema.py @@ -0,0 +1,38 @@ +from typing import List + +from pydantic import BaseModel, Field +from scrapegraph_py.logger import sgai_logger + +from langchain_scrapegraph.tools import LocalScraperTool + + +class WebsiteInfo(BaseModel): + title: str = Field(description="The main title of the webpage") + description: str = Field(description="The main description or first paragraph") + urls: List[str] = Field(description="The URLs inside the webpage") + + +sgai_logger.set_logging(level="INFO") + +# Initialize with Pydantic model class +tool = LocalScraperTool(llm_output_schema=WebsiteInfo) + +# Example website and prompt +html_content = """ + + +

Company Name

+

We are a technology company focused on AI solutions.

+
+

Email: contact@example.com

+

Phone: (555) 123-4567

+
+ + +""" +user_prompt = "Make a summary of the webpage and extract the email and phone number" + +# Use the tool +result = tool.invoke({"website_html": html_content, "user_prompt": user_prompt}) + +print(result) diff --git a/examples/smartscraper_tool_schema.py b/examples/smartscraper_tool_schema.py new file mode 100644 index 0000000..bded746 --- /dev/null +++ b/examples/smartscraper_tool_schema.py @@ -0,0 +1,26 @@ +from typing import List + +from pydantic import BaseModel, Field +from scrapegraph_py.logger import sgai_logger + +from langchain_scrapegraph.tools import SmartScraperTool + + +class WebsiteInfo(BaseModel): + title: str = Field(description="The main title of the webpage") + description: str = Field(description="The main description or first paragraph") + urls: List[str] = Field(description="The URLs inside the webpage") + + +sgai_logger.set_logging(level="INFO") + +# Initialize with Pydantic model class +tool = SmartScraperTool(llm_output_schema=WebsiteInfo) + +# Example website and prompt +website_url = "https://www.example.com" +user_prompt = "Extract info about the website" + +# Use the tool - output will conform to WebsiteInfo schema +result = tool.invoke({"website_url": website_url, "user_prompt": user_prompt}) +print(result) diff --git a/langchain_scrapegraph/tools/localscraper.py b/langchain_scrapegraph/tools/localscraper.py index 0b7382c..926d6fd 100644 --- a/langchain_scrapegraph/tools/localscraper.py +++ b/langchain_scrapegraph/tools/localscraper.py @@ -37,6 +37,8 @@ class LocalScraperTool(BaseTool): Key init args: api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var. client: Optional pre-configured ScrapeGraph client instance. + llm_output_schema: Optional Pydantic model class used to structure the output. + If provided, the tool will ensure the output conforms to this schema. Instantiate: .. 
code-block:: python @@ -49,6 +51,16 @@ class LocalScraperTool(BaseTool): # Or provide API key directly tool = LocalScraperTool(api_key="your-api-key") + # Optionally, you can provide an output schema: + from pydantic import BaseModel, Field + + class CompanyInfo(BaseModel): + name: str = Field(description="Company name") + description: str = Field(description="Company description") + email: str = Field(description="Contact email") + + tool_with_schema = LocalScraperTool(llm_output_schema=CompanyInfo) + Use the tool: .. code-block:: python @@ -71,6 +83,7 @@ class LocalScraperTool(BaseTool): }) print(result) + # Without schema: # { # "description": "We are a technology company focused on AI solutions", # "contact": { @@ -78,14 +91,13 @@ class LocalScraperTool(BaseTool): # "phone": "(555) 123-4567" # } # } - - Async usage: - .. code-block:: python - - result = await tool.ainvoke({ - "user_prompt": "Extract contact information", - "website_html": html_content - }) + # + # With CompanyInfo schema: + # { + # "name": "Company Name", + # "description": "We are a technology company focused on AI solutions", + # "email": "contact@example.com" + # } """ name: str = "LocalScraper" @@ -96,6 +108,7 @@ class LocalScraperTool(BaseTool): return_direct: bool = True client: Optional[Client] = None api_key: str + llm_output_schema: Optional[Type[BaseModel]] = None @model_validator(mode="before") @classmethod @@ -117,10 +130,23 @@ def _run( """Use the tool to extract data from a website.""" if not self.client: raise ValueError("Client not initialized") - response = self.client.localscraper( - website_html=website_html, - user_prompt=user_prompt, - ) + + if self.llm_output_schema is None: + response = self.client.localscraper( + website_html=website_html, + user_prompt=user_prompt, + ) + elif isinstance(self.llm_output_schema, type) and issubclass( + self.llm_output_schema, BaseModel + ): + response = self.client.localscraper( + website_html=website_html, + user_prompt=user_prompt, + 
output_schema=self.llm_output_schema, + ) + else: + raise ValueError("llm_output_schema must be a Pydantic model class") + return response["result"] async def _arun( diff --git a/langchain_scrapegraph/tools/smartscraper.py b/langchain_scrapegraph/tools/smartscraper.py index a48030e..7b07915 100644 --- a/langchain_scrapegraph/tools/smartscraper.py +++ b/langchain_scrapegraph/tools/smartscraper.py @@ -37,6 +37,8 @@ class SmartScraperTool(BaseTool): Key init args: api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var. client: Optional pre-configured ScrapeGraph client instance. + llm_output_schema: Optional Pydantic model class used to structure the output. + If provided, the tool will ensure the output conforms to this schema. Instantiate: .. code-block:: python @@ -49,6 +51,15 @@ class SmartScraperTool(BaseTool): # Or provide API key directly tool = SmartScraperTool(api_key="your-api-key") + # Optionally, you can provide an output schema: + from pydantic import BaseModel, Field + + class WebsiteInfo(BaseModel): + title: str = Field(description="The main title") + description: str = Field(description="The main description") + + tool_with_schema = SmartScraperTool(llm_output_schema=WebsiteInfo) + Use the tool: .. code-block:: python @@ -58,10 +69,17 @@ class SmartScraperTool(BaseTool): }) print(result) + # Without schema: # { # "main_heading": "Example Domain", # "first_paragraph": "This domain is for use in illustrative examples..." # } + # + # With WebsiteInfo schema: + # { + # "title": "Example Domain", + # "description": "This domain is for use in illustrative examples..." + # } Async usage: .. 
code-block:: python @@ -80,6 +98,7 @@ class SmartScraperTool(BaseTool): return_direct: bool = True client: Optional[Client] = None api_key: str + llm_output_schema: Optional[Type[BaseModel]] = None @model_validator(mode="before") @classmethod @@ -101,10 +120,23 @@ def _run( """Use the tool to extract data from a website.""" if not self.client: raise ValueError("Client not initialized") - response = self.client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - ) + + if self.llm_output_schema is None: + response = self.client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + ) + elif isinstance(self.llm_output_schema, type) and issubclass( + self.llm_output_schema, BaseModel + ): + response = self.client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + output_schema=self.llm_output_schema, + ) + else: + raise ValueError("llm_output_schema must be a Pydantic model class") + return response["result"] async def _arun( From d5dae575921cfa14daa4ceb887b0d7d037d3773d Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 18 Dec 2024 16:34:05 +0000 Subject: [PATCH 3/4] ci(release): 1.1.0-beta.2 [skip ci] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## [1.1.0-beta.2](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.1.0-beta.1...v1.1.0-beta.2) (2024-12-18) ### Features * added pydantic output schema 🔍 ([34b5f10](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/34b5f1089059daa25c756b44da593a7c0db97aa9)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aa5cdbc..d0dfd1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.1.0-beta.2](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.1.0-beta.1...v1.1.0-beta.2) (2024-12-18) + + +### Features + +* added pydantic output schema 🔍 
([34b5f10](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/34b5f1089059daa25c756b44da593a7c0db97aa9)) + ## [1.1.0-beta.1](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.0.0...v1.1.0-beta.1) (2024-12-05) diff --git a/pyproject.toml b/pyproject.toml index 4ca4f7b..9548722 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain-scrapegraph" -version = "1.1.0b1" +version = "1.1.0b2" description = "Library for extracting structured data from websites using ScrapeGraphAI" authors = ["Marco Perini ", "Marco Vinciguerra ", "Lorenzo Padoan "] license = "MIT" From 9da0f957fa438ec6062ee3fb0cda2023ee262373 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 18 Dec 2024 16:48:13 +0000 Subject: [PATCH 4/4] ci(release): 1.2.0-beta.1 [skip ci] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## [1.2.0-beta.1](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.1.0...v1.2.0-beta.1) (2024-12-18) ### Features * added pydantic output schema 🔍 ([34b5f10](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/34b5f1089059daa25c756b44da593a7c0db97aa9)) ### Docs * added API reference ([d3ce047](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/d3ce0470f5c89da910540e42d71afdddd80e8c15)) ### CI * **release:** 1.1.0-beta.1 [skip ci] ([6222a16](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/6222a16a2fec477e7a6e610e0fdd4960e7ccd1b5)) * **release:** 1.1.0-beta.2 [skip ci] ([d5dae57](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/d5dae575921cfa14daa4ceb887b0d7d037d3773d)) --- CHANGELOG.md | 17 +++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc8a37a..9e8e15d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,20 @@ +## [1.2.0-beta.1](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.1.0...v1.2.0-beta.1) 
(2024-12-18) + + +### Features + +* added pydantic output schema 🔍 ([34b5f10](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/34b5f1089059daa25c756b44da593a7c0db97aa9)) + + +### Docs + +* added API reference ([d3ce047](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/d3ce0470f5c89da910540e42d71afdddd80e8c15)) + + +### CI + +* **release:** 1.1.0-beta.1 [skip ci] ([6222a16](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/6222a16a2fec477e7a6e610e0fdd4960e7ccd1b5)) +* **release:** 1.1.0-beta.2 [skip ci] ([d5dae57](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/d5dae575921cfa14daa4ceb887b0d7d037d3773d)) ## [1.1.0-beta.2](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.1.0-beta.1...v1.1.0-beta.2) (2024-12-18) diff --git a/pyproject.toml b/pyproject.toml index a903766..c655c3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain-scrapegraph" -version = "1.2.0b2" +version = "1.2.0b1" description = "Library for extracting structured data from websites using ScrapeGraphAI" authors = ["Marco Perini ", "Marco Vinciguerra ", "Lorenzo Padoan "] license = "MIT"