diff --git a/README.md b/README.md index 0e6e4ae..510ea82 100644 --- a/README.md +++ b/README.md @@ -1 +1,140 @@ -# langchain-scrapegraph \ No newline at end of file +# 🕷️🦜 langchain-scrapegraph + +[![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) +[![Python Support](https://img.shields.io/pypi/pyversions/langchain-scrapegraph.svg)](https://pypi.org/project/langchain-scrapegraph/) +[![Documentation](https://img.shields.io/badge/Documentation-Latest-green)](https://scrapegraphai.com/docs) + +Supercharge your LangChain agents with AI-powered web scraping capabilities. LangChain-ScrapeGraph provides a seamless integration between [LangChain](https://github.com/langchain-ai/langchain) and [ScrapeGraph AI](https://scrapegraphai.com), enabling your agents to extract structured data from websites using natural language. + +## 📦 Installation + +```bash +pip install langchain-scrapegraph +``` + +## 🛠️ Available Tools + +### 📝 MarkdownifyTool +Convert any webpage into clean, formatted markdown. + +```python +from langchain_scrapegraph.tools import MarkdownifyTool + +tool = MarkdownifyTool() +markdown = tool.invoke({"website_url": "https://example.com"}) + +print(markdown) +``` + +### 🔍 SmartScraperTool +Extract structured data from any webpage using natural language prompts. + +```python +from langchain_scrapegraph.tools import SmartScraperTool + +# Initialize the tool (uses SGAI_API_KEY from environment) +tool = SmartScraperTool() + +# Extract information using natural language +result = tool.invoke({ + "website_url": "https://www.example.com", + "user_prompt": "Extract the main heading and first paragraph" +}) + +print(result) +``` + +### 💻 LocalScraperTool +Extract information from HTML content using AI. + +```python +from langchain_scrapegraph.tools import LocalScraperTool + +tool = LocalScraperTool() +result = tool.invoke({ + "user_prompt": "Extract all contact information", + "website_html": "..." 
+}) + +print(result) + +``` + +## 🌟 Key Features + +- 🐦 **LangChain Integration**: Seamlessly works with LangChain agents and chains +- 🔍 **AI-Powered Extraction**: Use natural language to describe what data to extract +- 📊 **Structured Output**: Get clean, structured data ready for your agents +- 🔄 **Flexible Tools**: Choose from multiple specialized scraping tools +- ⚡ **Async Support**: Built-in support for async operations + +## 💡 Use Cases + +- 📖 **Research Agents**: Create agents that gather and analyze web data +- 📊 **Data Collection**: Automate structured data extraction from websites +- 📝 **Content Processing**: Convert web content into markdown for further processing +- 🔍 **Information Extraction**: Extract specific data points using natural language + +## 🤖 Example Agent + +```python +from langchain.agents import initialize_agent, AgentType +from langchain_scrapegraph.tools import SmartScraperTool +from langchain_openai import ChatOpenAI + +# Initialize tools +tools = [ + SmartScraperTool(), +] + +# Create an agent +agent = initialize_agent( + tools=tools, + llm=ChatOpenAI(temperature=0), + agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, + verbose=True +) + +# Use the agent +response = agent.run(""" + Visit example.com, make a summary of the content and extract the main heading and first paragraph +""") +``` + +## ⚙️ Configuration + +Set your ScrapeGraph API key in your environment: +```bash +export SGAI_API_KEY="your-api-key-here" +``` + +Or set it programmatically: +```python +import os +os.environ["SGAI_API_KEY"] = "your-api-key-here" +``` + +## 📚 Documentation + +- [API Documentation](https://scrapegraphai.com/docs) +- [LangChain Documentation](https://python.langchain.com/docs/get_started/introduction.html) +- [Examples](examples/) + +## 💬 Support & Feedback + +- 📧 Email: support@scrapegraphai.com +- 💻 GitHub Issues: [Create an issue](https://github.com/ScrapeGraphAI/langchain-scrapegraph/issues) +- 🌟 Feature Requests: [Request a 
feature](https://github.com/ScrapeGraphAI/langchain-scrapegraph/issues/new) + +## 📄 License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. + +## 🙏 Acknowledgments + +This project is built on top of: +- [LangChain](https://github.com/langchain-ai/langchain) +- [ScrapeGraph AI](https://scrapegraphai.com) + +--- + +Made with ❤️ by [ScrapeGraph AI](https://scrapegraphai.com) diff --git a/examples/agent_example.py b/examples/agent_example.py new file mode 100644 index 0000000..9e61fba --- /dev/null +++ b/examples/agent_example.py @@ -0,0 +1,57 @@ +""" +Remember to install the additional dependencies for this example to work: +pip install langchain-openai langchain +""" + +from dotenv import load_dotenv +from langchain.agents import AgentExecutor, create_openai_functions_agent +from langchain_core.messages import SystemMessage +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder +from langchain_openai import ChatOpenAI + +from langchain_scrapegraph.tools import ( + GetCreditsTool, + LocalScraperTool, + SmartScraperTool, +) + +load_dotenv() + +# Initialize the tools +tools = [ + SmartScraperTool(), + LocalScraperTool(), + GetCreditsTool(), +] + +# Create the prompt template +prompt = ChatPromptTemplate.from_messages( + [ + SystemMessage( + content=( + "You are a helpful AI assistant that can analyze websites and extract information. " + "You have access to tools that can help you scrape and process web content. " + "Always explain what you're doing before using a tool." 
+ ) + ), + MessagesPlaceholder(variable_name="chat_history", optional=True), + ("user", "{input}"), + MessagesPlaceholder(variable_name="agent_scratchpad"), + ] +) + +# Initialize the LLM +llm = ChatOpenAI(temperature=0) + +# Create the agent +agent = create_openai_functions_agent(llm, tools, prompt) + +# Create the executor +agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True) + +# Example usage +query = """Extract the main products from https://www.scrapegraphai.com/""" + +print("\nQuery:", query, "\n") +response = agent_executor.invoke({"input": query}) +print("\nFinal Response:", response["output"]) diff --git a/examples/get_credits_tool.py b/examples/get_credits_tool.py index 50d6139..0645fe9 100644 --- a/examples/get_credits_tool.py +++ b/examples/get_credits_tool.py @@ -1,9 +1,13 @@ +from scrapegraph_py.logger import sgai_logger + from langchain_scrapegraph.tools import GetCreditsTool -# Will automatically get SGAI_API_KEY from environment, or set it manually +sgai_logger.set_logging(level="INFO") + +# Will automatically get SGAI_API_KEY from environment tool = GetCreditsTool() -credits = tool.run() -print("\nCredits Information:") -print(f"Remaining Credits: {credits['remaining_credits']}") -print(f"Total Credits Used: {credits['total_credits_used']}") +# Use the tool +credits = tool.invoke({}) + +print(credits) diff --git a/examples/localscraper_tool.py b/examples/localscraper_tool.py new file mode 100644 index 0000000..a8df8ee --- /dev/null +++ b/examples/localscraper_tool.py @@ -0,0 +1,28 @@ +from scrapegraph_py.logger import sgai_logger + +from langchain_scrapegraph.tools import LocalScraperTool + +sgai_logger.set_logging(level="INFO") + +# Will automatically get SGAI_API_KEY from environment +tool = LocalScraperTool() + +# Example website and prompt +html_content = """ + + +

Company Name

+

We are a technology company focused on AI solutions.

+
+

Email: contact@example.com

+

Phone: (555) 123-4567

+
+ + +""" +user_prompt = "Make a summary of the webpage and extract the email and phone number" + +# Use the tool +result = tool.invoke({"website_html": html_content, "user_prompt": user_prompt}) + +print(result) diff --git a/examples/markdownify_tool.py b/examples/markdownify_tool.py new file mode 100644 index 0000000..32dc76f --- /dev/null +++ b/examples/markdownify_tool.py @@ -0,0 +1,16 @@ +from scrapegraph_py.logger import sgai_logger + +from langchain_scrapegraph.tools import MarkdownifyTool + +sgai_logger.set_logging(level="INFO") + +# Will automatically get SGAI_API_KEY from environment +tool = MarkdownifyTool() + +# Example website and prompt +website_url = "https://www.example.com" + +# Use the tool +result = tool.invoke({"website_url": website_url}) + +print(result) diff --git a/examples/smartscraper_tool.py b/examples/smartscraper_tool.py index f222c69..9f31ba1 100644 --- a/examples/smartscraper_tool.py +++ b/examples/smartscraper_tool.py @@ -1,15 +1,17 @@ -from langchain_scrapegraph.tools import SmartscraperTool +from scrapegraph_py.logger import sgai_logger -# Will automatically get SGAI_API_KEY from environment, or set it manually -tool = SmartscraperTool() +from langchain_scrapegraph.tools import SmartScraperTool + +sgai_logger.set_logging(level="INFO") + +# Will automatically get SGAI_API_KEY from environment +tool = SmartScraperTool() # Example website and prompt website_url = "https://www.example.com" user_prompt = "Extract the main heading and first paragraph from this webpage" -# Use the tool synchronously -result = tool.run({"user_prompt": user_prompt, "website_url": website_url}) +# Use the tool +result = tool.invoke({"website_url": website_url, "user_prompt": user_prompt}) -print("\nExtraction Results:") -print(f"Main Heading: {result['main_heading']}") -print(f"First Paragraph: {result['first_paragraph']}") +print(result) diff --git a/langchain_scrapegraph/tools/__init__.py b/langchain_scrapegraph/tools/__init__.py index 76c0b8e..a61f301 
100644 --- a/langchain_scrapegraph/tools/__init__.py +++ b/langchain_scrapegraph/tools/__init__.py @@ -1,4 +1,6 @@ from .credits import GetCreditsTool -from .smartscraper import SmartscraperTool +from .localscraper import LocalScraperTool +from .markdownify import MarkdownifyTool +from .smartscraper import SmartScraperTool -__all__ = ["SmartscraperTool", "GetCreditsTool"] +__all__ = ["SmartScraperTool", "GetCreditsTool", "MarkdownifyTool", "LocalScraperTool"] diff --git a/langchain_scrapegraph/tools/credits.py b/langchain_scrapegraph/tools/credits.py index f57ad85..d4ea94e 100644 --- a/langchain_scrapegraph/tools/credits.py +++ b/langchain_scrapegraph/tools/credits.py @@ -7,25 +7,72 @@ from langchain_core.tools import BaseTool from langchain_core.utils import get_from_dict_or_env from pydantic import model_validator -from scrapegraph_py import SyncClient +from scrapegraph_py import Client class GetCreditsTool(BaseTool): + """Tool for checking remaining credits on your ScrapeGraph AI account. + + Setup: + Install ``langchain-scrapegraph`` python package: + + .. code-block:: bash + + pip install langchain-scrapegraph + + Get your API key from ScrapeGraph AI (https://scrapegraphai.com) + and set it as an environment variable: + + .. code-block:: bash + + export SGAI_API_KEY="your-api-key" + + Key init args: + api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var. + client: Optional pre-configured ScrapeGraph client instance. + + Instantiate: + .. code-block:: python + + from langchain_scrapegraph.tools import GetCreditsTool + + # Will automatically get SGAI_API_KEY from environment + tool = GetCreditsTool() + + # Or provide API key directly + tool = GetCreditsTool(api_key="your-api-key") + + Use the tool: + .. code-block:: python + + result = tool.invoke({}) + + print(result) + # { + # "remaining_credits": 100, + # "total_credits_used": 50 + # } + + Async usage: + .. 
code-block:: python + + result = await tool.ainvoke({}) + """ + name: str = "GetCredits" description: str = ( "Get the current credits available in your ScrapeGraph AI account" ) return_direct: bool = True - client: Optional[SyncClient] = None + client: Optional[Client] = None api_key: str - testing: bool = False @model_validator(mode="before") @classmethod def validate_environment(cls, values: Dict) -> Dict: """Validate that api key exists in environment.""" values["api_key"] = get_from_dict_or_env(values, "api_key", "SGAI_API_KEY") - values["client"] = SyncClient(api_key=values["api_key"]) + values["client"] = Client(api_key=values["api_key"]) return values def __init__(self, **data: Any): diff --git a/langchain_scrapegraph/tools/localscraper.py b/langchain_scrapegraph/tools/localscraper.py new file mode 100644 index 0000000..0b7382c --- /dev/null +++ b/langchain_scrapegraph/tools/localscraper.py @@ -0,0 +1,137 @@ +from typing import Any, Dict, Optional, Type + +from langchain_core.callbacks import ( + AsyncCallbackManagerForToolRun, + CallbackManagerForToolRun, +) +from langchain_core.tools import BaseTool +from langchain_core.utils import get_from_dict_or_env +from pydantic import BaseModel, Field, model_validator +from scrapegraph_py import Client + + +class LocalscraperInput(BaseModel): + user_prompt: str = Field( + description="Prompt describing what to extract from the webpage and how to structure the output" + ) + website_html: str = Field(description="HTML of the webpage to extract data from") + + +class LocalScraperTool(BaseTool): + """Tool for extracting structured data from a local HTML file using ScrapeGraph AI. + + Setup: + Install ``langchain-scrapegraph`` python package: + + .. code-block:: bash + + pip install langchain-scrapegraph + + Get your API key from ScrapeGraph AI (https://scrapegraphai.com) + and set it as an environment variable: + + .. 
code-block:: bash + + export SGAI_API_KEY="your-api-key" + + Key init args: + api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var. + client: Optional pre-configured ScrapeGraph client instance. + + Instantiate: + .. code-block:: python + + from langchain_scrapegraph.tools import LocalScraperTool + + # Will automatically get SGAI_API_KEY from environment + tool = LocalScraperTool() + + # Or provide API key directly + tool = LocalScraperTool(api_key="your-api-key") + + Use the tool: + .. code-block:: python + + html_content = ''' + + +

Company Name

+

We are a technology company focused on AI solutions.

+
+

Email: contact@example.com

+

Phone: (555) 123-4567

+
+ + + ''' + + result = tool.invoke({ + "user_prompt": "Extract company description and contact info", + "website_html": html_content + }) + + print(result) + # { + # "description": "We are a technology company focused on AI solutions", + # "contact": { + # "email": "contact@example.com", + # "phone": "(555) 123-4567" + # } + # } + + Async usage: + .. code-block:: python + + result = await tool.ainvoke({ + "user_prompt": "Extract contact information", + "website_html": html_content + }) + """ + + name: str = "LocalScraper" + description: str = ( + "Useful when you need to extract structured data from a HTML webpage, applying also some reasoning using LLM, by providing an HTML string and an extraction prompt" + ) + args_schema: Type[BaseModel] = LocalscraperInput + return_direct: bool = True + client: Optional[Client] = None + api_key: str + + @model_validator(mode="before") + @classmethod + def validate_environment(cls, values: Dict) -> Dict: + """Validate that api key exists in environment.""" + values["api_key"] = get_from_dict_or_env(values, "api_key", "SGAI_API_KEY") + values["client"] = Client(api_key=values["api_key"]) + return values + + def __init__(self, **data: Any): + super().__init__(**data) + + def _run( + self, + user_prompt: str, + website_html: str, + run_manager: Optional[CallbackManagerForToolRun] = None, + ) -> dict: + """Use the tool to extract data from a website.""" + if not self.client: + raise ValueError("Client not initialized") + response = self.client.localscraper( + website_html=website_html, + user_prompt=user_prompt, + ) + return response["result"] + + async def _arun( + self, + user_prompt: str, + website_html: str, + run_manager: Optional[AsyncCallbackManagerForToolRun] = None, + ) -> str: + """Use the tool asynchronously.""" + return self._run( + user_prompt, + website_html, + run_manager=run_manager.get_sync() if run_manager else None, + ) diff --git a/langchain_scrapegraph/tools/markdownify.py 
b/langchain_scrapegraph/tools/markdownify.py new file mode 100644 index 0000000..4750f5b --- /dev/null +++ b/langchain_scrapegraph/tools/markdownify.py @@ -0,0 +1,109 @@ +from typing import Any, Dict, Optional, Type + +from langchain_core.callbacks import ( + AsyncCallbackManagerForToolRun, + CallbackManagerForToolRun, +) +from langchain_core.tools import BaseTool +from langchain_core.utils import get_from_dict_or_env +from pydantic import BaseModel, Field, model_validator +from scrapegraph_py import Client + + +class MarkdownifyInput(BaseModel): + website_url: str = Field(description="Url of the website to convert to Markdown") + + +class MarkdownifyTool(BaseTool): + """Tool for converting webpages to Markdown format using ScrapeGraph AI. + + Setup: + Install ``langchain-scrapegraph`` python package: + + .. code-block:: bash + + pip install langchain-scrapegraph + + Get your API key from ScrapeGraph AI (https://scrapegraphai.com) + and set it as an environment variable: + + .. code-block:: bash + + export SGAI_API_KEY="your-api-key" + + Key init args: + api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var. + client: Optional pre-configured ScrapeGraph client instance. + + Instantiate: + .. code-block:: python + + from langchain_scrapegraph.tools import MarkdownifyTool + + # Will automatically get SGAI_API_KEY from environment + tool = MarkdownifyTool() + + # Or provide API key directly + tool = MarkdownifyTool(api_key="your-api-key") + + Use the tool: + .. code-block:: python + + result = tool.invoke({ + "website_url": "https://example.com" + }) + + print(result) + # # Example Domain + # + # This domain is for use in illustrative examples... + + Async usage: + .. 
code-block:: python + + result = await tool.ainvoke({ + "website_url": "https://example.com" + }) + """ + + name: str = "Markdownify" + description: str = ( + "Useful when you need to convert a webpage to Markdown, given a URL as input" + ) + args_schema: Type[BaseModel] = MarkdownifyInput + return_direct: bool = True + client: Optional[Client] = None + api_key: str + + @model_validator(mode="before") + @classmethod + def validate_environment(cls, values: Dict) -> Dict: + """Validate that api key exists in environment.""" + values["api_key"] = get_from_dict_or_env(values, "api_key", "SGAI_API_KEY") + values["client"] = Client(api_key=values["api_key"]) + return values + + def __init__(self, **data: Any): + super().__init__(**data) + + def _run( + self, + website_url: str, + run_manager: Optional[CallbackManagerForToolRun] = None, + ) -> dict: + """Use the tool to extract data from a website.""" + if not self.client: + raise ValueError("Client not initialized") + response = self.client.markdownify(website_url=website_url) + return response["result"] + + async def _arun( + self, + website_url: str, + run_manager: Optional[AsyncCallbackManagerForToolRun] = None, + ) -> str: + """Use the tool asynchronously.""" + return self._run( + website_url, + run_manager=run_manager.get_sync() if run_manager else None, + ) diff --git a/langchain_scrapegraph/tools/smartscraper.py b/langchain_scrapegraph/tools/smartscraper.py index a292a99..a48030e 100644 --- a/langchain_scrapegraph/tools/smartscraper.py +++ b/langchain_scrapegraph/tools/smartscraper.py @@ -7,33 +7,86 @@ from langchain_core.tools import BaseTool from langchain_core.utils import get_from_dict_or_env from pydantic import BaseModel, Field, model_validator -from scrapegraph_py import SyncClient +from scrapegraph_py import Client -class SmartscraperInput(BaseModel): +class SmartScraperInput(BaseModel): user_prompt: str = Field( - description="Prompt describing what to extract from the website and how to structure the 
output" + description="Prompt describing what to extract from the webpage and how to structure the output" ) - website_url: str = Field(description="Url of the website to extract data from") + website_url: str = Field(description="Url of the webpage to extract data from") -class SmartscraperTool(BaseTool): - name: str = "Smartscraper" +class SmartScraperTool(BaseTool): + """Tool for extracting structured data from websites using ScrapeGraph AI. + + Setup: + Install ``langchain-scrapegraph`` python package: + + .. code-block:: bash + + pip install langchain-scrapegraph + + Get your API key from ScrapeGraph AI (https://scrapegraphai.com) + and set it as an environment variable: + + .. code-block:: bash + + export SGAI_API_KEY="your-api-key" + + Key init args: + api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var. + client: Optional pre-configured ScrapeGraph client instance. + + Instantiate: + .. code-block:: python + + from langchain_scrapegraph.tools import SmartScraperTool + + # Will automatically get SGAI_API_KEY from environment + tool = SmartScraperTool() + + # Or provide API key directly + tool = SmartScraperTool(api_key="your-api-key") + + Use the tool: + .. code-block:: python + + result = tool.invoke({ + "user_prompt": "Extract the main heading and first paragraph", + "website_url": "https://example.com" + }) + + print(result) + # { + # "main_heading": "Example Domain", + # "first_paragraph": "This domain is for use in illustrative examples..." + # } + + Async usage: + .. 
code-block:: python + + result = await tool.ainvoke({ + "user_prompt": "Extract the main heading", + "website_url": "https://example.com" + }) + """ + + name: str = "SmartScraper" description: str = ( - "Useful for when you need to extract structured data from a website, applying also some preprocessing reasoning using LLM" + "Useful when you need to extract structured data from a webpage, applying also some reasoning using LLM, by providing a webpage URL and an extraction prompt" ) - args_schema: Type[BaseModel] = SmartscraperInput + args_schema: Type[BaseModel] = SmartScraperInput return_direct: bool = True - client: Optional[SyncClient] = None + client: Optional[Client] = None api_key: str - testing: bool = False @model_validator(mode="before") @classmethod def validate_environment(cls, values: Dict) -> Dict: """Validate that api key exists in environment.""" values["api_key"] = get_from_dict_or_env(values, "api_key", "SGAI_API_KEY") - values["client"] = SyncClient(api_key=values["api_key"]) + values["client"] = Client(api_key=values["api_key"]) return values def __init__(self, **data: Any): diff --git a/pyproject.toml b/pyproject.toml index b59b186..99a268d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ packages = [{include = "langchain_scrapegraph"}] [tool.poetry.dependencies] python = ">=3.10,<4.0" langchain-core = "^0.3.21" -scrapegraph-py = "^1.5.0" +scrapegraph-py = "^1.7.0" [tool.poetry.group.test.dependencies] pytest = "^8.3.4" diff --git a/tests/integration_tests/test_tools.py b/tests/integration_tests/test_tools.py index 97a89e6..13fbf9d 100644 --- a/tests/integration_tests/test_tools.py +++ b/tests/integration_tests/test_tools.py @@ -13,16 +13,21 @@ from dotenv import load_dotenv from langchain_tests.integration_tests import ToolsIntegrationTests -from langchain_scrapegraph.tools import GetCreditsTool, SmartscraperTool +from langchain_scrapegraph.tools import ( + GetCreditsTool, + LocalScraperTool, + MarkdownifyTool, + 
SmartScraperTool, +) # Load environment variables from .env file load_dotenv() -class TestSmartscraperToolIntegration(ToolsIntegrationTests): +class TestSmartScraperToolIntegration(ToolsIntegrationTests): @property - def tool_constructor(self) -> Type[SmartscraperTool]: - return SmartscraperTool + def tool_constructor(self) -> Type[SmartScraperTool]: + return SmartScraperTool @property def tool_constructor_params(self) -> dict: @@ -53,4 +58,52 @@ def tool_constructor_params(self) -> dict: @property def tool_invoke_params_example(self) -> dict: - return {} # GetCredits doesn't require any parameters + return {} + + +class TestMarkdownifyToolIntegration(ToolsIntegrationTests): + @property + def tool_constructor(self) -> Type[MarkdownifyTool]: + return MarkdownifyTool + + @property + def tool_constructor_params(self) -> dict: + api_key = os.getenv("SGAI_API_KEY") + if not api_key: + pytest.skip("SGAI_API_KEY environment variable not set") + return {"api_key": api_key} + + @property + def tool_invoke_params_example(self) -> dict: + return {"website_url": "https://example.com"} + + +class TestLocalScraperToolIntegration(ToolsIntegrationTests): + @property + def tool_constructor(self) -> Type[LocalScraperTool]: + return LocalScraperTool + + @property + def tool_constructor_params(self) -> dict: + api_key = os.getenv("SGAI_API_KEY") + if not api_key: + pytest.skip("SGAI_API_KEY environment variable not set") + return {"api_key": api_key} + + @property + def tool_invoke_params_example(self) -> dict: + return { + "user_prompt": "Make a summary and extract contact info", + "website_html": """ + + +

Company Name

+

We are a technology company focused on AI solutions.

+
+

Email: contact@example.com

+

Phone: (555) 123-4567

+
+ + + """, + } diff --git a/tests/unit_tests/mocks.py b/tests/unit_tests/mocks.py index 2da0174..740b0d2 100644 --- a/tests/unit_tests/mocks.py +++ b/tests/unit_tests/mocks.py @@ -4,7 +4,7 @@ from pydantic import BaseModel, Field -class MockSyncClient: +class MockClient: def __init__(self, api_key: str = None, *args, **kwargs): """Initialize with mock methods that return proper response structures""" self._api_key = api_key @@ -27,21 +27,53 @@ def get_credits(self) -> dict: """Mock get_credits method""" return {"remaining_credits": 50, "total_credits_used": 543} + def markdownify(self, website_url: str) -> dict: + """Mock markdownify method""" + return { + "request_id": "test-id", + "status": "completed", + "website_url": website_url, + "result": "# Example Domain\n\nTest paragraph", + "error": "", + } + + def localscraper(self, website_html: str, user_prompt: str) -> dict: + """Mock localscraper method""" + return { + "request_id": "test-id", + "status": "completed", + "user_prompt": user_prompt, + "result": { + "summary": "This is a technology company", + "contact": {"email": "contact@example.com", "phone": "(555) 123-4567"}, + }, + "error": "", + } + def close(self) -> None: """Mock close method""" pass -class MockSmartscraperInput(BaseModel): +class MockSmartScraperInput(BaseModel): user_prompt: str = Field(description="Test prompt") website_url: str = Field(description="Test URL") -class MockSmartscraperTool(BaseTool): - name: str = "Smartscraper" +class MockMarkdownifyInput(BaseModel): + website_url: str = Field(description="Test URL") + + +class MockLocalScraperInput(BaseModel): + user_prompt: str = Field(description="Test prompt") + website_html: str = Field(description="Test HTML") + + +class MockSmartScraperTool(BaseTool): + name: str = "SmartScraper" description: str = "Test description" - args_schema: type[BaseModel] = MockSmartscraperInput - client: Optional[MockSyncClient] = None + args_schema: type[BaseModel] = MockSmartScraperInput + client: 
Optional[MockClient] = None api_key: str def _run(self, **kwargs: Any) -> Dict: @@ -51,8 +83,33 @@ def _run(self, **kwargs: Any) -> Dict: class MockGetCreditsTool(BaseTool): name: str = "GetCredits" description: str = "Test description" - client: Optional[MockSyncClient] = None + client: Optional[MockClient] = None api_key: str def _run(self, **kwargs: Any) -> Dict: return {"remaining_credits": 50, "total_credits_used": 543} + + +class MockMarkdownifyTool(BaseTool): + name: str = "Markdownify" + description: str = "Test description" + args_schema: type[BaseModel] = MockMarkdownifyInput + client: Optional[MockClient] = None + api_key: str + + def _run(self, **kwargs: Any) -> str: + return "# Example Domain\n\nTest paragraph" + + +class MockLocalScraperTool(BaseTool): + name: str = "LocalScraper" + description: str = "Test description" + args_schema: type[BaseModel] = MockLocalScraperInput + client: Optional[MockClient] = None + api_key: str + + def _run(self, **kwargs: Any) -> Dict: + return { + "summary": "This is a technology company", + "contact": {"email": "contact@example.com", "phone": "(555) 123-4567"}, + } diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index fe434e3..2ac0876 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -3,24 +3,29 @@ from langchain_tests.unit_tests import ToolsUnitTests -from langchain_scrapegraph.tools import GetCreditsTool, SmartscraperTool +from langchain_scrapegraph.tools import ( + GetCreditsTool, + LocalScraperTool, + MarkdownifyTool, + SmartScraperTool, +) from tests.unit_tests.mocks import ( + MockClient, MockGetCreditsTool, - MockSmartscraperTool, - MockSyncClient, + MockLocalScraperTool, + MockMarkdownifyTool, + MockSmartScraperTool, ) -class TestSmartscraperToolUnit(ToolsUnitTests): +class TestSmartScraperToolUnit(ToolsUnitTests): @property - def tool_constructor(self) -> Type[SmartscraperTool]: - return MockSmartscraperTool + def tool_constructor(self) -> 
Type[SmartScraperTool]: + return MockSmartScraperTool @property def tool_constructor_params(self) -> dict: - with patch( - "langchain_scrapegraph.tools.smartscraper.SyncClient", MockSyncClient - ): + with patch("langchain_scrapegraph.tools.smartscraper.Client", MockClient): return {"api_key": "sgai-test-api-key"} @property @@ -38,9 +43,42 @@ def tool_constructor(self) -> Type[GetCreditsTool]: @property def tool_constructor_params(self) -> dict: - with patch("langchain_scrapegraph.tools.credits.SyncClient", MockSyncClient): + with patch("langchain_scrapegraph.tools.credits.Client", MockClient): return {"api_key": "sgai-test-api-key"} @property def tool_invoke_params_example(self) -> dict: return {} + + +class TestMarkdownifyToolUnit(ToolsUnitTests): + @property + def tool_constructor(self) -> Type[MarkdownifyTool]: + return MockMarkdownifyTool + + @property + def tool_constructor_params(self) -> dict: + with patch("langchain_scrapegraph.tools.markdownify.Client", MockClient): + return {"api_key": "sgai-test-api-key"} + + @property + def tool_invoke_params_example(self) -> dict: + return {"website_url": "https://example.com"} + + +class TestLocalScraperToolUnit(ToolsUnitTests): + @property + def tool_constructor(self) -> Type[LocalScraperTool]: + return MockLocalScraperTool + + @property + def tool_constructor_params(self) -> dict: + with patch("langchain_scrapegraph.tools.localscraper.Client", MockClient): + return {"api_key": "sgai-test-api-key"} + + @property + def tool_invoke_params_example(self) -> dict: + return { + "user_prompt": "Make a summary and extract contact info", + "website_html": "

Test

", + }