From e58e799bef422888fa49d802a46673d665fa4dfd Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Fri, 20 Dec 2024 09:50:41 -0300 Subject: [PATCH] fix(git.py): enhance GitLoaderComponent with improved file filtering and binary check - Refactor file filtering logic to utilize fnmatch for pattern matching. - Introduce a new method to check for binary files based on null byte detection. - Update content filtering to handle exceptions more gracefully. - Modify temporary directory cleanup to use rmdir instead of remove for better directory management. - Adjust load_documents method to utilize asyncio.to_thread for lazy loading of documents. --- .../base/langflow/components/git/git.py | 94 ++++++++++--------- 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/src/backend/base/langflow/components/git/git.py b/src/backend/base/langflow/components/git/git.py index d311c3912bcf..223867525d08 100644 --- a/src/backend/base/langflow/components/git/git.py +++ b/src/backend/base/langflow/components/git/git.py @@ -2,8 +2,8 @@ import re import tempfile from contextlib import asynccontextmanager +from fnmatch import fnmatch from pathlib import Path -from typing import TYPE_CHECKING import anyio from langchain_community.document_loaders.git import GitLoader @@ -12,9 +12,6 @@ from langflow.io import DropdownInput, MessageTextInput, Output from langflow.schema import Data -if TYPE_CHECKING: - from collections.abc import Awaitable, Callable - class GitLoaderComponent(Component): display_name = "Git" @@ -98,7 +95,7 @@ async def temp_clone_dir(self): yield temp_dir finally: if temp_dir: - await anyio.Path(temp_dir).remove() + await anyio.Path(temp_dir).rmdir() def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict: # Hide fields by default @@ -121,43 +118,55 @@ async def build_gitloader(self) -> GitLoader: file_filter_patterns = getattr(self, "file_filter", None) content_filter_pattern = getattr(self, "content_filter", None) - file_filters: list[Callable[[Path], bool] | Callable[[Path], Awaitable[bool]]] = [] - if file_filter_patterns: - patterns = [pattern.strip() for pattern in file_filter_patterns.split(",")] - - def file_filter(file_path: Path) -> bool: - if len(patterns) == 1 and patterns[0].startswith("!"): - return not file_path.match(patterns[0][1:]) - included = any(file_path.match(pattern) for pattern in patterns if not pattern.startswith("!")) - excluded = any(file_path.match(pattern[1:]) for pattern in patterns if pattern.startswith("!")) - return included and not excluded - - file_filters.append(file_filter) - - if content_filter_pattern: - content_regex = re.compile(content_filter_pattern) - - async def content_filter(file_path: Path) -> bool: - path = anyio.Path(file_path) - content = await path.read_text() - return bool(content_regex.search(content)) - - file_filters.append(content_filter) - - async def combined_filter(file_path: str) -> bool: - path = Path(file_path) - if await self.is_binary(file_path): + def is_binary(file_path: str | Path) -> bool: + """Check if a file is binary by looking for null bytes.""" + try: + with Path(file_path).open("rb") as file: + content = file.read(1024) + return b"\x00" in content + except Exception: # noqa: BLE001 + return True + + def combined_filter(file_path: str) -> bool: + try: + path = Path(file_path) + if is_binary(file_path): + return False + + # Apply file pattern filters + if file_filter_patterns: + patterns = [pattern.strip() for pattern in file_filter_patterns.split(",")] + path_str = str(path) + + # Handle single exclusion pattern + if len(patterns) == 1 and patterns[0].startswith("!"): + return not fnmatch(path_str, patterns[0][1:]) + + # Handle multiple patterns + included = any(fnmatch(path_str, pattern) for pattern in patterns if not pattern.startswith("!")) + excluded = any(fnmatch(path_str, pattern[1:]) for pattern in patterns if pattern.startswith("!")) + + # If no include patterns, treat as include all + if not any(not pattern.startswith("!") for pattern in patterns): + included = True + + if not included or excluded: + return False + + # Apply content filter + if content_filter_pattern: + try: + content_regex = re.compile(content_filter_pattern) + with Path(file_path).open() as file: + content = file.read() + if not content_regex.search(content): + return False + except (OSError, UnicodeDecodeError): + return False + + except Exception: # noqa: BLE001 return False - - results = [] - for f in file_filters: - if asyncio.iscoroutinefunction(f): - result = await f(path) - else: - result = f(path) - results.append(result) - - return all(results) if results else True + return True repo_source = getattr(self, "repo_source", None) if repo_source == "Local": @@ -178,7 +187,6 @@ async def combined_filter(file_path: str) -> bool: async def load_documents(self) -> list[Data]: gitloader = await self.build_gitloader() - documents = list(gitloader.lazy_load()) - data = [Data.from_document(doc) for doc in documents] + data = [Data.from_document(doc) for doc in await asyncio.to_thread(gitloader.lazy_load)] self.status = data return data