Skip to content

Commit

Permalink
refactor(git.py): improve pattern handling and content filtering in GitLoaderComponent
Browse files Browse the repository at this point in the history

- Refactored pattern handling to use a more descriptive variable name `pattern_list` for clarity.
- Enhanced content filtering by ensuring proper encoding when reading file content.
- Updated regex validation to include a test string check for better error handling.
- Removed unnecessary comments to streamline the code and improve readability.
  • Loading branch information
ogabrielluiz committed Dec 20, 2024
1 parent cf3dc29 commit ffbd2ea
Showing 1 changed file with 11 additions and 9 deletions.
20 changes: 11 additions & 9 deletions src/backend/base/langflow/components/git/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,22 +105,22 @@ def check_file_patterns(file_path: str | Path, patterns: str) -> bool:

path_str = str(file_path)
file_name = Path(path_str).name
patterns = [pattern.strip() for pattern in patterns.split(",") if pattern.strip()]
pattern_list: list[str] = [pattern.strip() for pattern in patterns.split(",") if pattern.strip()]

# If no valid patterns after stripping, treat as include all
if not patterns:
if not pattern_list:
return True

# Process exclusion patterns first
for pattern in patterns:
for pattern in pattern_list:
if pattern.startswith("!"):
# For exclusions, match against both full path and filename
exclude_pattern = pattern[1:]
if fnmatch(path_str, exclude_pattern) or fnmatch(file_name, exclude_pattern):
return False

# Then check inclusion patterns
include_patterns = [p for p in patterns if not p.startswith("!")]
include_patterns = [p for p in pattern_list if not p.startswith("!")]
# If no include patterns, treat as include all
if not include_patterns:
return True
Expand Down Expand Up @@ -151,14 +151,17 @@ def check_content_pattern(file_path: str | Path, pattern: str) -> bool:
# Use the MULTILINE flag to better handle text content
content_regex = re.compile(pattern, re.MULTILINE)
# Test the pattern with a simple string to catch syntax errors
content_regex.search("test\nstring")
test_str = "test\nstring"
if not content_regex.search(test_str):
# Pattern is valid but doesn't match test string
pass
except (re.error, TypeError, ValueError):
return False

# If not binary and regex is valid, check content
with Path(file_path).open() as file:
content = file.read()
return bool(content_regex.search(content))
with Path(file_path).open(encoding="utf-8") as file:
file_content = file.read()
return bool(content_regex.search(file_content))
except (OSError, UnicodeDecodeError):
return False

Expand Down Expand Up @@ -254,7 +257,6 @@ async def build_gitloader(self) -> GitLoader:

async def load_documents(self) -> list[Data]:
    """Load the documents produced by the Git loader and convert them to ``Data``.

    Builds the loader via ``build_gitloader``, converts every yielded document
    with ``Data.from_document``, stores the resulting list on ``self.status``
    and returns it.

    Returns:
        The list of converted documents (the same list assigned to
        ``self.status``).
    """
    gitloader = await self.build_gitloader()
    # Per the LangChain ``BaseLoader`` API, ``alazy_load`` returns an async
    # iterator, not an awaitable: it has to be consumed with ``async for``.
    # Awaiting it (``for doc in await gitloader.alazy_load()``) raises TypeError.
    data = [Data.from_document(doc) async for doc in gitloader.alazy_load()]
    self.status = data
    return data

0 comments on commit ffbd2ea

Please sign in to comment.