From f40ca8c9092c3c16262160b7064968e4ba10b378 Mon Sep 17 00:00:00 2001
From: paulpaliychuk
Date: Tue, 10 Dec 2024 19:37:11 -0500
Subject: [PATCH 1/4] fix: Clean input before passing it to the llm

---
 graphiti_core/llm_client/client.py        | 25 ++++++++++++++
 graphiti_core/llm_client/openai_client.py |  1 +
 tests/llm_client/test_client.py           | 41 +++++++++++++++++++++++
 3 files changed, 67 insertions(+)
 create mode 100644 tests/llm_client/test_client.py

diff --git a/graphiti_core/llm_client/client.py b/graphiti_core/llm_client/client.py
index 22cc3795..f2a19208 100644
--- a/graphiti_core/llm_client/client.py
+++ b/graphiti_core/llm_client/client.py
@@ -67,6 +67,28 @@ def __init__(self, config: LLMConfig | None, cache: bool = False):
         else None,
         reraise=True,
     )
+    def _clean_input(self, input: str) -> str:
+        """Clean input string of invalid unicode and control characters.
+
+        Args:
+            input: Raw input string to be cleaned
+
+        Returns:
+            Cleaned string safe for LLM processing
+        """
+        # Clean any invalid Unicode
+        cleaned = input.encode('utf-8', errors='ignore').decode('utf-8')
+
+        # Remove zero-width characters and other invisible unicode
+        zero_width = '\u200b\u200c\u200d\ufeff\u2060'
+        for char in zero_width:
+            cleaned = cleaned.replace(char, '')
+
+        # Remove control characters except newlines, returns, and tabs
+        cleaned = ''.join(char for char in cleaned if ord(char) >= 32 or char in '\n\r\t')
+
+        return cleaned
+
     async def _generate_response_with_retry(
         self, messages: list[Message], response_model: type[BaseModel] | None = None
     ) -> dict[str, typing.Any]:
@@ -106,6 +128,9 @@ async def generate_response(
             logger.debug(f'Cache hit for {cache_key}')
             return cached_response
 
+        for message in messages:
+            message.content = self._clean_input(message.content)
+
         response = await self._generate_response_with_retry(messages, response_model)
 
         if self.cache_enabled:
diff --git a/graphiti_core/llm_client/openai_client.py b/graphiti_core/llm_client/openai_client.py
index d8b02e8b..7804e06f 100644
--- a/graphiti_core/llm_client/openai_client.py
+++ b/graphiti_core/llm_client/openai_client.py
@@ -88,6 +88,7 @@ async def _generate_response(
     ) -> dict[str, typing.Any]:
         openai_messages: list[ChatCompletionMessageParam] = []
         for m in messages:
+            m.content = self._clean_input(m.content)
             if m.role == 'user':
                 openai_messages.append({'role': 'user', 'content': m.content})
             elif m.role == 'system':
diff --git a/tests/llm_client/test_client.py b/tests/llm_client/test_client.py
new file mode 100644
index 00000000..4a2fbd7c
--- /dev/null
+++ b/tests/llm_client/test_client.py
@@ -0,0 +1,41 @@
+from graphiti_core.llm_client.client import LLMClient
+from graphiti_core.llm_client.config import LLMConfig
+
+
+class TestLLMClient(LLMClient):
+    """Concrete implementation of LLMClient for testing"""
+
+    async def _generate_response(self, messages, response_model=None):
+        return {'content': 'test'}
+
+
+def test_clean_input():
+    client = TestLLMClient(LLMConfig())
+
+    test_cases = [
+        # Basic text should remain unchanged
+        ('Hello World', 'Hello World'),
+        # Control characters should be removed
+        ('Hello\x00World', 'HelloWorld'),
+        # Newlines, tabs, returns should be preserved
+        ('Hello\nWorld\tTest\r', 'Hello\nWorld\tTest\r'),
+        # Invalid Unicode should be removed
+        ('Hello\udcdeWorld', 'HelloWorld'),
+        # Zero-width characters should be removed
+        ('Hello\u200bWorld', 'HelloWorld'),
+        ('Test\ufeffWord', 'TestWord'),
+        # Multiple issues combined
+        ('Hello\x00\u200b\nWorld\udcde', 'Hello\nWorld'),
+        # Empty string should remain empty
+        ('', ''),
+        # Form feed and other control characters from the error case
+        ('{"edges":[{"relation_typ...\f\x04Hn\\?"}]}', '{"edges":[{"relation_typ...Hn\\?"}]}'),
+        # More specific control character tests
+        ('Hello\x0cWorld', 'HelloWorld'),  # form feed \f
+        ('Hello\x04World', 'HelloWorld'),  # end of transmission
+        # Combined JSON-like string with control characters
+        ('{"test": "value\f\x00\x04"}', '{"test": "value"}'),
+    ]
+
+    for input_str, expected in test_cases:
+        assert client._clean_input(input_str) == expected, f'Failed for input: {repr(input_str)}'
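
Note on PATCH 1/4: _clean_input works in three stages: a UTF-8 encode/decode round-trip with errors='ignore' that drops code points which cannot be encoded (such as the lone surrogate '\udcde' exercised in the tests), removal of a fixed set of zero-width characters, and a filter that discards remaining control characters while preserving '\n', '\r', and '\t'. A minimal standalone sketch of the same approach (the free function clean_input below is illustrative only, not part of the graphiti API):

    def clean_input(raw: str) -> str:
        # Round-trip through UTF-8; errors='ignore' drops unencodable
        # code points such as lone surrogates.
        cleaned = raw.encode('utf-8', errors='ignore').decode('utf-8')
        # Strip zero-width and other invisible characters.
        for ch in '\u200b\u200c\u200d\ufeff\u2060':
            cleaned = cleaned.replace(ch, '')
        # Drop control characters, keeping newlines, returns, and tabs.
        return ''.join(c for c in cleaned if ord(c) >= 32 or c in '\n\r\t')

    assert clean_input('Hello\x00\u200b\nWorld\udcde') == 'Hello\nWorld'
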
From 96e63778a98ed40f79e3a8eb0741ff43e72747b0 Mon Sep 17 00:00:00 2001
From: paulpaliychuk
Date: Tue, 10 Dec 2024 19:39:20 -0500
Subject: [PATCH 2/4] chore: Add license

---
 tests/llm_client/test_client.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/llm_client/test_client.py b/tests/llm_client/test_client.py
index 4a2fbd7c..2716395d 100644
--- a/tests/llm_client/test_client.py
+++ b/tests/llm_client/test_client.py
@@ -1,3 +1,19 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 from graphiti_core.llm_client.client import LLMClient
 from graphiti_core.llm_client.config import LLMConfig
 

From d8fa84e0e94e6e5a353747d2aafaeab4da0f7bc9 Mon Sep 17 00:00:00 2001
From: paulpaliychuk
Date: Tue, 10 Dec 2024 19:41:51 -0500
Subject: [PATCH 3/4] fix: typo

---
 graphiti_core/llm_client/client.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/graphiti_core/llm_client/client.py b/graphiti_core/llm_client/client.py
index f2a19208..ac01bed7 100644
--- a/graphiti_core/llm_client/client.py
+++ b/graphiti_core/llm_client/client.py
@@ -56,17 +56,7 @@ def __init__(self, config: LLMConfig | None, cache: bool = False):
         self.cache_enabled = cache
         self.cache_dir = Cache(DEFAULT_CACHE_DIR)  # Create a cache directory
 
-    @retry(
-        stop=stop_after_attempt(4),
-        wait=wait_random_exponential(multiplier=10, min=5, max=120),
-        retry=retry_if_exception(is_server_or_retry_error),
-        after=lambda retry_state: logger.warning(
-            f'Retrying {retry_state.fn.__name__ if retry_state.fn else "function"} after {retry_state.attempt_number} attempts...'
-        )
-        if retry_state.attempt_number > 1
-        else None,
-        reraise=True,
-    )
+
     def _clean_input(self, input: str) -> str:
         """Clean input string of invalid unicode and control characters.
 
@@ -89,6 +79,17 @@ def _clean_input(self, input: str) -> str:
 
         return cleaned
 
+    @retry(
+        stop=stop_after_attempt(4),
+        wait=wait_random_exponential(multiplier=10, min=5, max=120),
+        retry=retry_if_exception(is_server_or_retry_error),
+        after=lambda retry_state: logger.warning(
+            f'Retrying {retry_state.fn.__name__ if retry_state.fn else "function"} after {retry_state.attempt_number} attempts...'
+        )
+        if retry_state.attempt_number > 1
+        else None,
+        reraise=True,
+    )
     async def _generate_response_with_retry(
         self, messages: list[Message], response_model: type[BaseModel] | None = None
     ) -> dict[str, typing.Any]:
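
Note on PATCH 3/4: the "typo" being fixed is a decorator binding bug. PATCH 1/4 inserted _clean_input between the @retry(...) decorator and _generate_response_with_retry, and a Python decorator attaches to whichever def immediately follows it, so the retry policy ended up wrapping _clean_input while the network call lost its retries. The hunks above move the decorator back onto _generate_response_with_retry. A minimal sketch of the failure mode (all names here are illustrative):

    def logged(fn):
        def wrapper(*args, **kwargs):
            print(f'calling {fn.__name__}')
            return fn(*args, **kwargs)
        return wrapper

    @logged
    def helper():  # the decorator binds here, to the nearest def below it
        return 1

    def network_call():  # this function is left undecorated
        return 2

    helper()        # prints 'calling helper'
    network_call()  # prints nothing: the wrapper (logging, or retry) is gone
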
From e852493a0d1e5b7d405515e63461d0e8290f778e Mon Sep 17 00:00:00 2001
From: paulpaliychuk
Date: Tue, 10 Dec 2024 21:23:57 -0500
Subject: [PATCH 4/4] chore: Bump graphiti version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f67fc081..88e2b7ec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "graphiti-core"
-version = "0.5.0pre4"
+version = "0.5.0pre5"
 description = "A temporal graph building library"
 authors = [
     "Paul Paliychuk ",
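
Taken together, the series means callers of generate_response receive sanitized message content for free: cleaning happens in place before the provider ever sees the text, and PATCH 4/4 simply cuts the 0.5.0pre5 prerelease carrying the fix. A quick end-to-end demonstration using the test double from the new test module (the Message import path and constructor shape are assumptions based on this repo's layout, not shown in the patches):

    import asyncio

    from graphiti_core.llm_client.config import LLMConfig
    from graphiti_core.prompts.models import Message  # assumed location of Message
    from tests.llm_client.test_client import TestLLMClient

    client = TestLLMClient(LLMConfig())
    messages = [Message(role='user', content='Hello\x00\u200bWorld')]
    asyncio.run(client.generate_response(messages))
    print(messages[0].content)  # 'HelloWorld': cleaned in place before the call
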