From f40ca8c9092c3c16262160b7064968e4ba10b378 Mon Sep 17 00:00:00 2001
From: paulpaliychuk
Date: Tue, 10 Dec 2024 19:37:11 -0500
Subject: [PATCH 1/4] fix: Clean input before passing it to the llm

---
 graphiti_core/llm_client/client.py        | 25 ++++++++++++++
 graphiti_core/llm_client/openai_client.py |  1 +
 tests/llm_client/test_client.py           | 41 +++++++++++++++++++++++
 3 files changed, 67 insertions(+)
 create mode 100644 tests/llm_client/test_client.py

diff --git a/graphiti_core/llm_client/client.py b/graphiti_core/llm_client/client.py
index 22cc3795..f2a19208 100644
--- a/graphiti_core/llm_client/client.py
+++ b/graphiti_core/llm_client/client.py
@@ -67,6 +67,28 @@ def __init__(self, config: LLMConfig | None, cache: bool = False):
         else None,
         reraise=True,
     )
+    def _clean_input(self, input: str) -> str:
+        """Clean input string of invalid unicode and control characters.
+
+        Args:
+            input: Raw input string to be cleaned
+
+        Returns:
+            Cleaned string safe for LLM processing
+        """
+        # Clean any invalid Unicode
+        cleaned = input.encode('utf-8', errors='ignore').decode('utf-8')
+
+        # Remove zero-width characters and other invisible unicode
+        zero_width = '\u200b\u200c\u200d\ufeff\u2060'
+        for char in zero_width:
+            cleaned = cleaned.replace(char, '')
+
+        # Remove control characters except newlines, returns, and tabs
+        cleaned = ''.join(char for char in cleaned if ord(char) >= 32 or char in '\n\r\t')
+
+        return cleaned
+
     async def _generate_response_with_retry(
         self, messages: list[Message], response_model: type[BaseModel] | None = None
     ) -> dict[str, typing.Any]:
@@ -106,6 +128,9 @@ async def generate_response(
             logger.debug(f'Cache hit for {cache_key}')
             return cached_response
 
+        for message in messages:
+            message.content = self._clean_input(message.content)
+
         response = await self._generate_response_with_retry(messages, response_model)
 
         if self.cache_enabled:
diff --git a/graphiti_core/llm_client/openai_client.py b/graphiti_core/llm_client/openai_client.py
index d8b02e8b..7804e06f 100644
--- a/graphiti_core/llm_client/openai_client.py
+++ b/graphiti_core/llm_client/openai_client.py
@@ -88,6 +88,7 @@ async def _generate_response(
     ) -> dict[str, typing.Any]:
         openai_messages: list[ChatCompletionMessageParam] = []
         for m in messages:
+            m.content = self._clean_input(m.content)
             if m.role == 'user':
                 openai_messages.append({'role': 'user', 'content': m.content})
             elif m.role == 'system':
diff --git a/tests/llm_client/test_client.py b/tests/llm_client/test_client.py
new file mode 100644
index 00000000..4a2fbd7c
--- /dev/null
+++ b/tests/llm_client/test_client.py
@@ -0,0 +1,41 @@
+from graphiti_core.llm_client.client import LLMClient
+from graphiti_core.llm_client.config import LLMConfig
+
+
+class TestLLMClient(LLMClient):
+    """Concrete implementation of LLMClient for testing"""
+
+    async def _generate_response(self, messages, response_model=None):
+        return {'content': 'test'}
+
+
+def test_clean_input():
+    client = TestLLMClient(LLMConfig())
+
+    test_cases = [
+        # Basic text should remain unchanged
+        ('Hello World', 'Hello World'),
+        # Control characters should be removed
+        ('Hello\x00World', 'HelloWorld'),
+        # Newlines, tabs, returns should be preserved
+        ('Hello\nWorld\tTest\r', 'Hello\nWorld\tTest\r'),
+        # Invalid Unicode should be removed
+        ('Hello\udcdeWorld', 'HelloWorld'),
+        # Zero-width characters should be removed
+        ('Hello\u200bWorld', 'HelloWorld'),
+        ('Test\ufeffWord', 'TestWord'),
+        # Multiple issues combined
+        ('Hello\x00\u200b\nWorld\udcde', 'Hello\nWorld'),
+        # Empty string should remain empty
+        ('', ''),
+        # Form feed and other control characters from the error case
+        ('{"edges":[{"relation_typ...\f\x04Hn\\?"}]}', '{"edges":[{"relation_typ...Hn\\?"}]}'),
+        # More specific control character tests
+        ('Hello\x0cWorld', 'HelloWorld'),  # form feed \f
+        ('Hello\x04World', 'HelloWorld'),  # end of transmission
+        # Combined JSON-like string with control characters
+        ('{"test": "value\f\x00\x04"}', '{"test": "value"}'),
+    ]
+
+    for input_str, expected in test_cases:
+        assert client._clean_input(input_str) == expected, f'Failed for input: {repr(input_str)}'
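
Note on PATCH 1/4: _clean_input works in three stages: a UTF-8 encode/decode round-trip with errors='ignore' that drops code points which cannot be encoded (such as the lone surrogate '\udcde' exercised in the tests), removal of a fixed set of zero-width characters, and a filter that discards remaining control characters while preserving '\n', '\r', and '\t'. A minimal standalone sketch of the same approach (the free function clean_input below is illustrative only, not part of the graphiti API):

    def clean_input(raw: str) -> str:
        # Round-trip through UTF-8; errors='ignore' drops unencodable
        # code points such as lone surrogates.
        cleaned = raw.encode('utf-8', errors='ignore').decode('utf-8')
        # Strip zero-width and other invisible characters.
        for ch in '\u200b\u200c\u200d\ufeff\u2060':
            cleaned = cleaned.replace(ch, '')
        # Drop control characters, keeping newlines, returns, and tabs.
        return ''.join(c for c in cleaned if ord(c) >= 32 or c in '\n\r\t')

    assert clean_input('Hello\x00\u200b\nWorld\udcde') == 'Hello\nWorld'
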
From 96e63778a98ed40f79e3a8eb0741ff43e72747b0 Mon Sep 17 00:00:00 2001
From: paulpaliychuk
Date: Tue, 10 Dec 2024 19:39:20 -0500
Subject: [PATCH 2/4] chore: Add license

---
 tests/llm_client/test_client.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/llm_client/test_client.py b/tests/llm_client/test_client.py
index 4a2fbd7c..2716395d 100644
--- a/tests/llm_client/test_client.py
+++ b/tests/llm_client/test_client.py
@@ -1,3 +1,19 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 from graphiti_core.llm_client.client import LLMClient
 from graphiti_core.llm_client.config import LLMConfig
 

From d8fa84e0e94e6e5a353747d2aafaeab4da0f7bc9 Mon Sep 17 00:00:00 2001
From: paulpaliychuk
Date: Tue, 10 Dec 2024 19:41:51 -0500
Subject: [PATCH 3/4] fix: typo

---
 graphiti_core/llm_client/client.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/graphiti_core/llm_client/client.py b/graphiti_core/llm_client/client.py
index f2a19208..ac01bed7 100644
--- a/graphiti_core/llm_client/client.py
+++ b/graphiti_core/llm_client/client.py
@@ -56,17 +56,7 @@ def __init__(self, config: LLMConfig | None, cache: bool = False):
         self.cache_enabled = cache
         self.cache_dir = Cache(DEFAULT_CACHE_DIR)  # Create a cache directory
 
-    @retry(
-        stop=stop_after_attempt(4),
-        wait=wait_random_exponential(multiplier=10, min=5, max=120),
-        retry=retry_if_exception(is_server_or_retry_error),
-        after=lambda retry_state: logger.warning(
-            f'Retrying {retry_state.fn.__name__ if retry_state.fn else "function"} after {retry_state.attempt_number} attempts...'
-        )
-        if retry_state.attempt_number > 1
-        else None,
-        reraise=True,
-    )
+
     def _clean_input(self, input: str) -> str:
         """Clean input string of invalid unicode and control characters.
 
@@ -89,6 +79,17 @@ def _clean_input(self, input: str) -> str:
 
         return cleaned
 
+    @retry(
+        stop=stop_after_attempt(4),
+        wait=wait_random_exponential(multiplier=10, min=5, max=120),
+        retry=retry_if_exception(is_server_or_retry_error),
+        after=lambda retry_state: logger.warning(
+            f'Retrying {retry_state.fn.__name__ if retry_state.fn else "function"} after {retry_state.attempt_number} attempts...'
+        )
+        if retry_state.attempt_number > 1
+        else None,
+        reraise=True,
+    )
     async def _generate_response_with_retry(
         self, messages: list[Message], response_model: type[BaseModel] | None = None
     ) -> dict[str, typing.Any]:
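
Note on PATCH 3/4: the "typo" being fixed is a decorator binding bug. PATCH 1/4 inserted _clean_input between the @retry(...) decorator and _generate_response_with_retry, and a Python decorator attaches to whichever def immediately follows it, so the retry policy ended up wrapping _clean_input while the network call lost its retries. The hunks above move the decorator back onto _generate_response_with_retry. A minimal sketch of the failure mode (all names here are illustrative):

    def logged(fn):
        def wrapper(*args, **kwargs):
            print(f'calling {fn.__name__}')
            return fn(*args, **kwargs)
        return wrapper

    @logged
    def helper():  # the decorator binds here, to the nearest def below it
        return 1

    def network_call():  # this function is left undecorated
        return 2

    helper()        # prints 'calling helper'
    network_call()  # prints nothing: the wrapper (logging, or retry) is gone
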
From e852493a0d1e5b7d405515e63461d0e8290f778e Mon Sep 17 00:00:00 2001
From: paulpaliychuk
Date: Tue, 10 Dec 2024 21:23:57 -0500
Subject: [PATCH 4/4] chore: Bump graphiti version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f67fc081..88e2b7ec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "graphiti-core"
-version = "0.5.0pre4"
+version = "0.5.0pre5"
 description = "A temporal graph building library"
 authors = [
     "Paul Paliychuk ",
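
Taken together, the series means callers of generate_response receive sanitized message content for free: cleaning happens in place before the provider ever sees the text, and PATCH 4/4 simply cuts the 0.5.0pre5 prerelease carrying the fix. A quick end-to-end demonstration using the test double from the new test module (the Message import path and constructor shape are assumptions based on this repo's layout, not shown in the patches):

    import asyncio

    from graphiti_core.llm_client.config import LLMConfig
    from graphiti_core.prompts.models import Message  # assumed location of Message
    from tests.llm_client.test_client import TestLLMClient

    client = TestLLMClient(LLMConfig())
    messages = [Message(role='user', content='Hello\x00\u200bWorld')]
    asyncio.run(client.generate_response(messages))
    print(messages[0].content)  # 'HelloWorld': cleaned in place before the call
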