diff --git a/LangChain.sln b/LangChain.sln index b6140db4..576b6b56 100644 --- a/LangChain.sln +++ b/LangChain.sln @@ -123,6 +123,12 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Databases.Kendra" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Providers.Anyscale", "src\libs\Providers\LangChain.Providers.Anyscale\LangChain.Providers.Anyscale.csproj", "{A4946307-A1B5-4F85-9C5A-31BDAE38D24E}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Sources.Pdf.IntegrationTests", "src\tests\LangChain.Sources.Pdf.IntegrationTests\LangChain.Sources.Pdf.IntegrationTests.csproj", "{324183A0-92F9-44C7-A919-AEC164C50EA6}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Sources.WebBase", "src\libs\Sources\LangChain.Sources.WebBase\LangChain.Sources.WebBase.csproj", "{01DC2D34-958F-4381-99AD-E91E3CEE31FD}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Sources.WebBase.IntegrationTests", "src\tests\LangChain.Sources.WebBase.IntegrationTests\LangChain.Sources.WebBase.IntegrationTests.csproj", "{454BA81E-861D-4908-B4D3-D1F2CDEF2C81}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -277,6 +283,18 @@ Global {A4946307-A1B5-4F85-9C5A-31BDAE38D24E}.Debug|Any CPU.Build.0 = Debug|Any CPU {A4946307-A1B5-4F85-9C5A-31BDAE38D24E}.Release|Any CPU.ActiveCfg = Release|Any CPU {A4946307-A1B5-4F85-9C5A-31BDAE38D24E}.Release|Any CPU.Build.0 = Release|Any CPU + {324183A0-92F9-44C7-A919-AEC164C50EA6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {324183A0-92F9-44C7-A919-AEC164C50EA6}.Debug|Any CPU.Build.0 = Debug|Any CPU + {324183A0-92F9-44C7-A919-AEC164C50EA6}.Release|Any CPU.ActiveCfg = Release|Any CPU + {324183A0-92F9-44C7-A919-AEC164C50EA6}.Release|Any CPU.Build.0 = Release|Any CPU + {01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Debug|Any CPU.Build.0 = Debug|Any CPU + {01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Release|Any CPU.ActiveCfg = Release|Any CPU + {01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Release|Any CPU.Build.0 = Release|Any CPU + {454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Debug|Any CPU.Build.0 = Debug|Any CPU + {454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Release|Any CPU.ActiveCfg = Release|Any CPU + {454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(NestedProjects) = preSolution {E55391DE-F8F3-4CC2-A0E3-2406C76E9C68} = {EB6F52EE-7E7E-4624-A99E-79D3F190F386} @@ -321,5 +339,8 @@ Global {233DF40E-6459-41F7-AEAE-C32398F474DF} = {F17A86AE-A174-4B6C-BAA7-9D9A9704BE85} {B456F59B-4DE6-4BC9-B83B-CD796985DD98} = {C58D122C-808F-43F9-BB23-4E517046F533} {A4946307-A1B5-4F85-9C5A-31BDAE38D24E} = {E55391DE-F8F3-4CC2-A0E3-2406C76E9C68} + {324183A0-92F9-44C7-A919-AEC164C50EA6} = {FDEE2E22-C239-4921-83B2-9797F765FD6A} + {01DC2D34-958F-4381-99AD-E91E3CEE31FD} = {7F35205F-1692-4702-AA88-3C29BBB121BC} + {454BA81E-861D-4908-B4D3-D1F2CDEF2C81} = {FDEE2E22-C239-4921-83B2-9797F765FD6A} EndGlobalSection EndGlobal diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props index aaf6440e..ebbbb1e3 100644 --- a/src/Directory.Packages.props +++ b/src/Directory.Packages.props @@ -3,6 +3,7 @@ true + @@ -29,6 +30,7 @@ + diff --git a/src/libs/Extensions/LangChain.Extensions.DependencyInjection/LangChain.Extensions.DependencyInjection.csproj b/src/libs/Extensions/LangChain.Extensions.DependencyInjection/LangChain.Extensions.DependencyInjection.csproj index c2c9cc9e..6229a661 100644 --- a/src/libs/Extensions/LangChain.Extensions.DependencyInjection/LangChain.Extensions.DependencyInjection.csproj +++ b/src/libs/Extensions/LangChain.Extensions.DependencyInjection/LangChain.Extensions.DependencyInjection.csproj @@ -9,6 +9,10 @@ $(PackageTags);di + + + + diff --git a/src/libs/Providers/LangChain.Providers.Anthropic/LangChain.Providers.Anthropic.csproj b/src/libs/Providers/LangChain.Providers.Anthropic/LangChain.Providers.Anthropic.csproj index d42e44d8..e10326d6 100644 --- a/src/libs/Providers/LangChain.Providers.Anthropic/LangChain.Providers.Anthropic.csproj +++ b/src/libs/Providers/LangChain.Providers.Anthropic/LangChain.Providers.Anthropic.csproj @@ -6,6 +6,7 @@ + diff --git a/src/libs/Providers/LangChain.Providers.Anyscale/LangChain.Providers.Anyscale.csproj b/src/libs/Providers/LangChain.Providers.Anyscale/LangChain.Providers.Anyscale.csproj index f16ae8cf..840d4d87 100644 --- a/src/libs/Providers/LangChain.Providers.Anyscale/LangChain.Providers.Anyscale.csproj +++ b/src/libs/Providers/LangChain.Providers.Anyscale/LangChain.Providers.Anyscale.csproj @@ -6,6 +6,7 @@ + diff --git a/src/libs/Providers/LangChain.Providers.Azure/LangChain.Providers.Azure.csproj b/src/libs/Providers/LangChain.Providers.Azure/LangChain.Providers.Azure.csproj index e779d4e0..4db4cacc 100644 --- a/src/libs/Providers/LangChain.Providers.Azure/LangChain.Providers.Azure.csproj +++ b/src/libs/Providers/LangChain.Providers.Azure/LangChain.Providers.Azure.csproj @@ -6,6 +6,7 @@ + diff --git a/src/libs/Providers/LangChain.Providers.HuggingFace/LangChain.Providers.HuggingFace.csproj b/src/libs/Providers/LangChain.Providers.HuggingFace/LangChain.Providers.HuggingFace.csproj index 36bb9f56..8fedfa00 100644 --- a/src/libs/Providers/LangChain.Providers.HuggingFace/LangChain.Providers.HuggingFace.csproj +++ b/src/libs/Providers/LangChain.Providers.HuggingFace/LangChain.Providers.HuggingFace.csproj @@ -6,6 +6,7 @@ + diff --git a/src/libs/Providers/LangChain.Providers.LeonardoAi/LangChain.Providers.LeonardoAi.csproj b/src/libs/Providers/LangChain.Providers.LeonardoAi/LangChain.Providers.LeonardoAi.csproj index 1c596623..3ab4ac93 100644 --- a/src/libs/Providers/LangChain.Providers.LeonardoAi/LangChain.Providers.LeonardoAi.csproj +++ b/src/libs/Providers/LangChain.Providers.LeonardoAi/LangChain.Providers.LeonardoAi.csproj @@ -6,6 +6,7 @@ + diff --git a/src/libs/Providers/LangChain.Providers.OpenAI/LangChain.Providers.OpenAI.csproj b/src/libs/Providers/LangChain.Providers.OpenAI/LangChain.Providers.OpenAI.csproj index 6abebbd2..1e23a936 100644 --- a/src/libs/Providers/LangChain.Providers.OpenAI/LangChain.Providers.OpenAI.csproj +++ b/src/libs/Providers/LangChain.Providers.OpenAI/LangChain.Providers.OpenAI.csproj @@ -6,6 +6,7 @@ + diff --git a/src/libs/Sources/Directory.Build.props b/src/libs/Sources/Directory.Build.props index f40b8516..f1b7f0b0 100644 --- a/src/libs/Sources/Directory.Build.props +++ b/src/libs/Sources/Directory.Build.props @@ -3,7 +3,7 @@ - LangChain.Sources + LangChain.DocumentLoaders diff --git a/src/libs/Sources/LangChain.Sources.Pdf/PdfSource.cs b/src/libs/Sources/LangChain.Sources.Pdf/AsposePdfSource.cs similarity index 95% rename from src/libs/Sources/LangChain.Sources.Pdf/PdfSource.cs rename to src/libs/Sources/LangChain.Sources.Pdf/AsposePdfSource.cs index abf360c4..bc8a4c19 100644 --- a/src/libs/Sources/LangChain.Sources.Pdf/PdfSource.cs +++ b/src/libs/Sources/LangChain.Sources.Pdf/AsposePdfSource.cs @@ -5,7 +5,7 @@ namespace LangChain.Sources; /// /// /// -public class PdfSource : ISource +public class AsposePdfSource : ISource { /// /// diff --git a/src/libs/Sources/LangChain.Sources.Pdf/LangChain.Sources.Pdf.csproj b/src/libs/Sources/LangChain.Sources.Pdf/LangChain.Sources.Pdf.csproj index fdb372d1..d57f7d9e 100644 --- a/src/libs/Sources/LangChain.Sources.Pdf/LangChain.Sources.Pdf.csproj +++ b/src/libs/Sources/LangChain.Sources.Pdf/LangChain.Sources.Pdf.csproj @@ -21,6 +21,7 @@ + diff --git a/src/libs/Sources/LangChain.Sources.Pdf/PdfPigPdfSource.cs b/src/libs/Sources/LangChain.Sources.Pdf/PdfPigPdfSource.cs new file mode 100644 index 00000000..94a77f86 --- /dev/null +++ b/src/libs/Sources/LangChain.Sources.Pdf/PdfPigPdfSource.cs @@ -0,0 +1,36 @@ +using UglyToad.PdfPig; + +namespace LangChain.Sources; + +/// +/// +/// +public class PdfPigPdfSource : ISource +{ + /// + /// + /// + public required string Path { get; init; } + + /// + public Task> LoadAsync(CancellationToken cancellationToken = default) + { + try + { + using PdfDocument document = PdfDocument.Open(Path, new ParsingOptions()); + var pages = document.GetPages(); + var content = String.Join("\n\n", pages.Select(page => page.Text)); + + var documents = (Document.Empty with + { + Content = content, + }).AsArray(); + + return Task.FromResult(documents); + } + catch (Exception exception) + { + return Task.FromException>(exception); + } + } +} \ No newline at end of file diff --git a/src/libs/Sources/LangChain.Sources.WebBase/LangChain.Sources.WebBase.csproj b/src/libs/Sources/LangChain.Sources.WebBase/LangChain.Sources.WebBase.csproj new file mode 100644 index 00000000..7af46131 --- /dev/null +++ b/src/libs/Sources/LangChain.Sources.WebBase/LangChain.Sources.WebBase.csproj @@ -0,0 +1,25 @@ + + + + net4.6.2;netstandard2.0;net6.0;net7.0 + $(NoWarn);CA1031 + + + + + + + + LangChain web document source. + $(PackageTags);pdf + + + + + + + + + + + diff --git a/src/libs/Sources/LangChain.Sources.WebBase/WebBaseSource.cs b/src/libs/Sources/LangChain.Sources.WebBase/WebBaseSource.cs new file mode 100644 index 00000000..4028db65 --- /dev/null +++ b/src/libs/Sources/LangChain.Sources.WebBase/WebBaseSource.cs @@ -0,0 +1,56 @@ +using AngleSharp; + +namespace LangChain.Sources; + +/// +/// +/// +public class WebBaseSource : ISource +{ + /// + /// + /// + public required string Url { get; init; } + + /// + public Task> LoadAsync(CancellationToken cancellationToken = default) + { + try + { + return LoadCoreAsync(Url); + } + catch (Exception exception) + { + return Task.FromException>(exception); + } + } + + protected async Task> LoadCoreAsync(string url) + { + var config = Configuration.Default.WithDefaultLoader(); + var context = BrowsingContext.New(config); + var document = await context.OpenAsync(url).ConfigureAwait(false); + + foreach (var element in document.QuerySelectorAll("script, style, meta, link")) + { + element.Remove(); + } + + string content; + var html = document.QuerySelector("html"); + + if (html == null) + { + throw new NotSupportedException("Not supported for pages without tag"); + } + + content = html.TextContent; + + var documents = (Document.Empty with + { + Content = content + }).AsArray(); + + return documents; + } +} \ No newline at end of file diff --git a/src/tests/LangChain.Sources.Pdf.IntegrationTests/LangChain.Sources.Pdf.IntegrationTests.csproj b/src/tests/LangChain.Sources.Pdf.IntegrationTests/LangChain.Sources.Pdf.IntegrationTests.csproj new file mode 100644 index 00000000..75b96205 --- /dev/null +++ b/src/tests/LangChain.Sources.Pdf.IntegrationTests/LangChain.Sources.Pdf.IntegrationTests.csproj @@ -0,0 +1,17 @@ + + + + net7.0 + + + + + + + + + PreserveNewest + + + + diff --git a/src/tests/LangChain.Sources.Pdf.IntegrationTests/PdfSourceTests.cs b/src/tests/LangChain.Sources.Pdf.IntegrationTests/PdfSourceTests.cs new file mode 100644 index 00000000..50166dd8 --- /dev/null +++ b/src/tests/LangChain.Sources.Pdf.IntegrationTests/PdfSourceTests.cs @@ -0,0 +1,25 @@ +namespace LangChain.Sources.Pdf.IntegrationTests; + +[TestClass] +public class PdfSourceTests +{ + [TestMethod] + public async Task PdfPig_CheckText() + { + var loader = new PdfPigPdfSource + { + Path = "sample.pdf" + }; + + var documents = await loader.LoadAsync(); + + documents.Should().NotBeEmpty(); + var first = documents.First(); + + // check text from page 1 + first.Content.Should().Contain("A Simple PDF File"); + + // check text from page 2 + first.Content.Should().Contain("Simple PDF File 2"); + } +} \ No newline at end of file diff --git a/src/tests/LangChain.Sources.Pdf.IntegrationTests/sample.pdf b/src/tests/LangChain.Sources.Pdf.IntegrationTests/sample.pdf new file mode 100644 index 00000000..dbf091df Binary files /dev/null and b/src/tests/LangChain.Sources.Pdf.IntegrationTests/sample.pdf differ diff --git a/src/tests/LangChain.Sources.WebBase.IntegrationTests/LangChain.Sources.WebBase.IntegrationTests.csproj b/src/tests/LangChain.Sources.WebBase.IntegrationTests/LangChain.Sources.WebBase.IntegrationTests.csproj new file mode 100644 index 00000000..9aec5ae2 --- /dev/null +++ b/src/tests/LangChain.Sources.WebBase.IntegrationTests/LangChain.Sources.WebBase.IntegrationTests.csproj @@ -0,0 +1,11 @@ + + + + net7.0 + + + + + + + diff --git a/src/tests/LangChain.Sources.WebBase.IntegrationTests/WebBaseSourceTests.cs b/src/tests/LangChain.Sources.WebBase.IntegrationTests/WebBaseSourceTests.cs new file mode 100644 index 00000000..6f02f7f0 --- /dev/null +++ b/src/tests/LangChain.Sources.WebBase.IntegrationTests/WebBaseSourceTests.cs @@ -0,0 +1,22 @@ +namespace LangChain.Sources.WebBase.IntegrationTests; + +[TestClass] +public class WebBaseSourceTests +{ + [TestMethod] + public async Task CheckText() + { + var loader = new WebBaseSource + { + Url = "https://en.wikipedia.org/wiki/Web_scraping" + }; + + var documents = await loader.LoadAsync(); + + documents.Should().NotBeEmpty(); + var first = documents.First(); + + first.Content.Should().Contain("Web scraping, web harvesting, or web data extraction is"); + first.Content.Should().Contain("This page was last edited on"); + } +} \ No newline at end of file