diff --git a/LangChain.sln b/LangChain.sln
index b6140db4..576b6b56 100644
--- a/LangChain.sln
+++ b/LangChain.sln
@@ -123,6 +123,12 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Databases.Kendra"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Providers.Anyscale", "src\libs\Providers\LangChain.Providers.Anyscale\LangChain.Providers.Anyscale.csproj", "{A4946307-A1B5-4F85-9C5A-31BDAE38D24E}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Sources.Pdf.IntegrationTests", "src\tests\LangChain.Sources.Pdf.IntegrationTests\LangChain.Sources.Pdf.IntegrationTests.csproj", "{324183A0-92F9-44C7-A919-AEC164C50EA6}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Sources.WebBase", "src\libs\Sources\LangChain.Sources.WebBase\LangChain.Sources.WebBase.csproj", "{01DC2D34-958F-4381-99AD-E91E3CEE31FD}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Sources.WebBase.IntegrationTests", "src\tests\LangChain.Sources.WebBase.IntegrationTests\LangChain.Sources.WebBase.IntegrationTests.csproj", "{454BA81E-861D-4908-B4D3-D1F2CDEF2C81}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -277,6 +283,18 @@ Global
{A4946307-A1B5-4F85-9C5A-31BDAE38D24E}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A4946307-A1B5-4F85-9C5A-31BDAE38D24E}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A4946307-A1B5-4F85-9C5A-31BDAE38D24E}.Release|Any CPU.Build.0 = Release|Any CPU
+ {324183A0-92F9-44C7-A919-AEC164C50EA6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {324183A0-92F9-44C7-A919-AEC164C50EA6}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {324183A0-92F9-44C7-A919-AEC164C50EA6}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {324183A0-92F9-44C7-A919-AEC164C50EA6}.Release|Any CPU.Build.0 = Release|Any CPU
+ {01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Release|Any CPU.Build.0 = Release|Any CPU
+ {454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{E55391DE-F8F3-4CC2-A0E3-2406C76E9C68} = {EB6F52EE-7E7E-4624-A99E-79D3F190F386}
@@ -321,5 +339,8 @@ Global
{233DF40E-6459-41F7-AEAE-C32398F474DF} = {F17A86AE-A174-4B6C-BAA7-9D9A9704BE85}
{B456F59B-4DE6-4BC9-B83B-CD796985DD98} = {C58D122C-808F-43F9-BB23-4E517046F533}
{A4946307-A1B5-4F85-9C5A-31BDAE38D24E} = {E55391DE-F8F3-4CC2-A0E3-2406C76E9C68}
+ {324183A0-92F9-44C7-A919-AEC164C50EA6} = {FDEE2E22-C239-4921-83B2-9797F765FD6A}
+ {01DC2D34-958F-4381-99AD-E91E3CEE31FD} = {7F35205F-1692-4702-AA88-3C29BBB121BC}
+ {454BA81E-861D-4908-B4D3-D1F2CDEF2C81} = {FDEE2E22-C239-4921-83B2-9797F765FD6A}
EndGlobalSection
EndGlobal
diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props
index aaf6440e..ebbbb1e3 100644
--- a/src/Directory.Packages.props
+++ b/src/Directory.Packages.props
@@ -3,6 +3,7 @@
true
+
@@ -29,6 +30,7 @@
+
diff --git a/src/libs/Extensions/LangChain.Extensions.DependencyInjection/LangChain.Extensions.DependencyInjection.csproj b/src/libs/Extensions/LangChain.Extensions.DependencyInjection/LangChain.Extensions.DependencyInjection.csproj
index c2c9cc9e..6229a661 100644
--- a/src/libs/Extensions/LangChain.Extensions.DependencyInjection/LangChain.Extensions.DependencyInjection.csproj
+++ b/src/libs/Extensions/LangChain.Extensions.DependencyInjection/LangChain.Extensions.DependencyInjection.csproj
@@ -9,6 +9,10 @@
$(PackageTags);di
+
+
+
+
diff --git a/src/libs/Providers/LangChain.Providers.Anthropic/LangChain.Providers.Anthropic.csproj b/src/libs/Providers/LangChain.Providers.Anthropic/LangChain.Providers.Anthropic.csproj
index d42e44d8..e10326d6 100644
--- a/src/libs/Providers/LangChain.Providers.Anthropic/LangChain.Providers.Anthropic.csproj
+++ b/src/libs/Providers/LangChain.Providers.Anthropic/LangChain.Providers.Anthropic.csproj
@@ -6,6 +6,7 @@
+
diff --git a/src/libs/Providers/LangChain.Providers.Anyscale/LangChain.Providers.Anyscale.csproj b/src/libs/Providers/LangChain.Providers.Anyscale/LangChain.Providers.Anyscale.csproj
index f16ae8cf..840d4d87 100644
--- a/src/libs/Providers/LangChain.Providers.Anyscale/LangChain.Providers.Anyscale.csproj
+++ b/src/libs/Providers/LangChain.Providers.Anyscale/LangChain.Providers.Anyscale.csproj
@@ -6,6 +6,7 @@
+
diff --git a/src/libs/Providers/LangChain.Providers.Azure/LangChain.Providers.Azure.csproj b/src/libs/Providers/LangChain.Providers.Azure/LangChain.Providers.Azure.csproj
index e779d4e0..4db4cacc 100644
--- a/src/libs/Providers/LangChain.Providers.Azure/LangChain.Providers.Azure.csproj
+++ b/src/libs/Providers/LangChain.Providers.Azure/LangChain.Providers.Azure.csproj
@@ -6,6 +6,7 @@
+
diff --git a/src/libs/Providers/LangChain.Providers.HuggingFace/LangChain.Providers.HuggingFace.csproj b/src/libs/Providers/LangChain.Providers.HuggingFace/LangChain.Providers.HuggingFace.csproj
index 36bb9f56..8fedfa00 100644
--- a/src/libs/Providers/LangChain.Providers.HuggingFace/LangChain.Providers.HuggingFace.csproj
+++ b/src/libs/Providers/LangChain.Providers.HuggingFace/LangChain.Providers.HuggingFace.csproj
@@ -6,6 +6,7 @@
+
diff --git a/src/libs/Providers/LangChain.Providers.LeonardoAi/LangChain.Providers.LeonardoAi.csproj b/src/libs/Providers/LangChain.Providers.LeonardoAi/LangChain.Providers.LeonardoAi.csproj
index 1c596623..3ab4ac93 100644
--- a/src/libs/Providers/LangChain.Providers.LeonardoAi/LangChain.Providers.LeonardoAi.csproj
+++ b/src/libs/Providers/LangChain.Providers.LeonardoAi/LangChain.Providers.LeonardoAi.csproj
@@ -6,6 +6,7 @@
+
diff --git a/src/libs/Providers/LangChain.Providers.OpenAI/LangChain.Providers.OpenAI.csproj b/src/libs/Providers/LangChain.Providers.OpenAI/LangChain.Providers.OpenAI.csproj
index 6abebbd2..1e23a936 100644
--- a/src/libs/Providers/LangChain.Providers.OpenAI/LangChain.Providers.OpenAI.csproj
+++ b/src/libs/Providers/LangChain.Providers.OpenAI/LangChain.Providers.OpenAI.csproj
@@ -6,6 +6,7 @@
+
diff --git a/src/libs/Sources/Directory.Build.props b/src/libs/Sources/Directory.Build.props
index f40b8516..f1b7f0b0 100644
--- a/src/libs/Sources/Directory.Build.props
+++ b/src/libs/Sources/Directory.Build.props
@@ -3,7 +3,7 @@
- LangChain.Sources
+ LangChain.DocumentLoaders
diff --git a/src/libs/Sources/LangChain.Sources.Pdf/PdfSource.cs b/src/libs/Sources/LangChain.Sources.Pdf/AsposePdfSource.cs
similarity index 95%
rename from src/libs/Sources/LangChain.Sources.Pdf/PdfSource.cs
rename to src/libs/Sources/LangChain.Sources.Pdf/AsposePdfSource.cs
index abf360c4..bc8a4c19 100644
--- a/src/libs/Sources/LangChain.Sources.Pdf/PdfSource.cs
+++ b/src/libs/Sources/LangChain.Sources.Pdf/AsposePdfSource.cs
@@ -5,7 +5,7 @@ namespace LangChain.Sources;
///
///
///
-public class PdfSource : ISource
+public class AsposePdfSource : ISource
{
///
///
diff --git a/src/libs/Sources/LangChain.Sources.Pdf/LangChain.Sources.Pdf.csproj b/src/libs/Sources/LangChain.Sources.Pdf/LangChain.Sources.Pdf.csproj
index fdb372d1..d57f7d9e 100644
--- a/src/libs/Sources/LangChain.Sources.Pdf/LangChain.Sources.Pdf.csproj
+++ b/src/libs/Sources/LangChain.Sources.Pdf/LangChain.Sources.Pdf.csproj
@@ -21,6 +21,7 @@
+
diff --git a/src/libs/Sources/LangChain.Sources.Pdf/PdfPigPdfSource.cs b/src/libs/Sources/LangChain.Sources.Pdf/PdfPigPdfSource.cs
new file mode 100644
index 00000000..94a77f86
--- /dev/null
+++ b/src/libs/Sources/LangChain.Sources.Pdf/PdfPigPdfSource.cs
@@ -0,0 +1,36 @@
+using UglyToad.PdfPig;
+
+namespace LangChain.Sources;
+
+/// <summary>
+/// Document source that reads a PDF file with PdfPig and exposes the text of all its pages as a single document.
+/// </summary>
+public class PdfPigPdfSource : ISource
+{
+ /// <summary>
+ /// Path of the PDF file to open; it is a required, init-only property, so it must be set when the source is constructed.
+ /// </summary>
+ public required string Path { get; init; }
+
+ /// <summary>Opens <see cref="Path"/>, joins the text of every page with a blank line between pages and returns it as one document; any exception is returned as a faulted task.</summary>
+ public Task> LoadAsync(CancellationToken cancellationToken = default) // NOTE(review): cancellationToken is not used anywhere in this body
+ {
+ try
+ {
+ using PdfDocument document = PdfDocument.Open(Path, new ParsingOptions());
+ var pages = document.GetPages();
+ var content = String.Join("\n\n", pages.Select(page => page.Text));
+ // Copy of Document.Empty carrying the joined page text, wrapped via AsArray().
+ var documents = (Document.Empty with
+ {
+ Content = content,
+ }).AsArray();
+
+ return Task.FromResult(documents); // the work above is synchronous, so an already-completed task is returned
+ }
+ catch (Exception exception)
+ {
+ return Task.FromException>(exception); // report the failure through the task instead of throwing to the caller
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/libs/Sources/LangChain.Sources.WebBase/LangChain.Sources.WebBase.csproj b/src/libs/Sources/LangChain.Sources.WebBase/LangChain.Sources.WebBase.csproj
new file mode 100644
index 00000000..7af46131
--- /dev/null
+++ b/src/libs/Sources/LangChain.Sources.WebBase/LangChain.Sources.WebBase.csproj
@@ -0,0 +1,25 @@
+
+
+
+ net4.6.2;netstandard2.0;net6.0;net7.0
+ $(NoWarn);CA1031
+
+
+
+
+
+
+
+ LangChain web document source.
+ $(PackageTags);web
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/libs/Sources/LangChain.Sources.WebBase/WebBaseSource.cs b/src/libs/Sources/LangChain.Sources.WebBase/WebBaseSource.cs
new file mode 100644
index 00000000..4028db65
--- /dev/null
+++ b/src/libs/Sources/LangChain.Sources.WebBase/WebBaseSource.cs
@@ -0,0 +1,56 @@
+using AngleSharp;
+
+namespace LangChain.Sources;
+
+/// <summary>
+/// Document source that opens a web address with an AngleSharp browsing context and exposes the page's text content as a single document.
+/// </summary>
+public class WebBaseSource : ISource
+{
+ /// <summary>
+ /// Address of the page to load; it is a required, init-only property, so it must be set when the source is constructed.
+ /// </summary>
+ public required string Url { get; init; }
+
+ /// <summary>Loads the page at <see cref="Url"/> by delegating to <see cref="LoadCoreAsync"/>.</summary>
+ public Task> LoadAsync(CancellationToken cancellationToken = default) // NOTE(review): cancellationToken is not forwarded to LoadCoreAsync
+ {
+ try
+ {
+ return LoadCoreAsync(Url);
+ }
+ catch (Exception exception) // NOTE(review): LoadCoreAsync is async, so its failures normally arrive via the returned task rather than here
+ {
+ return Task.FromException>(exception);
+ }
+ }
+ /// <summary>Opens <paramref name="url"/>, removes script, style, meta and link elements, and returns the text content of the html element as one document; fails with NotSupportedException when no html element is found.</summary>
+ protected async Task> LoadCoreAsync(string url)
+ {
+ var config = Configuration.Default.WithDefaultLoader();
+ var context = BrowsingContext.New(config);
+ var document = await context.OpenAsync(url).ConfigureAwait(false);
+ // Remove script, style, meta and link elements before the text is read, so they do not contribute to it.
+ foreach (var element in document.QuerySelectorAll("script, style, meta, link"))
+ {
+ element.Remove();
+ }
+
+ string content;
+ var html = document.QuerySelector("html");
+
+ if (html == null)
+ {
+ throw new NotSupportedException("Not supported for pages without tag"); // NOTE(review): message seems to be missing the tag name (presumably "html") — confirm against the real file
+ }
+
+ content = html.TextContent;
+
+ var documents = (Document.Empty with
+ {
+ Content = content
+ }).AsArray();
+
+ return documents;
+ }
+}
\ No newline at end of file
diff --git a/src/tests/LangChain.Sources.Pdf.IntegrationTests/LangChain.Sources.Pdf.IntegrationTests.csproj b/src/tests/LangChain.Sources.Pdf.IntegrationTests/LangChain.Sources.Pdf.IntegrationTests.csproj
new file mode 100644
index 00000000..75b96205
--- /dev/null
+++ b/src/tests/LangChain.Sources.Pdf.IntegrationTests/LangChain.Sources.Pdf.IntegrationTests.csproj
@@ -0,0 +1,17 @@
+
+
+
+ net7.0
+
+
+
+
+
+
+
+
+ PreserveNewest
+
+
+
+
diff --git a/src/tests/LangChain.Sources.Pdf.IntegrationTests/PdfSourceTests.cs b/src/tests/LangChain.Sources.Pdf.IntegrationTests/PdfSourceTests.cs
new file mode 100644
index 00000000..50166dd8
--- /dev/null
+++ b/src/tests/LangChain.Sources.Pdf.IntegrationTests/PdfSourceTests.cs
@@ -0,0 +1,25 @@
+namespace LangChain.Sources.Pdf.IntegrationTests;
+/// <summary>Integration test that loads sample.pdf through PdfPigPdfSource and checks the extracted text.</summary>
+[TestClass]
+public class PdfSourceTests
+{
+ [TestMethod]
+ public async Task PdfPig_CheckText()
+ {
+ var loader = new PdfPigPdfSource
+ {
+ Path = "sample.pdf" // relative path; presumably resolved against the test output directory — confirm in the project file
+ };
+
+ var documents = await loader.LoadAsync();
+
+ documents.Should().NotBeEmpty();
+ var first = documents.First();
+
+ // Text expected from page 1; both pages are checked on the same document because the loader joins all pages into one.
+ first.Content.Should().Contain("A Simple PDF File");
+
+ // Text expected from page 2.
+ first.Content.Should().Contain("Simple PDF File 2");
+ }
+}
\ No newline at end of file
diff --git a/src/tests/LangChain.Sources.Pdf.IntegrationTests/sample.pdf b/src/tests/LangChain.Sources.Pdf.IntegrationTests/sample.pdf
new file mode 100644
index 00000000..dbf091df
Binary files /dev/null and b/src/tests/LangChain.Sources.Pdf.IntegrationTests/sample.pdf differ
diff --git a/src/tests/LangChain.Sources.WebBase.IntegrationTests/LangChain.Sources.WebBase.IntegrationTests.csproj b/src/tests/LangChain.Sources.WebBase.IntegrationTests/LangChain.Sources.WebBase.IntegrationTests.csproj
new file mode 100644
index 00000000..9aec5ae2
--- /dev/null
+++ b/src/tests/LangChain.Sources.WebBase.IntegrationTests/LangChain.Sources.WebBase.IntegrationTests.csproj
@@ -0,0 +1,11 @@
+
+
+
+ net7.0
+
+
+
+
+
+
+
diff --git a/src/tests/LangChain.Sources.WebBase.IntegrationTests/WebBaseSourceTests.cs b/src/tests/LangChain.Sources.WebBase.IntegrationTests/WebBaseSourceTests.cs
new file mode 100644
index 00000000..6f02f7f0
--- /dev/null
+++ b/src/tests/LangChain.Sources.WebBase.IntegrationTests/WebBaseSourceTests.cs
@@ -0,0 +1,22 @@
+namespace LangChain.Sources.WebBase.IntegrationTests;
+/// <summary>Integration test that loads a Wikipedia article through WebBaseSource and checks the extracted text.</summary>
+[TestClass]
+public class WebBaseSourceTests
+{
+ [TestMethod]
+ public async Task CheckText()
+ {
+ var loader = new WebBaseSource
+ {
+ Url = "https://en.wikipedia.org/wiki/Web_scraping"
+ };
+ // NOTE(review): this requests the live page, so the test needs network access and depends on the page's current wording.
+ var documents = await loader.LoadAsync();
+
+ documents.Should().NotBeEmpty();
+ var first = documents.First();
+ // Presumably the article's opening sentence and a line from the page footer — confirm against the live page.
+ first.Content.Should().Contain("Web scraping, web harvesting, or web data extraction is");
+ first.Content.Should().Contain("This page was last edited on");
+ }
+}
\ No newline at end of file