Skip to content

Commit

Permalink
feat: Added Web and PDF loaders (#39)
Browse files Browse the repository at this point in the history
* WebBaseLoader & open-source PDF lib

---------

Co-authored-by: Evgenii Khoroshev <[email protected]>
  • Loading branch information
khoroshevj and Evgenii Khoroshev authored Nov 1, 2023
1 parent 8a50d69 commit e3ac3ca
Show file tree
Hide file tree
Showing 20 changed files with 228 additions and 2 deletions.
21 changes: 21 additions & 0 deletions LangChain.sln
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Databases.Kendra"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Providers.Anyscale", "src\libs\Providers\LangChain.Providers.Anyscale\LangChain.Providers.Anyscale.csproj", "{A4946307-A1B5-4F85-9C5A-31BDAE38D24E}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Sources.Pdf.IntegrationTests", "src\tests\LangChain.Sources.Pdf.IntegrationTests\LangChain.Sources.Pdf.IntegrationTests.csproj", "{324183A0-92F9-44C7-A919-AEC164C50EA6}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Sources.WebBase", "src\libs\Sources\LangChain.Sources.WebBase\LangChain.Sources.WebBase.csproj", "{01DC2D34-958F-4381-99AD-E91E3CEE31FD}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LangChain.Sources.WebBase.IntegrationTests", "src\tests\LangChain.Sources.WebBase.IntegrationTests\LangChain.Sources.WebBase.IntegrationTests.csproj", "{454BA81E-861D-4908-B4D3-D1F2CDEF2C81}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -277,6 +283,18 @@ Global
{A4946307-A1B5-4F85-9C5A-31BDAE38D24E}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A4946307-A1B5-4F85-9C5A-31BDAE38D24E}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A4946307-A1B5-4F85-9C5A-31BDAE38D24E}.Release|Any CPU.Build.0 = Release|Any CPU
{324183A0-92F9-44C7-A919-AEC164C50EA6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{324183A0-92F9-44C7-A919-AEC164C50EA6}.Debug|Any CPU.Build.0 = Debug|Any CPU
{324183A0-92F9-44C7-A919-AEC164C50EA6}.Release|Any CPU.ActiveCfg = Release|Any CPU
{324183A0-92F9-44C7-A919-AEC164C50EA6}.Release|Any CPU.Build.0 = Release|Any CPU
{01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Debug|Any CPU.Build.0 = Debug|Any CPU
{01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Release|Any CPU.ActiveCfg = Release|Any CPU
{01DC2D34-958F-4381-99AD-E91E3CEE31FD}.Release|Any CPU.Build.0 = Release|Any CPU
{454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Debug|Any CPU.Build.0 = Debug|Any CPU
{454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Release|Any CPU.ActiveCfg = Release|Any CPU
{454BA81E-861D-4908-B4D3-D1F2CDEF2C81}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{E55391DE-F8F3-4CC2-A0E3-2406C76E9C68} = {EB6F52EE-7E7E-4624-A99E-79D3F190F386}
Expand Down Expand Up @@ -321,5 +339,8 @@ Global
{233DF40E-6459-41F7-AEAE-C32398F474DF} = {F17A86AE-A174-4B6C-BAA7-9D9A9704BE85}
{B456F59B-4DE6-4BC9-B83B-CD796985DD98} = {C58D122C-808F-43F9-BB23-4E517046F533}
{A4946307-A1B5-4F85-9C5A-31BDAE38D24E} = {E55391DE-F8F3-4CC2-A0E3-2406C76E9C68}
{324183A0-92F9-44C7-A919-AEC164C50EA6} = {FDEE2E22-C239-4921-83B2-9797F765FD6A}
{01DC2D34-958F-4381-99AD-E91E3CEE31FD} = {7F35205F-1692-4702-AA88-3C29BBB121BC}
{454BA81E-861D-4908-B4D3-D1F2CDEF2C81} = {FDEE2E22-C239-4921-83B2-9797F765FD6A}
EndGlobalSection
EndGlobal
2 changes: 2 additions & 0 deletions src/Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
</PropertyGroup>
<ItemGroup>
<PackageVersion Include="AngleSharp" Version="1.0.5" />
<PackageVersion Include="Anthropic" Version="0.3.0" />
<PackageVersion Include="Anyscale" Version="1.0.2" />
<PackageVersion Include="Aspose.PDF" Version="23.10.0" />
Expand All @@ -29,6 +30,7 @@
<PackageVersion Include="Moq" Version="4.20.69" />
<PackageVersion Include="MSTest.TestAdapter" Version="3.1.1" />
<PackageVersion Include="MSTest.TestFramework" Version="3.1.1" />
<PackageVersion Include="PdfPig" Version="0.1.9-alpha-20231029-17d50" />
<PackageVersion Include="PolySharp" Version="1.13.2" />
<PackageVersion Include="System.Text.Json" Version="7.0.3" />
<PackageVersion Include="Tiktoken" Version="1.1.3" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
<PackageTags>$(PackageTags);di</PackageTags>
</PropertyGroup>

<ItemGroup Label="Usings">
<Using Include="System.Net.Http" />
</ItemGroup>

<ItemGroup>
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Http" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

<ItemGroup Label="Usings">
<Using Include="Anthropic" />
<Using Include="System.Net.Http" />
</ItemGroup>

<PropertyGroup Label="NuGet">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

<ItemGroup Label="Usings">
<Using Include="Anyscale" />
<Using Include="System.Net.Http" />
</ItemGroup>

<PropertyGroup Label="NuGet">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

<ItemGroup Label="Usings">
<Using Include="tryAGI.OpenAI" />
<Using Include="System.Net.Http" />
</ItemGroup>

<PropertyGroup Label="NuGet">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

<ItemGroup Label="Usings">
<Using Include="HuggingFace" />
<Using Include="System.Net.Http" />
</ItemGroup>

<PropertyGroup Label="NuGet">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

<ItemGroup Label="Usings">
<Using Include="LeonardoAi" />
<Using Include="System.Net.Http" />
</ItemGroup>

<PropertyGroup Label="NuGet">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

<ItemGroup Label="Usings">
<Using Include="tryAGI.OpenAI" />
<Using Include="System.Net.Http" />
</ItemGroup>

<PropertyGroup Label="NuGet">
Expand Down
2 changes: 1 addition & 1 deletion src/libs/Sources/Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<Import Project="..\Directory.Build.props" />

<PropertyGroup>
<RootNamespace>LangChain.Sources</RootNamespace>
<RootNamespace>LangChain.DocumentLoaders</RootNamespace>
</PropertyGroup>

<PropertyGroup Label="NuGet">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ namespace LangChain.Sources;
/// <summary>
///
/// </summary>
public class PdfSource : ISource
public class AsposePdfSource : ISource
{
/// <summary>
///
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

<ItemGroup>
<PackageReference Include="Aspose.PDF" />
<PackageReference Include="PdfPig" />
</ItemGroup>

</Project>
36 changes: 36 additions & 0 deletions src/libs/Sources/LangChain.Sources.Pdf/PdfPigPdfSource.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using UglyToad.PdfPig;

namespace LangChain.Sources;

/// <summary>
/// Document source that extracts the text of a PDF file using the PdfPig library.
/// All pages are combined into a single <see cref="Document"/>.
/// </summary>
public class PdfPigPdfSource : ISource
{
    /// <summary>
    /// Path of the PDF file to open.
    /// </summary>
    public required string Path { get; init; }

    /// <summary>
    /// Opens the PDF at <see cref="Path"/> and returns one document whose content is
    /// the text of every page, with pages separated by a blank line.
    /// </summary>
    /// <param name="cancellationToken">
    /// Checked once before the file is opened; the parsing itself is synchronous
    /// and is not interrupted once started.
    /// </param>
    /// <returns>
    /// A completed task holding the loaded document, a canceled task when
    /// cancellation was already requested, or a faulted task carrying any
    /// exception raised while opening or reading the file.
    /// </returns>
    public Task<IReadOnlyCollection<Document>> LoadAsync(CancellationToken cancellationToken = default)
    {
        // Honor cancellation before doing any file I/O.
        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled<IReadOnlyCollection<Document>>(cancellationToken);
        }

        try
        {
            using PdfDocument document = PdfDocument.Open(Path, new ParsingOptions());
            var pages = document.GetPages();
            var content = string.Join("\n\n", pages.Select(page => page.Text));

            var documents = (Document.Empty with
            {
                Content = content,
            }).AsArray();

            return Task.FromResult(documents);
        }
        catch (Exception exception)
        {
            // Surface failures through the returned task rather than throwing
            // synchronously from a Task-returning method.
            return Task.FromException<IReadOnlyCollection<Document>>(exception);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFrameworks>net4.6.2;netstandard2.0;net6.0;net7.0</TargetFrameworks>
    <!-- CA1031 (do not catch general exception types) is suppressed for this project. -->
    <NoWarn>$(NoWarn);CA1031</NoWarn>
  </PropertyGroup>

  <ItemGroup Label="Usings">
    <Using Remove="System.Net.Http" />
  </ItemGroup>

  <PropertyGroup Label="NuGet">
    <Description>LangChain web document source.</Description>
    <!-- Tag describes the web source; the previous "pdf" tag was a copy-paste leftover. -->
    <PackageTags>$(PackageTags);web</PackageTags>
  </PropertyGroup>

  <ItemGroup>
    <ProjectReference Include="..\LangChain.Sources.Abstractions\LangChain.Sources.Abstractions.csproj" />
  </ItemGroup>

  <ItemGroup>
    <!-- AngleSharp is used to download and parse the HTML page. -->
    <PackageReference Include="AngleSharp" />
  </ItemGroup>

</Project>
56 changes: 56 additions & 0 deletions src/libs/Sources/LangChain.Sources.WebBase/WebBaseSource.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
using AngleSharp;

namespace LangChain.Sources;

/// <summary>
/// Document source that downloads a web page with AngleSharp and returns its
/// text content as a single <see cref="Document"/>.
/// </summary>
public class WebBaseSource : ISource
{
    /// <summary>
    /// Address of the page to load.
    /// </summary>
    public required string Url { get; init; }

    /// <inheritdoc/>
    public Task<IReadOnlyCollection<Document>> LoadAsync(CancellationToken cancellationToken = default)
    {
        // LoadCoreAsync is an async method, so any failure is already delivered
        // through the returned task; no synchronous try/catch is needed here.
        return LoadCoreAsync(Url, cancellationToken);
    }

    /// <summary>
    /// Opens <paramref name="url"/>, strips <c>script</c>, <c>style</c>, <c>meta</c>
    /// and <c>link</c> elements, and returns the text content of the
    /// <c>html</c> element as one document.
    /// </summary>
    /// <param name="url">Address of the page to open.</param>
    /// <param name="cancellationToken">Token forwarded to the page request.</param>
    /// <returns>A collection containing the single loaded document.</returns>
    /// <exception cref="NotSupportedException">
    /// The loaded page has no <c>html</c> element.
    /// </exception>
    protected async Task<IReadOnlyCollection<Document>> LoadCoreAsync(string url, CancellationToken cancellationToken = default)
    {
        var config = Configuration.Default.WithDefaultLoader();

        // The browsing context owns the loaded page; release it once the text is extracted.
        using var context = BrowsingContext.New(config);
        var document = await context.OpenAsync(url, cancellationToken).ConfigureAwait(false);

        // Drop elements that carry no readable text before extracting content.
        foreach (var element in document.QuerySelectorAll("script, style, meta, link"))
        {
            element.Remove();
        }

        var html = document.QuerySelector("html");

        if (html is null)
        {
            throw new NotSupportedException("Not supported for pages without <html> tag");
        }

        var content = html.TextContent;

        var documents = (Document.Empty with
        {
            Content = content
        }).AsArray();

        return documents;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Integration test project for the LangChain.Sources.Pdf library. -->
<!-- NOTE(review): no test-framework packages are referenced here; presumably they come from a shared props file - verify. -->
<PropertyGroup>
<TargetFramework>net7.0</TargetFramework>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\libs\Sources\LangChain.Sources.Pdf\LangChain.Sources.Pdf.csproj" />
</ItemGroup>

<!-- Copy the sample PDF next to the test binaries so tests can open it by relative path. -->
<ItemGroup>
<None Update="sample.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>

</Project>
25 changes: 25 additions & 0 deletions src/tests/LangChain.Sources.Pdf.IntegrationTests/PdfSourceTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
namespace LangChain.Sources.Pdf.IntegrationTests;

[TestClass]
public class PdfSourceTests
{
    /// <summary>
    /// Loads the bundled sample PDF through <see cref="PdfPigPdfSource"/> and
    /// verifies that text from both of its pages ends up in the first document.
    /// </summary>
    [TestMethod]
    public async Task PdfPig_CheckText()
    {
        var source = new PdfPigPdfSource { Path = "sample.pdf" };

        var loaded = await source.LoadAsync();

        loaded.Should().NotBeEmpty();
        var text = loaded.First().Content;

        // Text expected on page 1.
        text.Should().Contain("A Simple PDF File");

        // Text expected on page 2.
        text.Should().Contain("Simple PDF File 2");
    }
}
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Integration test project for the LangChain.Sources.WebBase library. -->
<!-- NOTE(review): no test-framework packages are referenced here; presumably they come from a shared props file - verify. -->
<PropertyGroup>
<TargetFramework>net7.0</TargetFramework>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\libs\Sources\LangChain.Sources.WebBase\LangChain.Sources.WebBase.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
namespace LangChain.Sources.WebBase.IntegrationTests;

[TestClass]
public class WebBaseSourceTests
{
    /// <summary>
    /// Loads a live Wikipedia article through <see cref="WebBaseSource"/> and
    /// verifies that text from the article body and from the page footer is
    /// present in the first document. Requires network access.
    /// </summary>
    [TestMethod]
    public async Task CheckText()
    {
        var source = new WebBaseSource { Url = "https://en.wikipedia.org/wiki/Web_scraping" };

        var loaded = await source.LoadAsync();

        loaded.Should().NotBeEmpty();
        var text = loaded.First().Content;

        text.Should().Contain("Web scraping, web harvesting, or web data extraction is");
        text.Should().Contain("This page was last edited on");
    }
}

0 comments on commit e3ac3ca

Please sign in to comment.