-
-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Added Web and PDF loaders (#39)
* WebBaseLoader & open-source PDF lib --------- Co-authored-by: Evgenii Khoroshev <[email protected]>
- Loading branch information
1 parent
8a50d69
commit e3ac3ca
Showing
20 changed files
with
228 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
using UglyToad.PdfPig; | ||
|
||
namespace LangChain.Sources; | ||
|
||
/// <summary>
/// Document source that extracts the text of a PDF file using PdfPig.
/// </summary>
public class PdfPigPdfSource : ISource
{
    /// <summary>
    /// Path of the PDF file to open.
    /// </summary>
    public required string Path { get; init; }

    /// <inheritdoc/>
    /// <remarks>
    /// The PDF is parsed synchronously on the calling thread. The text of all pages is
    /// joined with a blank line and returned as the content of a single document.
    /// Parsing failures are reported through the returned task instead of being thrown.
    /// </remarks>
    public Task<IReadOnlyCollection<Document>> LoadAsync(CancellationToken cancellationToken = default)
    {
        // Honor a cancellation that was requested before any work started.
        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled<IReadOnlyCollection<Document>>(cancellationToken);
        }

        try
        {
            using PdfDocument document = PdfDocument.Open(Path, new ParsingOptions());
            var content = string.Join("\n\n", document.GetPages().Select(page => page.Text));

            var documents = (Document.Empty with
            {
                Content = content,
            }).AsArray();

            return Task.FromResult(documents);
        }
        catch (Exception exception)
        {
            // Keep the Task-based contract: surface the error as a faulted task.
            return Task.FromException<IReadOnlyCollection<Document>>(exception);
        }
    }
}
25 changes: 25 additions & 0 deletions
25
src/libs/Sources/LangChain.Sources.WebBase/LangChain.Sources.WebBase.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFrameworks>net4.6.2;netstandard2.0;net6.0;net7.0</TargetFrameworks>
    <!-- CA1031: "Do not catch general exception types" is suppressed for this project. -->
    <NoWarn>$(NoWarn);CA1031</NoWarn>
  </PropertyGroup>

  <ItemGroup Label="Usings">
    <Using Remove="System.Net.Http" />
  </ItemGroup>

  <PropertyGroup Label="NuGet">
    <Description>LangChain web document source.</Description>
    <!-- This package loads web pages, so it is tagged "web" rather than "pdf". -->
    <PackageTags>$(PackageTags);web</PackageTags>
  </PropertyGroup>

  <ItemGroup>
    <ProjectReference Include="..\LangChain.Sources.Abstractions\LangChain.Sources.Abstractions.csproj" />
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="AngleSharp" />
  </ItemGroup>

</Project>
56 changes: 56 additions & 0 deletions
56
src/libs/Sources/LangChain.Sources.WebBase/WebBaseSource.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
using AngleSharp; | ||
|
||
namespace LangChain.Sources; | ||
|
||
/// <summary>
/// Document source that downloads a web page with AngleSharp and returns the text
/// content of its &lt;html&gt; element.
/// </summary>
public class WebBaseSource : ISource
{
    /// <summary>
    /// Address of the page to load.
    /// </summary>
    public required string Url { get; init; }

    /// <inheritdoc/>
    public Task<IReadOnlyCollection<Document>> LoadAsync(CancellationToken cancellationToken = default)
    {
        // LoadCoreAsync is an async method, so any failure is already captured in the
        // returned task; no synchronous try/catch is needed here.
        return LoadCoreAsync(Url, cancellationToken);
    }

    /// <summary>
    /// Opens <paramref name="url"/>, removes script, style, meta and link elements,
    /// and wraps the remaining text of the page in a single document.
    /// </summary>
    /// <param name="url">Address of the page to open.</param>
    /// <param name="cancellationToken">Token used to cancel the page request.</param>
    /// <returns>A collection containing one document with the page text.</returns>
    /// <exception cref="NotSupportedException">The loaded page has no &lt;html&gt; element.</exception>
    protected async Task<IReadOnlyCollection<Document>> LoadCoreAsync(
        string url,
        CancellationToken cancellationToken = default)
    {
        var config = Configuration.Default.WithDefaultLoader();
        var context = BrowsingContext.New(config);
        var document = await context.OpenAsync(url, cancellationToken).ConfigureAwait(false);

        // Drop elements whose text is markup plumbing rather than page content.
        foreach (var element in document.QuerySelectorAll("script, style, meta, link"))
        {
            element.Remove();
        }

        var html = document.QuerySelector("html");
        if (html is null)
        {
            throw new NotSupportedException("Not supported for pages without <html> tag");
        }

        var documents = (Document.Empty with
        {
            Content = html.TextContent
        }).AsArray();

        return documents;
    }
}
17 changes: 17 additions & 0 deletions
17
...ests/LangChain.Sources.Pdf.IntegrationTests/LangChain.Sources.Pdf.IntegrationTests.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
<!-- Test project targeting net7.0 that references the LangChain.Sources.Pdf library project. -->
|
||
<PropertyGroup> | ||
<TargetFramework>net7.0</TargetFramework> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<ProjectReference Include="..\..\libs\Sources\LangChain.Sources.Pdf\LangChain.Sources.Pdf.csproj" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<!-- sample.pdf is copied to the output directory (PreserveNewest) so it sits next to the built test assembly. -->
<None Update="sample.pdf"> | ||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||
</None> | ||
</ItemGroup> | ||
|
||
</Project> |
25 changes: 25 additions & 0 deletions
25
src/tests/LangChain.Sources.Pdf.IntegrationTests/PdfSourceTests.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
namespace LangChain.Sources.Pdf.IntegrationTests; | ||
|
||
[TestClass]
public class PdfSourceTests
{
    /// <summary>
    /// Loads sample.pdf through <see cref="PdfPigPdfSource"/> and verifies that the
    /// first returned document contains text from both pages.
    /// </summary>
    [TestMethod]
    public async Task PdfPig_CheckText()
    {
        var source = new PdfPigPdfSource { Path = "sample.pdf" };

        var loaded = await source.LoadAsync();

        loaded.Should().NotBeEmpty();

        var firstDocument = loaded.First();

        // Text expected on page 1.
        firstDocument.Content.Should().Contain("A Simple PDF File");

        // Text expected on page 2.
        firstDocument.Content.Should().Contain("Simple PDF File 2");
    }
}
Binary file not shown.
11 changes: 11 additions & 0 deletions
11
...gChain.Sources.WebBase.IntegrationTests/LangChain.Sources.WebBase.IntegrationTests.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
<!-- Test project targeting net7.0 that references the LangChain.Sources.WebBase library project. -->
|
||
<PropertyGroup> | ||
<TargetFramework>net7.0</TargetFramework> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<ProjectReference Include="..\..\libs\Sources\LangChain.Sources.WebBase\LangChain.Sources.WebBase.csproj" /> | ||
</ItemGroup> | ||
|
||
</Project> |
22 changes: 22 additions & 0 deletions
22
src/tests/LangChain.Sources.WebBase.IntegrationTests/WebBaseSourceTests.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
namespace LangChain.Sources.WebBase.IntegrationTests; | ||
|
||
[TestClass]
public class WebBaseSourceTests
{
    /// <summary>
    /// Loads a Wikipedia article through <see cref="WebBaseSource"/> and verifies that
    /// the first returned document contains two expected phrases from the page.
    /// </summary>
    [TestMethod]
    public async Task CheckText()
    {
        var source = new WebBaseSource { Url = "https://en.wikipedia.org/wiki/Web_scraping" };

        var loaded = await source.LoadAsync();

        loaded.Should().NotBeEmpty();

        var firstDocument = loaded.First();
        firstDocument.Content.Should().Contain("Web scraping, web harvesting, or web data extraction is");
        firstDocument.Content.Should().Contain("This page was last edited on");
    }
}