Skip to content

Commit

Permalink
feat: Switch to API for Unstructured Component (#3671)
Browse files Browse the repository at this point in the history
* FEAT: Switch to API for Unstructured Component

* chore: new lock

---------

Co-authored-by: Gabriel Luiz Freitas Almeida <[email protected]>
  • Loading branch information
erichare and ogabrielluiz authored Sep 4, 2024
1 parent d93c907 commit de1fdff
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 40 deletions.
39 changes: 31 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ bson = "^0.5.10"
lark = "^1.2.2"
jq = "^1.8.0"
clickhouse-connect = {version = "0.7.19", optional = true, extras = ["clickhouse-connect"]}
langchain-unstructured = "^0.1.2"


[tool.poetry.group.dev.dependencies]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
import os

from typing import List

from langflow.custom import Component
from langflow.inputs import FileInput, SecretStrInput
from langflow.template import Output
from langflow.schema import Data

from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_unstructured import UnstructuredLoader


class UnstructuredComponent(Component):
display_name = "Unstructured"
description = "Unstructured data loader"
description = "Uses Unstructured.io to extract clean text from raw source documents. Supports: PDF, DOCX, TXT"
documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
trace_type = "tool"
icon = "Unstructured"
Expand All @@ -23,27 +21,29 @@ class UnstructuredComponent(Component):
name="file",
display_name="File",
required=True,
info="The path to the file with which you want to use Unstructured to parse",
info="The path to the file with which you want to use Unstructured to parse. Supports: PDF, DOCX, TXT",
file_types=["pdf", "docx", "txt"], # TODO: Support all unstructured file types
),
SecretStrInput(
name="api_key",
display_name="API Key",
required=False,
info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used",
display_name="Unstructured.io Serverless API Key",
required=True,
info="Unstructured API Key. Create at: https://app.unstructured.io/",
),
]

outputs = [
Output(name="data", display_name="Data", method="load_documents"),
]

def build_unstructured(self) -> UnstructuredFileLoader:
os.environ["UNSTRUCTURED_API_KEY"] = self.api_key

def build_unstructured(self) -> UnstructuredLoader:
file_paths = [self.file]

loader = UnstructuredFileLoader(file_paths)
loader = UnstructuredLoader(
file_paths,
api_key=self.api_key,
partition_via_api=True,
)

return loader

Expand Down
17 changes: 0 additions & 17 deletions src/backend/base/langflow/initial_setup/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import os
import shutil
import time
import nltk
from collections import defaultdict
from copy import deepcopy
from datetime import datetime, timezone
Expand Down Expand Up @@ -613,19 +612,3 @@ def initialize_super_user_if_needed():
get_variable_service().initialize_user_variables(super_user.id, session)
create_default_folder_if_it_doesnt_exist(session, super_user.id)
logger.info("Super user initialized")


# Function to download NLTK packages if not already downloaded
def download_nltk_resources():
    """Download the NLTK packages listed below that cannot be found.

    Each package is looked up with ``nltk.data.find`` under
    ``"<category>/<package>"``; when that lookup raises ``LookupError`` the
    package is fetched with ``nltk.download``.
    """
    # (category, package) pairs, in the order they are checked.
    required = (
        ("corpora", "wordnet"),
        ("taggers", "averaged_perceptron_tagger"),
        ("tokenizers", "punkt"),
        ("tokenizers", "punkt_tab"),
    )

    for kind, name in required:
        resource_path = f"{kind}/{name}"
        try:
            nltk.data.find(resource_path)
        except LookupError:
            # Only the bare package name is passed to the downloader.
            nltk.download(name)
3 changes: 0 additions & 3 deletions src/backend/base/langflow/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,6 @@ async def exception_handler(request: Request, exc: Exception):

FastAPIInstrumentor.instrument_app(app)

# Get necessary NLTK packages
# download_nltk_resources()

return app


Expand Down

0 comments on commit de1fdff

Please sign in to comment.