Skip to content
This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Commit

Permalink
adding parsers and loaders
Browse files Browse the repository at this point in the history
  • Loading branch information
cfortuner committed Mar 3, 2023
1 parent 4256960 commit 0741ca3
Show file tree
Hide file tree
Showing 31 changed files with 1,055 additions and 192 deletions.
1 change: 0 additions & 1 deletion examples/src/qa-from-notes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import chalk from "chalk";
import {
CharacterTextSplitter,
FileLoader,
ListParser,
OpenAI,
promptTemplates,
} from "@promptable/promptable";
Expand Down
16 changes: 16 additions & 0 deletions packages/promptable/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,40 @@
"@babel/preset-env": "^7.20.2",
"@babel/preset-typescript": "^7.18.6",
"@jest/globals": "^29.4.1",
"@types/css": "^0.0.33",
"@types/html-to-text": "^9.0.0",
"@types/jest": "^29.4.0",
"@types/js-yaml": "^4.0.5",
"@types/marked": "^4.0.8",
"@types/node": "^18.11.18",
"@types/pdf-parse": "^1.1.1",
"@types/showdown": "^2.0.0",
"@types/uuid": "^9.0.0",
"babel-jest": "^29.4.1",
"dotenv": "^16.0.3",
"jest": "^29.4.1",
"ts-jest": "^29.0.5",
"tsup": "^6.5.0"
},
"dependencies": {
"@notionhq/client": "^2.2.3",
"@octokit/rest": "^19.0.7",
"@slack/web-api": "^6.8.1",
"@stdlib/nlp-sentencize": "^0.0.2",
"axios": "^1.2.4",
"chalk": "^4.1.2",
"cheerio": "1.0.0-rc.12",
"css": "^3.0.0",
"csv-parse": "^5.3.4",
"form-data": "^4.0.0",
"googleapis": "^112.0.0",
"gpt3-tokenizer": "^1.1.4",
"html-to-text": "^9.0.4",
"js-yaml": "^4.1.0",
"marked": "^4.2.12",
"openai": "^3.1.0",
"pdf-parse": "^1.1.1",
"showdown": "^2.1.0",
"stream-to-blob": "^2.0.1",
"typescript": "latest",
"uuid": "^9.0.0",
Expand Down
38 changes: 0 additions & 38 deletions packages/promptable/src/Document.ts

This file was deleted.

61 changes: 61 additions & 0 deletions packages/promptable/src/documents/Document.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { v4 as uuid } from "uuid";

/**
 * Free-form key/value data attached to a document.
 * Keys are arbitrary strings and values are untyped.
 */
export interface Metadata {
[key: string]: any;
}

/**
 * Base class for every document kind: it carries only a metadata bag.
 */
export class Document {
  /** Metadata attached to this document; an empty object when none was supplied. */
  metadata: Metadata;

  /**
   * @param metadata Optional metadata. The object is stored by reference
   *   (not copied); a fresh empty object is used when it is omitted.
   */
  constructor(metadata?: Metadata) {
    this.metadata = metadata || {};
  }
}

/**
 * A document whose payload is a string of text.
 */
export class TextDocument extends Document {
  /** The document's text content. */
  text: string;
  /** Optional embedding vector; never assigned by this class. */
  embedding?: number[];

  /**
   * @param text Text content to store.
   * @param metadata Optional metadata forwarded to the base class.
   */
  constructor(text: string, metadata?: Metadata) {
    super(metadata);

    this.text = text;
  }
}

/**
 * A document whose payload is an image held as raw bytes.
 */
export class ImageDocument extends Document {
  /** The image bytes, exactly as passed to the constructor. */
  image: Uint8Array;
  /** Optional embedding vector; never assigned by this class. */
  embedding?: number[];

  /**
   * @param image Image bytes to store.
   * @param metadata Optional metadata forwarded to the base class.
   */
  constructor(image: Uint8Array, metadata?: Metadata) {
    super(metadata);

    this.image = image;
  }
}

/**
 * A document whose payload is audio held as raw bytes.
 */
export class AudioDocument extends Document {
  /** The audio bytes, exactly as passed to the constructor. */
  audio: Uint8Array;
  /** Optional embedding vector; never assigned by this class. */
  embedding?: number[];

  /**
   * @param audio Audio bytes to store.
   * @param metadata Optional metadata forwarded to the base class.
   */
  constructor(audio: Uint8Array, metadata?: Metadata) {
    super(metadata);

    this.audio = audio;
  }
}

/**
 * Example of a multi-modal document: it pairs one text document with one
 * image document under a shared metadata bag. The class is not exported.
 */
class MultiModalDocument extends Document {
  /** The textual half of the pair. */
  textDoc: TextDocument;
  /** The visual half of the pair. */
  imageDoc: ImageDocument;

  /**
   * @param textDoc Text document to include.
   * @param imageDoc Image document to include.
   * @param metadata Optional metadata forwarded to the base class.
   */
  constructor(textDoc: TextDocument, imageDoc: ImageDocument, metadata?: Metadata) {
    super(metadata);

    this.textDoc = textDoc;
    this.imageDoc = imageDoc;
  }
}
15 changes: 0 additions & 15 deletions packages/promptable/src/embeddings/extractor.ts

This file was deleted.

2 changes: 1 addition & 1 deletion packages/promptable/src/embeddings/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import fs from "fs";
import chalk from "chalk";
import { EmbeddingsModelProvider } from "@providers/ModelProvider";
import { Document } from "../Document";
import { Document } from "../documents/Document";
import { FileEmbeddingsStore } from "./stores/FileEmbeddingsStore";
import { EmbeddingsStore } from "./stores/EmbeddingsStore";

Expand Down
4 changes: 2 additions & 2 deletions packages/promptable/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
export { BufferedChatMemory } from "src/memories/BufferedChatMemory";

import { Document } from "./Document";
import { Document } from "./documents/Document";
export type { Document };

// Prebuilt prompts
Expand Down Expand Up @@ -58,7 +58,7 @@ import { FileLoader } from "@loaders/FileLoader";
export { FileLoader, HTMLLoader };

// Parsing
import { JSONParser, CSVParser, Parser, ListParser } from "@prompts/Parser";
import { JSONParser, CSVParser, Parser, ListParser } from "@parsers/index";
export type { Parser };
export { JSONParser, CSVParser, ListParser };

Expand Down
2 changes: 1 addition & 1 deletion packages/promptable/src/indices/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { v4 as uuid } from "uuid";
import { Document } from "src/Document";
import { Document } from "src/documents/Document";

export abstract class Index {
constructor(documents: Document[] = []) {
Expand Down
75 changes: 58 additions & 17 deletions packages/promptable/src/loaders/FileLoader.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,66 @@
import { v4 as uuid } from "uuid";
import fs from "fs";
import {
AudioDocument,
Document,
ImageDocument,
TextDocument,
} from "src/documents/Document";
import { Loader } from ".";

// NOTE(review): this span is a rendered diff without +/- markers, so two
// versions of the class are interleaved and it does not compile as shown.
// Presumably the `path`/`meta` fields, the constructor and the zero-argument
// `load()` are the removed implementation, and `load(filepaths)` is the added
// one — confirm against the repository before editing.
export class FileLoader implements Loader {
path: string;
meta?: Record<string, any>;
// Loads every given path into a Document; directories are expanded recursively.
async load(filepaths: string[]): Promise<Document[]> {
const documents: Document[] = [];

constructor(path: string, meta?: Record<string, any>) {
this.path = path;
this.meta = meta;
}
for (const filepath of filepaths) {
try {
const stat = fs.statSync(filepath);

if (stat.isDirectory()) {
// Recurse into the directory, joining each entry name with "/".
const files = fs.readdirSync(filepath);
const dirDocuments = await this.load(
files.map((f) => `${filepath}/${f}`)
);
documents.push(...dirDocuments);
} else {
// Read the file as a Buffer; it is only converted to a string for text documents.
const data = await fs.promises.readFile(filepath);
// The extension is whatever follows the last "." of the last "/"-separated segment.
// NOTE(review): for a name with no ".", split(".").pop() yields the whole
// name, so the "No extension" error below only fires for an empty result
// (e.g. a name ending in ".").
const filename = filepath.split("/").pop();
const extension = filename?.split(".").pop();
if (!extension) {
// Thrown inside the try block, so it is logged by the catch below rather than propagated.
throw new Error(`No extension found for file ${filepath}`);
}

let document: Document;

// Choose the document class from the extension (the comparison is case-sensitive).
switch (extension) {
case "png":
case "jpg":
case "jpeg":
case "svg":
document = new ImageDocument(data, { source: filepath });
break;
case "wav":
case "mp4":
case "mp3":
document = new AudioDocument(data, { source: filepath });
break;
// Listed text extensions and every unrecognised extension are loaded as text.
case "txt":
case "csv":
case "json":
default:
document = new TextDocument(data.toString(), {
source: filepath,
});
break;
}

documents.push(document);
}
} catch (error) {
// Best effort: a failing path is logged and skipped; the loop continues.
console.error(`Error loading file ${filepath}: ${error}`);
}
}

async load() {
const data = await fs.promises.readFile(this.path, "utf-8");
return [
{
data,
meta: {
source: this.path,
...this.meta,
},
},
];
return documents;
}
}
41 changes: 41 additions & 0 deletions packages/promptable/src/loaders/GithubLoader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import { Octokit } from "@octokit/rest";
import { TextDocument } from "src/documents/Document";
import { Loader } from ".";

/**
 * Loads files from a GitHub repository as text documents using the
 * repository contents API.
 */
export class GithubLoader implements Loader {
  private octokit: Octokit;

  /**
   * @param token Token passed to Octokit as its `auth` option.
   */
  constructor(token: string) {
    this.octokit = new Octokit({ auth: token });
  }

  /**
   * Loads the file at `path`, or every file below it when `path` is a
   * directory.
   *
   * @param repo Repository in `owner/name` form.
   * @param path Path of a file or directory inside the repository.
   * @param ref Optional branch, tag or commit; when omitted the request is
   *   made without a ref and the `source` link points at `HEAD`.
   * @returns One `TextDocument` per file, each with a `source` URL in its
   *   metadata.
   * @throws Error when `repo` is not in `owner/name` form; request failures
   *   from Octokit propagate unchanged.
   */
  async load(repo: string, path: string, ref?: string) {
    const [owner, name] = repo.split("/");
    if (!owner || !name) {
      throw new Error(`Expected repo in "owner/name" form, got "${repo}"`);
    }

    const { data } = await this.octokit.repos.getContent({
      owner,
      repo: name,
      path,
      ref,
    });

    if (Array.isArray(data)) {
      // Directory listing: load every entry (recursing into sub-directories)
      // in parallel, then flatten the per-entry arrays.
      const nested: any[] = await Promise.all(
        data.map((item: any) => this.load(repo, item.path, ref))
      );

      return nested.flat();
    }

    // Single file: this response is already the result of fetching `path`
    // directly, so no second request is needed. Entries without `content`
    // produce an empty document.
    //@ts-ignore
    const text = Buffer.from(data.content || "", "base64").toString();
    // Without an explicit ref, link to HEAD instead of emitting "blob/undefined".
    const source = `https://github.com/${repo}/blob/${ref ?? "HEAD"}/${path}`;
    return [new TextDocument(text, { source })];
  }
}
38 changes: 38 additions & 0 deletions packages/promptable/src/loaders/GmailLoader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { google } from "googleapis";
import { TextDocument } from "src/documents/Document";
import { Loader } from ".";

/**
 * Loads a single Gmail message as a text document.
 */
export class GmailLoader implements Loader {
  private gmail: any;

  /**
   * @param credentials OAuth client settings; `client_id`, `client_secret`
   *   and the first entry of `redirect_uris` are read.
   * @param token Credentials object handed to `OAuth2.setCredentials`.
   */
  constructor(credentials: any, token: any) {
    const auth = new google.auth.OAuth2(
      credentials.client_id,
      credentials.client_secret,
      credentials.redirect_uris[0]
    );
    auth.setCredentials(token);
    this.gmail = google.gmail({ version: "v1", auth });
  }

  /**
   * Fetches the message in `full` format and returns its decoded body text.
   *
   * @param messageId Id of the message to fetch for the user `me`.
   * @returns A one-element array holding a `TextDocument` whose metadata
   *   contains a `source` link and a `headers` map (header name to value; a
   *   repeated header keeps its last value).
   */
  async load(messageId: string) {
    const { data: message } = await this.gmail.users.messages.get({
      userId: "me",
      id: messageId,
      format: "full",
    });

    const payload = message.payload ?? {};
    const text = GmailLoader.extractText(payload);

    const headers: Record<string, string> = {};
    for (const header of payload.headers ?? []) {
      headers[header.name] = header.value;
    }

    return [
      new TextDocument(text, {
        source: `https://mail.google.com/mail/u/0/#inbox/${messageId}`,
        headers,
      }),
    ];
  }

  /**
   * Decodes and concatenates the body data of a message part and of all of
   * its nested parts, in order. Parts without body data contribute nothing.
   */
  private static extractText(part: any): string {
    // Gmail returns body data base64url-encoded; Node's "base64" decoder
    // also accepts the URL-safe alphabet.
    const encoded: string | undefined = part?.body?.data;
    const own = encoded ? Buffer.from(encoded, "base64").toString("utf-8") : "";

    // Single-part messages have no `parts`; multipart messages may nest them.
    const children: any[] = part?.parts ?? [];
    const nested = children
      .map((child) => GmailLoader.extractText(child))
      .join("");

    return own + nested;
  }
}
Loading

0 comments on commit 0741ca3

Please sign in to comment.