-
Notifications
You must be signed in to change notification settings - Fork 4
/
main.py
130 lines (96 loc) · 4.69 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import json
import os
import time
from importlib import metadata
from typing import Sequence
import dotenv
import together
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from extract import Person, extract_person
# Read variables from a local .env file into the process environment before
# looking up the API key.
dotenv.load_dotenv()

TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
if TOGETHER_API_KEY is None or TOGETHER_API_KEY == "":
    # Stop at import time instead of failing later on the first API call.
    raise ValueError("TOGETHER_API_KEY is not set")

# Set the key on the module, then build the shared client used by
# get_embeddings.
together.api_key = TOGETHER_API_KEY
client = together.Together()
class CustomTogetherEmbeddingFn(EmbeddingFunction):
    """Chroma embedding function that defers to ``get_embeddings``."""

    def __call__(self, input: Documents) -> Embeddings:
        """Return what ``get_embeddings`` produces for ``input``."""
        vectors = get_embeddings(input)
        return vectors
def get_embeddings(texts: list[str]) -> list[Sequence[float]]:
    """Embed each text through the module-level Together ``client``.

    Newlines inside every text are replaced by single spaces before the
    request is sent.

    Args:
        texts: Strings to embed.

    Returns:
        The ``embedding`` attribute of every entry in the response's
        ``data`` list, in response order. An empty ``texts`` yields ``[]``
        without issuing a request.
    """
    if not texts:
        # Nothing to embed, so skip the round trip to the API.
        return []

    flattened = [text.replace("\n", " ") for text in texts]
    outputs = client.embeddings.create(
        input=flattened,
        model="togethercomputer/m2-bert-80M-2k-retrieval",
    )
    # Iterate the response entries directly rather than indexing by position.
    # NOTE(review): assumes the API returns exactly one entry per input, in
    # input order - confirm against the Together embeddings reference.
    return [item.embedding for item in outputs.data]
# Open the persistent Chroma client at path "chromadb" and get (or create)
# the collection, embedding through CustomTogetherEmbeddingFn.
chroma_client = chromadb.PersistentClient(path="chromadb")
collection = chroma_client.get_or_create_collection(
    name="background_embeddings",
    embedding_function=CustomTogetherEmbeddingFn(),
)
# Messages with fewer characters than this are skipped.
MIN_MESSAGE_LENGTH = 150

# Load the JSON array of messages from messages-htn-calhacks.json.
# The explicit encoding keeps the read independent of the platform default.
with open("messages-htn-calhacks.json", "r", encoding="utf-8") as f:
    tree_messages = json.load(f)

all_extracted: list[Person] = []
for i, message in enumerate(tree_messages):
    msg = message["String"]
    if len(msg) < MIN_MESSAGE_LENGTH:
        print(f"Skipping message {i}, too short.")
        continue

    name = message["Name"]

    # The collection ID combines the sender name with the message's position
    # in the input array.
    ident = f"{name}_{i}"
    existing = collection.get(ids=[ident], include=["metadatas"])
    if existing["ids"]:
        # Already stored: rebuild the Person from the saved metadata instead
        # of extracting it again. The local is named `stored` so it does not
        # rebind the module-level `importlib.metadata` import.
        stored = existing["metadatas"][0]
        person = Person(
            background=stored["background"],
            interests=stored["interests"],
            major=stored.get("major", ""),
            name=stored.get("name", ""),
            school=stored.get("school", ""),
        )
        all_extracted.append(person)
        print(f"Skipping {ident}, already exists.")
        continue

    # Extract the person from the raw message text.
    person = extract_person(name, msg)
    all_extracted.append(person)

    # Pause one second after every extraction.
    # NOTE(review): presumably a rate-limit guard - confirm.
    time.sleep(1)

    collection.upsert(
        ids=[ident],
        documents=[person.background + " . Interests: " + person.interests],
        metadatas=[
            {
                "name": name,
                "school": str(person.school),
                "interests": person.interests,
                "background": person.background,
                "major": person.major,
            }
        ],
    )

    print(f"\nEmbedded person {i}: {person.name} ({person.school})")
    print(person)

    # Pause again after the upsert, before moving to the next message.
    time.sleep(1)

# Write every collected person (newly extracted or rebuilt from metadata)
# to people.json.
with open("people.json", "w", encoding="utf-8") as f:
    json.dump([person.model_dump() for person in all_extracted], f, indent=2)
# print(person)
# collection = chroma_client.create_collection(name="my_collection")
# collection.add(
# documents=["This is a document",
# "This is another document",
# "I am currently a student (MBA) at Carnegie Mellon. My undergrad major is Computer Science. I worked for 4 years as a software engineer (full stack dev ). Participated in various hackathons such as RedHacks by Cornell, NASA Space App Challenge, and HackPrinceton. Very interested in AR/VR-related technology. I am very excited to create something great! Looking forward to creating something new in the healthcare or sustainability track.", # 3: VR healthcare
# "I'm a current senior at Berkeley studying CS. My main technical passions are computer vision, ML, AR/VR, but I'm excited about solving all types of problems. My most fluent languages are Python, Java, and C++. I have a good amount of experience with full stack development across a couple different frameworks.", # 4: ML, AR/VR
# "I have experience working in all sorts of technology stacks from frontend, backend, and also have experience working with LLM, and machine learning. I'm thinking about make a cool mobile app for the entertainment track.", # 5: LLM, ML
# ],
# ids=["1", "2", "3", "4", "5"]
# )
# results = collection.query(
# query_texts=[
# "LLM experience"],
# n_results=2
# )
# print(results)
# Query the collection with one sample self-introduction, asking for ten
# results, and print the raw response.
sample_intro = "I'm a current senior at Berkeley studying CS. My main technical passions are computer vision, ML, AR/VR, but I'm excited about solving all types of problems. My most fluent languages are Python, Java, and C++. I have a good amount of experience with full stack development across a couple different frameworks."
results = collection.query(query_texts=[sample_intro], n_results=10)
print(results)