-
Notifications
You must be signed in to change notification settings - Fork 0
/
ingest.py
68 lines (57 loc) · 2.36 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import json
from pathlib import Path
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_core.indexing import index, InMemoryRecordManager
import yaml
CUR_DIR = Path(__file__).parent
def main() -> None:
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
# TODO: Replace with non-local vectorstore.
vecstore = Chroma("dbt_demo", embedding_function=embedding, persist_directory=".")
# TODO: Using in memory record manager for demo purposes, replace with persistent
# manager.
record_manager = InMemoryRecordManager(namespace="dbt_demo")
docs = chunk_manifest(CUR_DIR / "semantic_manifest.yaml")
res = index(docs, record_manager, vecstore)
print(res)
"""
Questions for improving index/retrieval:
- Do we need to chunk and retrieve at all? How big is the typical manifest, could
it just all be in context?
- If the whole manifest doesn't fit, could just the names of all models/entities/
dimensions/metrics fit in context? And then model can choose to look up the
definitions of the most relevant objects?
- Assuming retrieval is needed, probably would get better results through hybrid
search that combines keyword and semantic search.
- What's the right information to be in each vector? Could it just be the name and
description of the object? Does it help to include parent descriptions (eg the
semantic model description)?
- Would parent-document retrieval help?
- Would query analysis, with things like query decomposition, help?
"""
def chunk_manifest(path):
with open(path, "r") as f:
manifest = yaml.safe_load(f)
docs = []
categories = ("entities", "dimensions", "measures")
for model in manifest["semantic_models"]:
for category in categories:
for val in model.get(category, []):
docs.append(
Document(
json.dumps(val, indent=2),
metadata={"category": category},
)
)
for metric in manifest["metrics"]:
docs.append(
Document(
json.dumps(metric, indent=2),
metadata={"category": "metrics"},
)
)
return docs
if __name__ == "__main__":
main()