-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup_data.py
More file actions
51 lines (42 loc) · 1.58 KB
/
setup_data.py
File metadata and controls
51 lines (42 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from langchain_community.vectorstores import Chroma
from langchain_upstage import UpstageEmbeddings
from langchain.docstore.document import Document
import json, os
from dotenv import load_dotenv
# Load environment variables via python-dotenv.
load_dotenv()
# NOTE(review): presumably meant to switch off Chroma telemetry. Confirm that
# CHROMA_TELEMETRY is a variable Chroma actually reads (its documentation
# describes ANONYMIZED_TELEMETRY) and that setting it here, after the Chroma
# import above, is early enough to take effect.
os.environ["CHROMA_TELEMETRY"] = "FALSE"
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as ``Document`` objects.

    Each non-blank line must be a JSON object with a ``"content"`` key, which
    becomes the document's page content, and an optional ``"metadata"``
    object. Metadata values that are not ``str``, ``int``, ``float`` or
    ``bool`` are serialized to JSON strings (non-ASCII characters are kept
    as-is), so every metadata value handed to ``Document`` is a scalar.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file.

    Returns:
        A list of ``Document`` objects, one per non-blank line, in file order.
        An empty or all-blank file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"content"`` key.
    """
    documents = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; passing them to json.loads would raise.
            if not line.strip():
                continue
            data = json.loads(line)
            content = data["content"]
            # `or {}` also covers an explicit `"metadata": null` in the record.
            metadata = data.get("metadata") or {}
            cleaned_metadata = {
                key: (
                    value
                    if isinstance(value, (str, int, float, bool))
                    else json.dumps(value, ensure_ascii=False)
                )
                for key, value in metadata.items()
            }
            documents.append(
                Document(page_content=content, metadata=cleaned_metadata)
            )
    return documents
# Read both source corpora up front, before any embedding work starts.
persona = load_jsonl("data/yjs-persona.jsonl")
info = load_jsonl("data/yjs-info.jsonl")

# --- Persona collection ---
print("유재석 페르소나 임베딩을 시작합니다.")
embedding = UpstageEmbeddings(model="solar-embedding-1-large")
# Both collections use the same embedding model and persist directory.
_shared_store_args = {
    "embedding": embedding,
    "persist_directory": "./chroma_db",
}
vectorstore = Chroma.from_documents(
    documents=persona,
    collection_name="yujaeseuk_persona",
    **_shared_store_args,
)
print("유재석 페르소나 임베딩 완료!")

# --- Info collection ---
# `vectorstore` is rebound here, so it ends up referring to the info collection.
print("유재석 정보 임베딩을 시작합니다.")
vectorstore = Chroma.from_documents(
    documents=info,
    collection_name="yujaeseuk_data",
    **_shared_store_args,
)
print("유재석 정보 임베딩 완료!")