1. 프로젝트 개요
1.1 프로젝트 목적
데이터 파일을 전처리하고 임베딩 벡터로 변환한 뒤,
이를 **벡터 스토어(Vector Store)**에 저장하여 Retrieval Augmented Generation(RAG) 방식으로
일반적인 대화가 가능한 나만의 챗봇을 구현한다.
1.2 진행 절차
✅ 학습 데이터 준비 및 처리
✅ 학습 데이터 Embedding
✅ Embedding 데이터 저장
✅ 질문 시 관련 문맥 검색 → LLM 답변 생성
✅ 챗봇 인터랙션 실행
2. 구현하기
2.1 ํ์ต ๋ฐ์ดํฐ ์ฒ๋ฆฌํ๊ธฐ : 01_chatbot_dataget.py
์์ ๋ฐ์ดํฐ(my_data.txt
)๋ฅผ ๋ถ๋ฌ์์ CSV๋ก ์ ๋ฆฌํ๋ค.
import pandas as pd
import re
def remove_newlines(text):
    """Flatten *text* onto a single line.

    Every newline character is replaced by a space, and then each run of
    consecutive spaces is collapsed into one space.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. Leading and trailing spaces are collapsed
        like any other run but are not stripped.
    """
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text
def text_to_df(data_file):
    """Parse a plain-text data file into a DataFrame of titled sections.

    The file is read as UTF-8 and split into sections on blank lines. In
    each section the first line becomes the title and the remaining lines,
    joined with spaces, become the text. The text column is then passed
    through ``remove_newlines``.

    Args:
        data_file: Path of the text file to read.

    Returns:
        A DataFrame with the columns ``title`` and ``text``, one row per
        non-blank section (no rows for an empty file).
    """
    rows = []
    with open(data_file, 'r', encoding="utf-8") as file:
        text = file.read()

    for section in text.split('\n\n'):
        # Three or more consecutive newlines, or a newline at the very end
        # of the file, leave stray '\n' characters on the section edges.
        # Without trimming, the first "line" would be empty and the real
        # title would be pushed into the text column.
        section = section.strip('\n')
        if not section.strip():
            continue
        lines = section.split('\n')
        rows.append([lines[0], ' '.join(lines[1:])])

    df = pd.DataFrame(rows, columns=['title', 'text'])
    df['text'] = df['text'].apply(remove_newlines)
    return df
# Example run
# Convert the raw text file into a (title, text) table and save it as CSV
# for the embedding step.
df = text_to_df('my_data.txt') # <- put the path of your own data file here
df.to_csv('processed.csv', index=False, encoding='utf-8')
# Bare expression: shows a preview when run as the last line of a notebook cell.
df.head()
2.2 데이터 임베딩하기 : 02_chatbot_embedding.py
OpenAI 임베딩 모델을 활용하여 텍스트를 벡터화한다.
!pip install openai tiktoken python-dotenv pandas
import os, time
import pandas as pd
import tiktoken
from openai import OpenAI
from dotenv import load_dotenv
# Load the API key: load_dotenv() runs first, then the key is read from the
# OPENAI_API_KEY environment variable.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 1500  # NOTE(review): not referenced in the visible code

df = pd.read_csv("processed.csv")
df.columns = ['title','text']

# read_csv parses empty fields as NaN, so a section that had a title but no
# body comes back as a float. tokenizer.encode() would raise on it, and there
# is nothing to embed anyway, so such rows are dropped. The cast guards
# against purely numeric text that read_csv inferred as a number.
df = df.dropna(subset=['text']).reset_index(drop=True)
df['text'] = df['text'].astype(str)

tokenizer = tiktoken.get_encoding(embedding_encoding)
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
# Embedding helper
def get_embedding(text, model=embedding_model, max_retries=5):
    """Return the embedding vector of *text*, retrying on failure.

    Newlines in *text* are replaced by spaces before the request is sent
    through the module-level ``client``. After a failed attempt the function
    waits ``2 ** attempt`` seconds (2, 4, 8, ...) before trying again.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The embedding taken from the first item of the API response.

    Raises:
        Exception: If every attempt failed. The last API error is attached
            as the cause.
    """
    text = text.replace("\n", " ")
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = client.embeddings.create(input=[text], model=model)
            return response.data[0].embedding
        except Exception as e:
            last_error = e
            print(f"Error: {e}. Retrying...")
            # No point in sleeping after the final attempt: nothing follows
            # it except the failure below.
            if attempt < max_retries:
                time.sleep(2**attempt)
    raise Exception(f"Failed embedding: {text}") from last_error
# Embed the text of every row (one get_embedding call per row), then persist
# the table, including the new "embeddings" column, for the search step.
df["embeddings"] = df["text"].apply(get_embedding)
df.to_csv("embeddings.csv", index=False)
# Bare expression: shows a preview when run as the last line of a notebook cell.
df.head()
2.3 문맥 검색 함수 : 04_search.py
질문과 저장된 임베딩을 비교해 문맥(Context)을 뽑는다.
import numpy as np
from scipy import spatial
def distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine"):
    """Return the distance from *query_embedding* to each stored embedding.

    Args:
        query_embedding: The vector to compare against.
        embeddings: Iterable of vectors with the same length as
            *query_embedding*.
        distance_metric: One of ``"cosine"``, ``"L1"``, ``"L2"`` or
            ``"Linf"``.

    Returns:
        A list with one distance per entry of *embeddings*, in the same
        order.

    Raises:
        KeyError: If *distance_metric* is not one of the supported names.
    """
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    # Resolve the metric once instead of once per embedding.
    metric = distance_metrics[distance_metric]
    return [metric(query_embedding, emb) for emb in embeddings]
def create_context(question, df, max_len=1800):
    """Collect the stored texts closest to *question* as one context string.

    The question is embedded with ``embedding_model`` through the
    module-level ``client``. Every row of *df* is ranked by its distance to
    the question embedding (default metric of ``distances_from_embeddings``),
    and row texts are taken in that order until the token budget is used up.

    Args:
        question: The user's question.
        df: DataFrame with ``text``, ``n_tokens`` and ``embeddings`` columns.
            ``embeddings`` may hold sequences of floats or their string form
            (which is what a CSV round trip produces). A ``distances``
            column is written to *df* as a side effect.
        max_len: Token budget. Each selected row costs ``n_tokens + 4``.

    Returns:
        The selected texts joined by a ``###`` separator surrounded by blank
        lines; an empty string when even the closest row exceeds the budget.
    """
    import ast  # local import keeps this block self-contained

    def _as_vector(raw):
        # SECURITY: this used to call eval() on the CSV cell, which would
        # execute arbitrary code from a tampered file. literal_eval only
        # parses Python literals, which is all a stored embedding is.
        if isinstance(raw, str):
            raw = ast.literal_eval(raw)
        return np.array(raw)

    q_embeddings = client.embeddings.create(
        input=[question], model=embedding_model
    ).data[0].embedding

    df['distances'] = distances_from_embeddings(
        q_embeddings, df['embeddings'].apply(_as_vector).values
    )

    returns, cur_len = [], 0
    for _, row in df.sort_values('distances', ascending=True).iterrows():
        # The extra 4 tokens per row are counted against the budget as well.
        cur_len += row['n_tokens'] + 4
        if cur_len > max_len:
            break
        returns.append(row["text"])
    return "\n\n###\n\n".join(returns)
2.4 ๋ต๋ณ ์์ฑ ํจ์ : rag_search.py
๊ฒ์๋ ๋ฌธ๋งฅ์ ํ์ฉํด LLM์ด ๋ต๋ณํ๋ค. (๋ฒ์ฉ ์ฑ๋ด์ด๋ฏ๋ก ํธํ ๋งฅ๋ฝ ์ ๊ฑฐ!)
def answer_question(question, conversation_history, df):
    """Answer *question* with the chat model, using context retrieved from *df*.

    A prompt is built from the retrieved context and the question, appended
    to *conversation_history* as a ``user`` message, and the whole history
    is sent to the chat completion API.

    Args:
        question: The user's question.
        conversation_history: List of chat messages (dicts with ``role`` and
            ``content``). It is modified in place: the prompt for this turn
            is appended. The assistant reply is NOT appended here.
        df: DataFrame passed on to ``create_context``.

    Returns:
        The reply text with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    context = create_context(question, df)
    # The prompt text is intentionally Korean (it is sent to the model).
    # It tells the model to answer from the given context and to say it
    # does not know when the context has no answer.
    prompt = f"""당신은 친절하고 똑똑한 AI 비서입니다.
주어진 문맥을 참고하여 질문에 답하세요.
만약 문맥에서 답을 찾을 수 없다면 '잘 모르겠습니다'라고 대답하세요.
문맥:
{context}
질문: {question}
답변:"""
    # NOTE(review): the full prompt, including the retrieved context, is
    # stored in the history, so the history grows by the context size on
    # every turn. Kept as-is because callers share this list.
    conversation_history.append({"role": "user", "content": prompt})
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation_history,
        temperature=0.7,
    )
    # message.content is optional in the API response; guard before strip().
    content = response.choices[0].message.content
    return (content or "").strip()
2.5 챗봇 실행하기 : 05_rag_chatbot.py
import pandas as pd
# Load the table produced by the embedding step and start with an empty
# chat history.
df = pd.read_csv("embeddings.csv")
conversation_history = []

# Greeting shown to the user (Korean): start notice plus how to quit.
print("나만의 챗봇 시작! (종료하려면 exit 입력)")

while True:
    user_input = input("You: ")
    # Ignore surrounding whitespace so that "exit " also quits.
    if user_input.strip().lower() == "exit":
        break
    # A blank line has nothing to embed or answer; ask again.
    if not user_input.strip():
        continue
    answer = answer_question(user_input, conversation_history, df)
    print("Bot:", answer)
    # answer_question appends the user turn; the reply is recorded here.
    conversation_history.append({"role":"assistant","content":answer})
🚀 실행 방법 (구글 코랩)
- 코랩 새 노트북 열기
- 위 코드 블록을 순서대로 실행
- 본인 데이터(my_data.txt) 업로드
- 마지막 챗봇 실행 부분에서 자유롭게 대화
답글 남기기