"""
FastAPI server for BM25 + FAISS ensemble search over your dictionary
---------------------------------------------------------------------

Install deps:
    pip install -U fastapi uvicorn langchain-community langchain-text-splitters \
        langchain-huggingface sentence-transformers faiss-cpu rank_bm25
    Optional (pretty printing/logging): pip install rich

Run:
    uvicorn app:app --host 0.0.0.0 --port 8001 --workers 1

Env overrides (optional):
    export DICT_PATHS="./dict_ko_2letter_20250813_181113.json,./completed_20250813_181113_dict_progress.json"
    export EMB_MODEL="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    export BM25_K=5
    export FAISS_K=10

Notes:
    - This server includes a substring booster for Hangul queries to improve recall.
    - Uses a multilingual embedding model by default for better KR queries.
"""
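# Example requests (illustrative; adjust host/port to your deployment):
#   curl "http://localhost:8001/health"
#   curl "http://localhost:8001/search?q=apple&k=5&w_bm25=0.6&w_faiss=0.4"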
from __future__ import annotations
import os
import json
import re
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from fastapi import FastAPI, Query, HTTPException
from pydantic import BaseModel, Field
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
try:
    # Prefer deprecation-safe embedding wrapper
    from langchain_huggingface import HuggingFaceEmbeddings as HFEmb
except Exception:
    from langchain_community.embeddings import HuggingFaceEmbeddings as HFEmb
# ---- Config ----
DEFAULT_FILES = [
    "dict_ko_2letter_20250813_181113.json",
    "completed_20250813_181113_dict_progress.json",
]
EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
BM25_K = int(os.getenv("BM25_K", 5))
FAISS_K = int(os.getenv("FAISS_K", 10))

# Globals (loaded at startup)
DOCS: List[Document] = []
BM25: Optional[BM25Retriever] = None
FAISS_RET = None
ENTRIES_COUNT = 0
# ---------- I/O: Load JSONs ----------
def _resolve_paths(paths: List[str]) -> List[Path]:
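    """Resolve input paths, falling back to /mnt/data/<name> when a given path does not exist."""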
out: List[Path] = []
for p in paths:
pp = Path(p)
if not pp.exists():
alt = Path(“/mnt/data”) / pp.name
if alt.exists():
pp = alt
out.append(pp)
return out
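# load_entries() accepts several JSON shapes (illustrative sketches; field names
# mirror the normalization below):
#   {"words":   [{"word": "...", "pos": "...", "definition_en": "...", ...}]}
#   {"entries": [ ...same item shape as "words"... ]}
#   {"documents": [{"page_content": "...", "metadata": {...}}, ...]}  # LangChain export style
#   [ {...}, ... ]                                                    # bare list of items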
def load_entries(paths: List[Path]) -> List[Dict[str, Any]]:
    """Load entries from various JSON shapes and normalize to a common schema."""
entries: List[Dict[str, Any]] = []
def norm(w: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(w, dict) or not w.get("word"):
return {}
return {
"word": w.get("word", ""),
"pos": w.get("pos", ""),
"definition_en": w.get("definition_en", ""),
"example_en": w.get("example_en", ""),
"definition_target": w.get("definition_target", ""),
"example_target": w.get("example_target", ""),
"target_lang": w.get("target_lang", "") or w.get("targetLanguage", ""),
"rarity": w.get("rarity", 3),
"confidence": w.get("confidence", 0.0),
"prefix": w.get("prefix", ""),
}
for p in paths:
if not p.exists():
continue
try:
data = json.loads(p.read_text(encoding="utf-8"))
except Exception as e:
print(f"[WARN] Failed to read {p}: {e}")
continue
added = 0
if isinstance(data, dict):
if isinstance(data.get("words"), list):
for w in data["words"]:
e = norm(w)
if e:
entries.append(e)
added += 1
if isinstance(data.get("entries"), list):
for w in data["entries"]:
e = norm(w)
if e:
entries.append(e)
added += 1
if isinstance(data.get("documents"), list):
# LangChain export style {"documents": [{page_content, metadata}, ...]}
for d in data["documents"]:
if not isinstance(d, dict):
continue
meta = d.get("metadata") or {}
pc = (d.get("page_content") or "")
lines = [ln.strip() for ln in pc.splitlines() if ln.strip()]
bucket = {}
for ln in lines:
low = ln.lower()
if low.startswith("word:"):
bucket["word"] = ln.split(":", 1)[1].strip()
elif low.startswith("part of speech:") or low.startswith("pos:"):
bucket["pos"] = ln.split(":", 1)[1].strip()
elif low.startswith("english definition:") or low.startswith("en:"):
bucket["definition_en"] = ln.split(":", 1)[1].strip()
elif low.startswith("english example:") or low.startswith("ex:"):
bucket["example_en"] = ln.split(":", 1)[1].strip()
elif low.startswith("korean definition:") or low.startswith("ko:"):
bucket["definition_target"] = ln.split(":", 1)[1].strip()
elif low.startswith("korean example:") or low.startswith("ko_ex:"):
bucket["example_target"] = ln.split(":", 1)[1].strip()
e = {
"word": meta.get("word", "") or bucket.get("word", ""),
"pos": meta.get("pos", "") or bucket.get("pos", ""),
"definition_en": bucket.get("definition_en", ""),
"example_en": bucket.get("example_en", ""),
"definition_target": bucket.get("definition_target", ""),
"example_target": bucket.get("example_target", ""),
"target_lang": meta.get("target_language", meta.get("target_lang", "")) or "ko",
"rarity": meta.get("rarity", 3),
"confidence": meta.get("confidence", 0.0),
"prefix": meta.get("prefix", ""),
}
if e.get("word"):
entries.append(e)
added += 1
elif isinstance(data, list):
for w in data:
e = norm(w)
if e:
entries.append(e)
added += 1
print(f"[LOAD] {p.name}: {added} entries")
return entries
# ---------- Convert to LangChain Documents ----------
def to_documents(entries: List[Dict[str, Any]]) -> List[Document]:
    docs: List[Document] = []
    for e in entries:
        word = (e.get("word") or "").strip()
        if not word:
            continue
        pos = e.get("pos", "")
        de = e.get("definition_en", "")
        ee = e.get("example_en", "")
        dt = e.get("definition_target", "")
        et = e.get("example_target", "")
        content_lines = [
            f"{word} ({pos})".strip(),
            f"EN: {de}".strip(),
            f"EX: {ee}".strip(),
        ]
        if dt:
            content_lines.append(f"KO: {dt}")
        if et:
            content_lines.append(f"KO_EX: {et}")
        page_content = "\n".join([ln for ln in content_lines if ln and ln != "()"])
        metadata = {
            "word": word,
            "pos": pos,
            "prefix": e.get("prefix", ""),
            "rarity": e.get("rarity", 3),
            "confidence": e.get("confidence", 0.0),
            "target_lang": e.get("target_lang", "") or "ko",
        }
        docs.append(Document(page_content=page_content, metadata=metadata))
    return docs
# ---------- Utilities ----------
def _has_hangul(text: str) -> bool:
    # Rough check: any character between U+3131 (Hangul Compatibility Jamo) and U+D7A3 (last Hangul syllable).
    return any('\u3131' <= ch <= '\ud7a3' for ch in text)
def _substring_fallback(query: str, docs: List[Document], k: int = 10) -> List[Document]:
    terms = [t for t in re.split(r"\s+", query.strip()) if t]
    if not terms:
        return []
    scored: List[Tuple[int, Document]] = []
    for d in docs:
        pc = d.page_content or ""
        score = sum(pc.count(t) for t in terms)
        if score > 0:
            scored.append((score, d))
    scored.sort(key=lambda x: x[0], reverse=True)
    return [d for _, d in scored[:k]]
def _set_k_if_possible(retriever, k: int):
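    """Best-effort k update: try the retriever's .k attribute, then search_kwargs["k"]."""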
try:
retriever.k = k
except Exception:
try:
            retriever.search_kwargs["k"] = k
except Exception:
pass
def _invoke_safe(retriever, q: str) -> List[Document]:
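    """Invoke the retriever, falling back to the legacy get_relevant_documents() API on failure."""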
try:
return retriever.invoke(q)
except Exception:
return retriever.get_relevant_documents(q)
def _parse_fields(doc: Document) -> Dict[str, Any]:
    meta = doc.metadata or {}
    out = {
        "word": meta.get("word", ""),
        "pos": meta.get("pos", ""),
        "prefix": meta.get("prefix", ""),
        "confidence": meta.get("confidence", 0.0),
        "definition_en": "",
        "example_en": "",
        "definition_ko": "",
        "example_ko": "",
    }
    for ln in (doc.page_content or "").splitlines():
        if ln.startswith("EN:"):
            out["definition_en"] = ln[3:].strip()
        elif ln.startswith("EX:"):
            out["example_en"] = ln[3:].strip()
        elif ln.startswith("KO:"):
            out["definition_ko"] = ln[3:].strip()
        elif ln.startswith("KO_EX:"):
            out["example_ko"] = ln[6:].strip()
    return out
# ---------- Build Indexes ----------
def build_indexes(paths: List[str], bm25_k: int = BM25_K, faiss_k: int = FAISS_K, emb_model: str = EMB_MODEL):
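    """Load the dictionary files and (re)build the module-level BM25 and FAISS retrievers."""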
global DOCS, BM25, FAISS_RET, ENTRIES_COUNT
files = _resolve_paths(paths)
entries = load_entries(files)
if not entries:
        raise RuntimeError("No entries loaded. Check DICT_PATHS or file paths.")
DOCS = to_documents(entries)
ENTRIES_COUNT = len(DOCS)
# BM25
BM25 = BM25Retriever.from_documents(DOCS)
BM25.k = bm25_k
# FAISS
embed = HFEmb(model_name=emb_model)
vs = FAISS.from_documents(DOCS, embed)
FAISS_RET = vs.as_retriever(search_kwargs={"k": faiss_k})
# ---------- Search Logic (manual merge for dynamic weights) ----------
def ensemble_search(q: str, k: int = 10, w_bm25: float = 0.6, w_faiss: float = 0.4) -> List[Dict[str, Any]]:
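    """Merge BM25 and FAISS rankings with weighted rank-based scores, plus a substring boost for Hangul queries."""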
if BM25 is None or FAISS_RET is None:
        raise RuntimeError("Indexes not built yet")
# Adjust per-query k
_set_k_if_possible(BM25, max(k, BM25_K))
_set_k_if_possible(FAISS_RET, max(k, FAISS_K))
bm25_docs = _invoke_safe(BM25, q)
faiss_docs = _invoke_safe(FAISS_RET, q)
# Hangul-aware substring booster
sub_docs: List[Document] = []
sub_weight = 0.15
if _has_hangul(q):
sub_docs = _substring_fallback(q, DOCS, k=max(10, k))
sub_weight = 0.5
# Rank maps
def rank_map(docs: List[Document]) -> Dict[str, int]:
m = {}
for i, d in enumerate(docs):
w = (d.metadata or {}).get("word", "")
if w and w not in m:
m[w] = i
return m
r_bm = rank_map(bm25_docs)
r_fa = rank_map(faiss_docs)
# Substring counts for normalization
sub_counts: Dict[str, int] = {}
max_sub = 0
if sub_docs:
terms = [t for t in re.split(r"\s+", q.strip()) if t]
for d in sub_docs:
pc = d.page_content or ""
cnt = sum(pc.count(t) for t in terms)
w = (d.metadata or {}).get("word", "")
if w:
sub_counts[w] = cnt
max_sub = max(max_sub, cnt)
# Collect pool
pool: Dict[str, Tuple[float, Document]] = {}
for d in sub_docs + bm25_docs + faiss_docs:
w = (d.metadata or {}).get("word", "")
if not w:
# fallback to first line
w = (d.page_content or "").splitlines()[0].split(" ")[0]
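        # Rank-fusion scoring: each retriever contributes weight / (1 + rank);
        # documents absent from a ranking get the 1e9 sentinel rank, so they add ~0.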
bm_score = w_bm25 * (1.0 / (1 + r_bm.get(w, 1_000_000_000)))
fa_score = w_faiss * (1.0 / (1 + r_fa.get(w, 1_000_000_000)))
sub_score = 0.0
if max_sub > 0 and w in sub_counts:
sub_score = sub_weight * (sub_counts[w] / max_sub)
score = bm_score + fa_score + sub_score
prev = pool.get(w)
if (prev is None) or (score > prev[0]):
pool[w] = (score, d)
ordered = sorted(pool.values(), key=lambda x: x[0], reverse=True)
top = [d for _, d in ordered[:k]]
return [_parse_fields(d) for d in top]
# ---------- API ----------
class SearchResponse(BaseModel):
query: str
k: int
results: List[Dict[str, Any]]
class HealthResponse(BaseModel):
    status: str = Field("ok")
entries: int
bm25_k: int
faiss_k: int
emb_model: str
app = FastAPI(title="Dictionary Search API", version="1.0.0")
@app.on_event("startup")
def _startup():
    paths_env = os.getenv("DICT_PATHS")
    if paths_env:
        paths = [p.strip() for p in paths_env.split(",") if p.strip()]
    else:
        paths = DEFAULT_FILES
    print(f"[STARTUP] Loading files: {paths}")
    build_indexes(paths, bm25_k=BM25_K, faiss_k=FAISS_K, emb_model=EMB_MODEL)
    print(f"[READY] Loaded {ENTRIES_COUNT} entries. Model={EMB_MODEL}")
@app.get("/health", response_model=HealthResponse)
def health():
return HealthResponse(entries=ENTRIES_COUNT, bm25_k=BM25_K, faiss_k=FAISS_K, emb_model=EMB_MODEL)
@app.get("/search", response_model=SearchResponse)
def search(
    q: str = Query(..., min_length=1, description="Search query (KR/EN supported)"),
    k: int = Query(10, ge=1, le=50),
    w_bm25: Optional[float] = Query(None, description="BM25 weight (0-1)"),
    w_faiss: Optional[float] = Query(None, description="FAISS weight (0-1)"),
):
    if BM25 is None or FAISS_RET is None:
        raise HTTPException(status_code=503, detail="Indexes not ready")
# Sensible defaults depending on language
if w_bm25 is None or w_faiss is None:
if _has_hangul(q):
w_bm25, w_faiss = 0.3, 0.7
else:
w_bm25, w_faiss = 0.6, 0.4
results = ensemble_search(q, k=k, w_bm25=w_bm25, w_faiss=w_faiss)
return SearchResponse(query=q, k=k, results=results)
# ------------- CLI entry (optional for quick test) -------------
if __name__ == "__main__":
    # Quick local test without uvicorn: python app_dict_search_fastapi.py (then manual queries)
    paths_env = os.getenv("DICT_PATHS")
    if paths_env:
        paths = [p.strip() for p in paths_env.split(",") if p.strip()]
    else:
        paths = DEFAULT_FILES
    build_indexes(paths, bm25_k=BM25_K, faiss_k=FAISS_K, emb_model=EMB_MODEL)
    print("Ready. Type your query (empty to exit).\n")
    while True:
        try:
            q = input("query> ").strip()
        except (EOFError, KeyboardInterrupt):
            break
        if not q:
            break
        res = ensemble_search(q, k=10)
        for i, r in enumerate(res, 1):
            print(f"[{i}] {r['word']} ({r['pos']}) | EN={r['definition_en'][:50]} | KO={r['definition_ko'][:50]}")