[Author:] sp

  • dict codes

    FastAPI server for BM25 + FAISS ensemble search over your dictionary

    —————————————————————

    Install deps:

pip install -U fastapi uvicorn langchain-community langchain-text-splitters \
    langchain-huggingface sentence-transformers faiss-cpu

    Optional (pretty printing/logging): pip install rich


    Run:

uvicorn app:app --host 0.0.0.0 --port 8001 --workers 1


    Env overrides (optional):

export DICT_PATHS="./dict_ko_2letter_20250813_181113.json,./completed_20250813_181113_dict_progress.json"

export EMB_MODEL="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

    export BM25_K=5

    export FAISS_K=10


    Notes:

- This server includes a substring booster for Hangul queries to improve recall.

    - Uses a multilingual embedding model by default for better Korean (KR) queries.
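
    Example requests (optional; actual results depend on the dictionaries you load):

    curl "http://localhost:8001/health"

    curl "http://localhost:8001/search?q=apple&k=5"

    curl --get "http://localhost:8001/search" --data-urlencode "q=사과" --data-urlencode "k=5"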

from __future__ import annotations

import os
import json
import re
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

from fastapi import FastAPI, Query, HTTPException
from pydantic import BaseModel, Field

from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

try:
    # Prefer the deprecation-safe embedding wrapper
    from langchain_huggingface import HuggingFaceEmbeddings as HFEmb
except Exception:
    from langchain_community.embeddings import HuggingFaceEmbeddings as HFEmb

# ---- Config ----

DEFAULT_FILES = [
    "dict_ko_2letter_20250813_181113.json",
    "completed_20250813_181113_dict_progress.json",
]
EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
BM25_K = int(os.getenv("BM25_K", 5))
FAISS_K = int(os.getenv("FAISS_K", 10))

# Globals (loaded at startup)

DOCS: List[Document] = []
BM25: Optional[BM25Retriever] = None
FAISS_RET = None
ENTRIES_COUNT = 0

# ---------- I/O: Load JSONs ----------

def _resolve_paths(paths: List[str]) -> List[Path]:
    out: List[Path] = []
    for p in paths:
        pp = Path(p)
        if not pp.exists():
            alt = Path("/mnt/data") / pp.name
            if alt.exists():
                pp = alt
        out.append(pp)
    return out


def load_entries(paths: List[Path]) -> List[Dict[str, Any]]:
    """Load entries from various JSON shapes and normalize to a common schema."""
    entries: List[Dict[str, Any]] = []

    def norm(w: Dict[str, Any]) -> Dict[str, Any]:
        if not isinstance(w, dict) or not w.get("word"):
            return {}
        return {
            "word": w.get("word", ""),
            "pos": w.get("pos", ""),
            "definition_en": w.get("definition_en", ""),
            "example_en": w.get("example_en", ""),
            "definition_target": w.get("definition_target", ""),
            "example_target": w.get("example_target", ""),
            "target_lang": w.get("target_lang", "") or w.get("targetLanguage", ""),
            "rarity": w.get("rarity", 3),
            "confidence": w.get("confidence", 0.0),
            "prefix": w.get("prefix", ""),
        }
    
    for p in paths:
        if not p.exists():
            continue
        try:
            data = json.loads(p.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"[WARN] Failed to read {p}: {e}")
            continue
    
        added = 0
        if isinstance(data, dict):
            if isinstance(data.get("words"), list):
                for w in data["words"]:
                    e = norm(w)
                    if e:
                        entries.append(e)
                        added += 1
            if isinstance(data.get("entries"), list):
                for w in data["entries"]:
                    e = norm(w)
                    if e:
                        entries.append(e)
                        added += 1
            if isinstance(data.get("documents"), list):
                # LangChain export style {"documents": [{page_content, metadata}, ...]}
                for d in data["documents"]:
                    if not isinstance(d, dict):
                        continue
                    meta = d.get("metadata") or {}
                    pc = (d.get("page_content") or "")
                    lines = [ln.strip() for ln in pc.splitlines() if ln.strip()]
                    bucket = {}
                    for ln in lines:
                        low = ln.lower()
                        if low.startswith("word:"):
                            bucket["word"] = ln.split(":", 1)[1].strip()
                        elif low.startswith("part of speech:") or low.startswith("pos:"):
                            bucket["pos"] = ln.split(":", 1)[1].strip()
                        elif low.startswith("english definition:") or low.startswith("en:"):
                            bucket["definition_en"] = ln.split(":", 1)[1].strip()
                        elif low.startswith("english example:") or low.startswith("ex:"):
                            bucket["example_en"] = ln.split(":", 1)[1].strip()
                        elif low.startswith("korean definition:") or low.startswith("ko:"):
                            bucket["definition_target"] = ln.split(":", 1)[1].strip()
                        elif low.startswith("korean example:") or low.startswith("ko_ex:"):
                            bucket["example_target"] = ln.split(":", 1)[1].strip()
                    e = {
                        "word": meta.get("word", "") or bucket.get("word", ""),
                        "pos": meta.get("pos", "") or bucket.get("pos", ""),
                        "definition_en": bucket.get("definition_en", ""),
                        "example_en": bucket.get("example_en", ""),
                        "definition_target": bucket.get("definition_target", ""),
                        "example_target": bucket.get("example_target", ""),
                        "target_lang": meta.get("target_language", meta.get("target_lang", "")) or "ko",
                        "rarity": meta.get("rarity", 3),
                        "confidence": meta.get("confidence", 0.0),
                        "prefix": meta.get("prefix", ""),
                    }
                    if e.get("word"):
                        entries.append(e)
                        added += 1
        elif isinstance(data, list):
            for w in data:
                e = norm(w)
                if e:
                    entries.append(e)
                    added += 1
        print(f"[LOAD] {p.name}: {added} entries")
    return entries
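
# Illustrative input shapes accepted by load_entries() above (hypothetical data, shown only to
# document the parser; they correspond to the "words"/"entries"/"documents" branches and the
# bare-list branch):
#
#   {"words":   [{"word": "apple", "pos": "noun", "definition_en": "...", "definition_target": "..."}]}
#   {"entries": [{"word": "...", "pos": "...", ...}]}
#   {"documents": [{"page_content": "Word: apple\nEnglish definition: ...", "metadata": {"word": "apple"}}]}
#   [{"word": "...", "pos": "...", ...}]        # a plain list of entry dicts also works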

# ---------- Convert to LangChain Documents ----------

def to_documents(entries: List[Dict[str, Any]]) -> List[Document]:
    docs: List[Document] = []
    for e in entries:
        word = (e.get("word") or "").strip()
        if not word:
            continue
        pos = e.get("pos", "")
        de = e.get("definition_en", "")
        ee = e.get("example_en", "")
        dt = e.get("definition_target", "")
        et = e.get("example_target", "")
        content_lines = [
            f"{word} ({pos})".strip(),
            f"EN: {de}".strip(),
            f"EX: {ee}".strip(),
        ]
        if dt:
            content_lines.append(f"KO: {dt}")
        if et:
            content_lines.append(f"KO_EX: {et}")
        page_content = "\n".join([ln for ln in content_lines if ln and ln != "()"])
        metadata = {
            "word": word,
            "pos": pos,
            "prefix": e.get("prefix", ""),
            "rarity": e.get("rarity", 3),
            "confidence": e.get("confidence", 0.0),
            "target_lang": e.get("target_lang", "") or "ko",
        }
        docs.append(Document(page_content=page_content, metadata=metadata))
    return docs
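
# Example page_content produced above for a hypothetical entry:
#
#   apple (noun)
#   EN: a round fruit with red or green skin
#   EX: She ate an apple.
#   KO: 사과
#   KO_EX: 그녀는 사과를 먹었다.
#
# _parse_fields() below relies on these "EN:" / "EX:" / "KO:" / "KO_EX:" prefixes.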

# ---------- Utilities ----------

def _has_hangul(text: str) -> bool:
    # U+3131 (Hangul Compatibility Jamo) through U+D7A3 (last Hangul syllable)
    return any('\u3131' <= ch <= '\ud7a3' for ch in text)


def _substring_fallback(query: str, docs: List[Document], k: int = 10) -> List[Document]:
    """Rank documents by raw substring counts of the query terms (Hangul recall booster)."""
    terms = [t for t in re.split(r"\s+", query.strip()) if t]
    if not terms:
        return []
    scored: List[Tuple[int, Document]] = []
    for d in docs:
        pc = d.page_content or ""
        score = sum(pc.count(t) for t in terms)
        if score > 0:
            scored.append((score, d))
    scored.sort(key=lambda x: x[0], reverse=True)
    return [d for _, d in scored[:k]]

def _set_k_if_possible(retriever, k: int):
    # BM25Retriever exposes a plain `.k`; vector-store retrievers use search_kwargs["k"].
    try:
        retriever.k = k
    except Exception:
        try:
            retriever.search_kwargs["k"] = k
        except Exception:
            pass


def _invoke_safe(retriever, q: str) -> List[Document]:
    # Newer LangChain retrievers use .invoke(); fall back to the legacy API if needed.
    try:
        return retriever.invoke(q)
    except Exception:
        return retriever.get_relevant_documents(q)

def _parse_fields(doc: Document) -> Dict[str, Any]:
    meta = doc.metadata or {}
    out = {
        "word": meta.get("word", ""),
        "pos": meta.get("pos", ""),
        "prefix": meta.get("prefix", ""),
        "confidence": meta.get("confidence", 0.0),
        "definition_en": "",
        "example_en": "",
        "definition_ko": "",
        "example_ko": "",
    }
    for ln in (doc.page_content or "").splitlines():
        if ln.startswith("EN:"):
            out["definition_en"] = ln[3:].strip()
        elif ln.startswith("EX:"):
            out["example_en"] = ln[3:].strip()
        elif ln.startswith("KO:"):
            out["definition_ko"] = ln[3:].strip()
        elif ln.startswith("KO_EX:"):
            out["example_ko"] = ln[6:].strip()
    return out

# ---------- Build Indexes ----------

def build_indexes(paths: List[str], bm25_k: int = BM25_K, faiss_k: int = FAISS_K, emb_model: str = EMB_MODEL):
    global DOCS, BM25, FAISS_RET, ENTRIES_COUNT
    files = _resolve_paths(paths)
    entries = load_entries(files)
    if not entries:
        raise RuntimeError("No entries loaded. Check DICT_PATHS or file paths.")
    DOCS = to_documents(entries)
    ENTRIES_COUNT = len(DOCS)

    # BM25
    BM25 = BM25Retriever.from_documents(DOCS)
    BM25.k = bm25_k
    
    # FAISS
    embed = HFEmb(model_name=emb_model)
    vs = FAISS.from_documents(DOCS, embed)
    FAISS_RET = vs.as_retriever(search_kwargs={"k": faiss_k})
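
    # Optional idea (not wired in here): for large dictionaries the FAISS index could be
    # persisted to skip re-embedding on every startup. A minimal sketch, assuming a local
    # "faiss_index/" directory and the same embedding model:
    #
    #   vs.save_local("faiss_index")
    #   vs = FAISS.load_local("faiss_index", embed, allow_dangerous_deserialization=True)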

# ---------- Search Logic (manual merge for dynamic weights) ----------

def ensemble_search(q: str, k: int = 10, w_bm25: float = 0.6, w_faiss: float = 0.4) -> List[Dict[str, Any]]:
    if BM25 is None or FAISS_RET is None:
        raise RuntimeError("Indexes not built yet")

    # Adjust per-query k
    _set_k_if_possible(BM25, max(k, BM25_K))
    _set_k_if_possible(FAISS_RET, max(k, FAISS_K))
    
    bm25_docs = _invoke_safe(BM25, q)
    faiss_docs = _invoke_safe(FAISS_RET, q)
    
    # Hangul-aware substring booster
    sub_docs: List[Document] = []
    sub_weight = 0.15
    if _has_hangul(q):
        sub_docs = _substring_fallback(q, DOCS, k=max(10, k))
        sub_weight = 0.5
    
    # Rank maps
    def rank_map(docs: List[Document]) -> Dict[str, int]:
        m = {}
        for i, d in enumerate(docs):
            w = (d.metadata or {}).get("word", "")
            if w and w not in m:
                m[w] = i
        return m
    
    r_bm = rank_map(bm25_docs)
    r_fa = rank_map(faiss_docs)
    
    # Substring counts for normalization
    sub_counts: Dict[str, int] = {}
    max_sub = 0
    if sub_docs:
        terms = [t for t in re.split(r"\s+", q.strip()) if t]
        for d in sub_docs:
            pc = d.page_content or ""
            cnt = sum(pc.count(t) for t in terms)
            w = (d.metadata or {}).get("word", "")
            if w:
                sub_counts[w] = cnt
                max_sub = max(max_sub, cnt)
    
    # Collect pool
    pool: Dict[str, Tuple[float, Document]] = {}
    for d in sub_docs + bm25_docs + faiss_docs:
        w = (d.metadata or {}).get("word", "")
        if not w:
            # fallback to first line
            w = (d.page_content or "").splitlines()[0].split(" ")[0]
        bm_score = w_bm25 * (1.0 / (1 + r_bm.get(w, 1_000_000_000)))
        fa_score = w_faiss * (1.0 / (1 + r_fa.get(w, 1_000_000_000)))
        sub_score = 0.0
        if max_sub > 0 and w in sub_counts:
            sub_score = sub_weight * (sub_counts[w] / max_sub)
        score = bm_score + fa_score + sub_score
        prev = pool.get(w)
        if (prev is None) or (score > prev[0]):
            pool[w] = (score, d)
    
    ordered = sorted(pool.values(), key=lambda x: x[0], reverse=True)
    top = [d for _, d in ordered[:k]]
    return [_parse_fields(d) for d in top]
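
# Merge scoring used above, for reference:
#
#   score(word) = w_bm25 / (1 + bm25_rank) + w_faiss / (1 + faiss_rank) + sub_weight * sub_count / max_sub
#
# with missing ranks defaulting to a huge value so the corresponding term is ~0. For example,
# with the default weights (0.6 / 0.4), a word ranked 0 by BM25 and 2 by FAISS and absent from
# the substring pool scores 0.6/1 + 0.4/3 ≈ 0.73.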

# ---------- API ----------

class SearchResponse(BaseModel):
    query: str
    k: int
    results: List[Dict[str, Any]]


class HealthResponse(BaseModel):
    status: str = Field("ok")
    entries: int
    bm25_k: int
    faiss_k: int
    emb_model: str


app = FastAPI(title="Dictionary Search API", version="1.0.0")

@app.on_event("startup")
def _startup():
    paths_env = os.getenv("DICT_PATHS")
    if paths_env:
        paths = [p.strip() for p in paths_env.split(",") if p.strip()]
    else:
        paths = DEFAULT_FILES
    print(f"[STARTUP] Loading files: {paths}")
    build_indexes(paths, bm25_k=BM25_K, faiss_k=FAISS_K, emb_model=EMB_MODEL)
    print(f"[READY] Loaded {ENTRIES_COUNT} entries. Model={EMB_MODEL}")


@app.get("/health", response_model=HealthResponse)
def health():
    return HealthResponse(entries=ENTRIES_COUNT, bm25_k=BM25_K, faiss_k=FAISS_K, emb_model=EMB_MODEL)

@app.get("/search", response_model=SearchResponse)
def search(
    q: str = Query(..., min_length=1, description="Search query (KR/EN supported)"),
    k: int = Query(10, ge=1, le=50),
    w_bm25: Optional[float] = Query(None, description="BM25 weight (0-1)"),
    w_faiss: Optional[float] = Query(None, description="FAISS weight (0-1)"),
):
    if BM25 is None or FAISS_RET is None:
        raise HTTPException(status_code=503, detail="Indexes not ready")

    # Sensible defaults depending on language
    if w_bm25 is None or w_faiss is None:
        if _has_hangul(q):
            w_bm25, w_faiss = 0.3, 0.7
        else:
            w_bm25, w_faiss = 0.6, 0.4
    
    results = ensemble_search(q, k=k, w_bm25=w_bm25, w_faiss=w_faiss)
    return SearchResponse(query=q, k=k, results=results)
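
# Illustrative /search response shape (field values are hypothetical):
#
#   {
#     "query": "사과",
#     "k": 5,
#     "results": [
#       {"word": "apple", "pos": "noun", "prefix": "", "confidence": 0.9,
#        "definition_en": "...", "example_en": "...",
#        "definition_ko": "...", "example_ko": "..."}
#     ]
#   }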

# ------------- CLI entry (optional for quick test) -------------

if __name__ == "__main__":
    # Quick local test without uvicorn: python app_dict_search_fastapi.py (then manual queries)
    paths_env = os.getenv("DICT_PATHS")
    if paths_env:
        paths = [p.strip() for p in paths_env.split(",") if p.strip()]
    else:
        paths = DEFAULT_FILES
    build_indexes(paths, bm25_k=BM25_K, faiss_k=FAISS_K, emb_model=EMB_MODEL)
    print("Ready. Type your query (empty to exit).\n")
    while True:
        try:
            q = input("query> ").strip()
        except (EOFError, KeyboardInterrupt):
            break
        if not q:
            break
        res = ensemble_search(q, k=10)
        for i, r in enumerate(res, 1):
            print(f"[{i}] {r['word']} ({r['pos']}) | EN={r['definition_en'][:50]} | KO={r['definition_ko'][:50]}")
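
# Note: the loop above is only an interactive sanity check. To serve the API directly from this
# script instead (assuming uvicorn is installed), something like the following would mirror the
# `uvicorn app:app ...` command from the header:
#
#   import uvicorn
#   uvicorn.run(app, host="0.0.0.0", port=8001)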