- HTTP는 libcurl, JSON은 nlohmann/json(헤더 온리)를 사용합니다.
- 시그널 처리(CTRL+C), 진행 상황 저장/복구, 접두사 기반 수집,
  Ollama /api/generate 호출·재시도, 통계 계산까지 그대로 구현했습니다.
빌드 예시
# (Ubuntu/Mac) libcurl과 nlohmann/json 설치 후 컴파일
# nlohmann/json은 단일 헤더만 있으면 됩니다. (/usr/include/nlohmann/json.hpp 경로 또는 로컬 포함)
g++ -std=c++17 -O2 robust_dictionary_generator.cpp -o robust_dict -lcurl
실행 예시
./robust_dict --mode 2letter --batch 35 --model gpt-oss:120b --resume
# 처음부터 새로: ./robust_dict --clean
// robust_dictionary_generator.cpp
// C++17 / libcurl / nlohmann::json 필요
// 기능: Ollama에 접두사별 단어 요청 -> 필터링 -> 진행상황 저장/재개 -> 최종 JSON 산출
#include <bits/stdc++.h>
#include <curl/curl.h>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace fs = std::filesystem;
using namespace std;
// Set when SIGINT/SIGTERM is received; polled by the collection loop so the
// program can checkpoint progress and exit cleanly.
static std::atomic<bool> g_shutdown{false};

// Async-signal-safe handler: only stores into the atomic flag (lock-free for
// bool on mainstream platforms). The previous version also wrote to std::cerr
// here, which is not async-signal-safe and can deadlock or corrupt stream
// state; the shutdown notice is printed by the main loop when it sees the flag.
extern "C" void signal_handler(int) {
g_shutdown.store(true);
}
// libcurl write callback: appends the received chunk to the std::string
// pointed to by `userp`. Must return the number of bytes consumed —
// returning anything else makes libcurl abort the transfer.
// (C-style casts replaced with static_cast; behavior unchanged.)
static size_t curlWriteCB(void* contents, size_t size, size_t nmemb, void* userp) {
const size_t total = size * nmemb;
static_cast<std::string*>(userp)->append(static_cast<const char*>(contents), total);
return total;
}
// Return a lowercased copy of `s` (byte-wise, C-locale tolower semantics).
static std::string to_lower_copy(std::string s) {
for (char& ch : s) {
ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
}
return s;
}
// Current local time formatted as ISO-8601 "YYYY-MM-DDTHH:MM:SS" (no
// timezone suffix). Uses the platform's thread-safe localtime variant.
static std::string now_iso8601() {
const std::time_t now =
std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
std::tm parts{};
#ifdef _WIN32
localtime_s(&parts, &now);
#else
localtime_r(&now, &parts);
#endif
char text[64];
std::strftime(text, sizeof(text), "%Y-%m-%dT%H:%M:%S", &parts);
return text;
}
// Current local time as a compact "YYYYMMDD_HHMMSS" string, suitable for
// embedding in file names.
static std::string timestamp_compact() {
const std::time_t now =
std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
std::tm parts{};
#ifdef _WIN32
localtime_s(&parts, &now);
#else
localtime_r(&now, &parts);
#endif
char text[64];
std::strftime(text, sizeof(text), "%Y%m%d_%H%M%S", &parts);
return text;
}
// One collected dictionary entry.
// NOTE: member order is load-bearing — this struct is aggregate-initialized
// positionally (word, prefix, collected_at, length) elsewhere in the file,
// so do not reorder fields.
struct WordData {
string word; // the word itself (stored lowercased by the parser)
string prefix; // prefix under which the word was collected
string collected_at; // ISO-8601 local timestamp of collection
int length{}; // cached word length in bytes
};
// Stage-1 collector: asks an Ollama server (/api/generate) for common English
// words per prefix, validates the responses, checkpoints progress to a JSON
// file so a run can be resumed after CTRL+C, and finally writes a
// deduplicated wordlist with basic statistics.
class RobustDictionaryGenerator {
public:
// --- configuration (public so main() can tweak fields after construction) ---
string model = "gpt-oss:120b";
string host = "localhost";
int port = 11434;
string base_url; // "http://<host>:<port>/api/generate"; built in the ctor
string progress_file = "collection_progress.json"; // checkpoint file name
int min_word_length = 3;
int max_word_length = 20;
vector<regex> invalid_patterns; // reject patterns used by is_valid_word()
regex english_regex; // accept pattern: plausible English word shapes
RobustDictionaryGenerator(const string& model_, const string& host_, int port_)
: model(model_), host(host_), port(port_),
english_regex(R"(^[a-z]+[a-z\'-]*[a-z]+$|^[a-z]{3,}$)", std::regex::icase)
{
base_url = "http://" + host + ":" + to_string(port) + "/api/generate";
invalid_patterns = {
regex(R"(^[a-z]$)", std::regex::icase), // single letter
regex(R"(^\d)", std::regex::icase), // starts with a digit
regex(R"([^\w\'-])", std::regex::icase), // contains a disallowed character
regex(R"(^(the|a|an|and|or|but|in|on|at|to|for|of|with|by)$)", std::regex::icase) // common stop words
};
}
// True when `word_raw` (case-insensitively) starts with `prefix`, has a
// length in [min_word_length, max_word_length], matches none of the reject
// patterns, and matches the accept regex.
bool is_valid_word(const string& word_raw, const string& prefix) {
if (word_raw.empty()) return false;
string word = to_lower_copy(word_raw);
if ((int)word.size() < min_word_length || (int)word.size() > max_word_length) return false;
string pl = to_lower_copy(prefix);
if (word.rfind(pl, 0) != 0) return false; // starts_with (C++17 lacks std::string::starts_with)
for (auto& pat : invalid_patterns) {
if (regex_search(word, pat)) return false;
}
if (!regex_match(word, english_regex)) return false;
return true;
}
// POST `prompt` to the Ollama /api/generate endpoint and return the model's
// text response, or "" on failure/shutdown. Retries up to `retries` times;
// the per-request timeout grows by 15s after each timeout.
string ask_ollama(const string& prompt, long timeout_sec = 45, int retries = 3) {
for (int attempt = 0; attempt < retries; ++attempt) {
if (g_shutdown.load()) return "";
CURL* curl = curl_easy_init();
if (!curl) {
cerr << "CURL 초기화 실패\n";
this_thread::sleep_for(chrono::seconds(2));
continue;
}
string response_str;
// Non-streaming request; low temperature for more deterministic lists.
string payload = json{
{"model", model},
{"prompt", prompt},
{"stream", false},
{"options", {{"temperature", 0.1}, {"top_p", 0.9}, {"repeat_penalty", 1.1}}}
}.dump();
struct curl_slist* headers = nullptr;
headers = curl_slist_append(headers, "Content-Type: application/json");
curl_easy_setopt(curl, CURLOPT_URL, base_url.c_str());
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, payload.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curlWriteCB);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_str);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout_sec);
CURLcode res = curl_easy_perform(curl);
long http_code = 0;
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
curl_slist_free_all(headers);
curl_easy_cleanup(curl);
if (res == CURLE_OPERATION_TIMEDOUT) {
cerr << "타임아웃 (시도 " << (attempt+1) << "/" << retries << ")\n";
if (attempt < retries - 1) {
this_thread::sleep_for(chrono::seconds(5));
timeout_sec += 15; // give the model more time on the next attempt
}
continue;
}
if (res != CURLE_OK) {
cerr << "연결/요청 오류 (시도 " << (attempt+1) << "/" << retries
<< "): " << curl_easy_strerror(res) << "\n";
if (attempt < retries - 1) this_thread::sleep_for(chrono::seconds(10));
continue;
}
if (http_code < 200 || http_code >= 300) {
cerr << "HTTP 코드 " << http_code << " (시도 " << (attempt+1) << "/" << retries << ")\n";
if (attempt < retries - 1) this_thread::sleep_for(chrono::seconds(3));
continue;
}
try {
auto j = json::parse(response_str);
if (j.contains("response") && j["response"].is_string())
return j["response"].get<string>();
// NOTE(review): a 2xx body without a "response" string falls through to
// the next attempt without any delay — confirm this is intentional.
} catch (const std::exception& e) {
cerr << "API 응답 파싱 오류: " << e.what() << "\n";
if (attempt < retries - 1) this_thread::sleep_for(chrono::seconds(3));
continue;
}
}
return "";
}
// Persist the current state to `progress_file`. The old file is renamed to
// "<progress_file>.backup" first and restored if writing the new one throws.
void save_progress(const vector<WordData>& collected_words,
const unordered_set<string>& completed_prefixes,
const unordered_set<string>& failed_prefixes) {
json progress;
progress["timestamp"] = now_iso8601();
progress["collected_words"] = json::array();
for (const auto& w : collected_words) {
progress["collected_words"].push_back({
{"word", w.word},
{"prefix", w.prefix},
{"collected_at", w.collected_at},
{"length", w.length}
});
}
progress["completed_prefixes"] = json::array();
for (auto& p : completed_prefixes) progress["completed_prefixes"].push_back(p);
progress["failed_prefixes"] = json::array();
for (auto& p : failed_prefixes) progress["failed_prefixes"].push_back(p);
progress["total_collected"] = (int)collected_words.size();
progress["model_used"] = model; // checked on resume to warn on model mismatch
string backup = progress_file + ".backup";
try {
if (fs::exists(progress_file)) {
error_code ec;
fs::rename(progress_file, backup, ec);
}
} catch (...) {} // best effort: a failed backup must not block saving
try {
ofstream ofs(progress_file);
ofs << setw(2) << progress;
ofs.close();
if (fs::exists(backup)) fs::remove(backup);
// NOTE(review): ofstream does not throw by default, so a failed write is
// not detected here and the backup is still deleted — consider checking
// ofs.good() before removing the backup.
} catch (const std::exception& e) {
cerr << "진행 상황 저장 실패: " << e.what() << "\n";
if (fs::exists(backup)) {
error_code ec;
fs::rename(backup, progress_file, ec); // roll back to the previous state
}
}
}
// Load the checkpoint JSON, or nullopt when it is missing/unreadable or the
// user declines (interactive prompt) to reuse data collected with a
// different model.
optional<json> load_progress() {
if (!fs::exists(progress_file)) return nullopt;
try {
ifstream ifs(progress_file);
json j; ifs >> j; ifs.close();
if (j.contains("model_used") && j["model_used"].is_string()) {
string prev_model = j["model_used"].get<string>();
if (prev_model != model) {
cerr << "경고: 다른 모델로 수집된 데이터입니다. (" << prev_model << " -> " << model << ")\n";
cerr << "계속하시겠습니까? (y/N): ";
string ans; getline(cin, ans);
if (to_lower_copy(ans) != "y") return nullopt;
}
}
return j;
} catch (const std::exception& e) {
cerr << "진행 상황 로드 실패: " << e.what() << "\n";
return nullopt;
}
}
// Build the prefix worklist. "2letter": every two-letter combination, with
// statistically common letter pairs first; "adaptive": single letters plus
// common bigrams and derivational prefixes. Any other mode yields an empty
// list.
vector<string> generate_prefixes(const string& mode) {
vector<string> prefixes;
if (mode == "2letter") {
// Frequency-ordered first/second letters so productive prefixes are
// attempted before the rare ones.
string common_first = "stpbcmdrhlfgwyvnkjqxz";
string common_second= "aeiouhrlnstmdcpgbykvwfjqxz";
for (char a : common_first)
for (char b : common_second)
prefixes.push_back(string(1,a) + string(1,b));
// Append whatever aa..zz combinations are still missing.
for (char a = 'a'; a <= 'z'; ++a) {
for (char b = 'a'; b <= 'z'; ++b) {
string combo{a,b};
if (find(prefixes.begin(), prefixes.end(), combo) == prefixes.end())
prefixes.push_back(combo);
}
}
} else if (mode == "adaptive") {
for (char c = 'a'; c <= 'z'; ++c) prefixes.push_back(string(1,c));
vector<string> common_combinations = {
"th","he","in","er","an","re","ed","nd","ou","ea","ti","to","it","st","io","le",
"is","on","al","ar","at","se","ng","me","de","of","te","en","ty","ch","co","di",
"ho","li","ma","ne","pe","ro","so","tr"
};
for (auto& c : common_combinations)
if (find(prefixes.begin(), prefixes.end(), c) == prefixes.end())
prefixes.push_back(c);
vector<string> common_prefixes = {"un","pre","dis","con","pro","anti","sub","inter"};
prefixes.insert(prefixes.end(), common_prefixes.begin(), common_prefixes.end());
}
return prefixes;
}
// Parse the raw model response (one candidate per line) into validated,
// de-duplicated WordData entries for `prefix`.
vector<WordData> parse_prefix_words(const string& response, const string& prefix) {
vector<WordData> out;
if (response.empty()) return out;
unordered_set<string> seen; // per-response dedup; global dedup happens later
istringstream iss(response);
string line;
regex leading(R"(^[\d\-\*\.\s]*)"); // list numbering / bullet symbols
regex after_nonword(R"([^A-Za-z'\-].*$)"); // everything after the word itself
while (getline(iss, line)) {
string s = line;
// strip leading numbering/bullet symbols
s = regex_replace(s, leading, "");
// normalize case
s = to_lower_copy(s);
// drop trailing explanations/punctuation
s = regex_replace(s, after_nonword, "");
// right-trim whitespace
while (!s.empty() && isspace((unsigned char)s.back())) s.pop_back();
if (s.empty()) continue;
if (seen.count(s)) continue;
if (!is_valid_word(s, prefix)) continue;
seen.insert(s);
out.push_back(WordData{
s,
prefix,
now_iso8601(),
(int)s.size()
});
}
return out;
}
// Ask the model for up to `batch_size` words starting with `prefix` and
// return the validated results (empty on shutdown or failure).
vector<WordData> collect_words_by_prefix(const string& prefix, int batch_size) {
ostringstream oss;
oss << "List exactly " << batch_size << " common English words that start with '" << prefix << "'.\n"
<< "Requirements:\n"
<< "- One word per line\n"
<< "- Only real English words\n"
<< "- No proper nouns or abbreviations\n"
<< "- Words must be at least 3 letters long\n"
<< "- No explanations or numbers\n\n"
<< "Example format:\n"
<< prefix << "ace\n"
<< prefix << "ample\n"
<< prefix << "ound\n";
cout << "'" << prefix << "' 접두사 단어 수집 중..." << flush;
if (g_shutdown.load()) { cout << " 0개\n"; return {}; }
string resp = ask_ollama(oss.str(), 35, 3);
auto words = parse_prefix_words(resp, prefix);
cout << " " << words.size() << "개" << endl;
return words;
}
// Deduplicate the collected words, compute statistics, write the final
// "robust_wordlist_<mode>_<timestamp>.json", archive the progress file, and
// print a summary. Returns the output filename ("" if nothing was collected).
string finalize_collection(const vector<WordData>& all_words, const string& mode) {
if (all_words.empty()) {
cout << "수집된 단어가 없습니다." << endl;
return "";
}
// keep the first occurrence of each word
unordered_map<string, WordData> uniq;
for (const auto& w : all_words) {
if (!uniq.count(w.word)) uniq[w.word] = w;
}
vector<WordData> final_words;
final_words.reserve(uniq.size());
for (auto& kv : uniq) final_words.push_back(kv.second);
sort(final_words.begin(), final_words.end(),
[](const WordData& a, const WordData& b){ return a.word < b.word; });
// statistics: length list, first-letter counts, per-prefix counts
vector<int> lens; lens.reserve(final_words.size());
map<char,int> letter_stats;
unordered_map<string,int> prefix_stats;
for (auto& w : final_words) {
lens.push_back(w.length);
if (!w.word.empty()) letter_stats[toupper((unsigned char)w.word[0])] += 1;
prefix_stats[w.prefix] += 1;
}
vector<pair<string,int>> prefix_sorted(prefix_stats.begin(), prefix_stats.end());
sort(prefix_sorted.begin(), prefix_sorted.end(),
[](auto& a, auto& b){ return a.second > b.second; });
json letter_dist = json::object();
for (auto& kv : letter_stats) letter_dist[string(1,kv.first)] = kv.second;
json top_prefixes = json::object();
for (size_t i=0; i<min<size_t>(10, prefix_sorted.size()); ++i)
top_prefixes[prefix_sorted[i].first] = prefix_sorted[i].second;
int minv = *min_element(lens.begin(), lens.end());
int maxv = *max_element(lens.begin(), lens.end());
double avgv = accumulate(lens.begin(), lens.end(), 0.0) / (double)lens.size();
string ts = timestamp_compact();
string filename = "robust_wordlist_" + mode + "_" + ts + ".json";
json out;
out["metadata"] = {
{"stage", 1},
{"type", "robust_wordlist"},
{"title", string("Robust English Wordlist (") + mode + " mode)"},
{"model_used", model},
{"collection_mode", mode},
{"total_words", (int)final_words.size()},
{"duplicates_removed", (int)all_words.size() - (int)final_words.size()},
{"statistics", {
{"letter_distribution", letter_dist},
{"word_length_stats", {
{"min", minv},
{"max", maxv},
{"avg", avgv}
}},
{"top_prefixes", top_prefixes}
}},
{"created_at", now_iso8601()}
};
out["words"] = json::array();
for (auto& w : final_words) {
out["words"].push_back({
{"word", w.word},
{"prefix", w.prefix},
{"collected_at", w.collected_at},
{"length", w.length}
});
}
ofstream ofs(filename);
ofs << setw(2) << out;
ofs.close();
// archive the checkpoint so a later run starts fresh
if (fs::exists(progress_file)) {
string completed = "completed_" + ts + "_" + progress_file;
error_code ec;
fs::rename(progress_file, completed, ec);
if (!ec) {
cout << "진행 상황 파일을 " << completed << " 로 이동했습니다." << endl;
}
}
// print summary
auto top_letter = max_element(letter_stats.begin(), letter_stats.end(),
[](auto& a, auto& b){ return a.second < b.second; });
cout << "\n수집 완료!\n";
cout << "파일: " << filename << "\n";
cout << "총 단어 수: " << final_words.size() << "개\n";
cout << "중복 제거: " << (all_words.size() - final_words.size()) << "개\n";
cout << "평균 단어 길이: " << fixed << setprecision(1) << avgv << "자\n";
if (top_letter != letter_stats.end()) {
cout << "가장 많은 글자: " << top_letter->first << " (" << top_letter->second << "개)\n";
}
return filename;
}
// Main worker loop: collect words for each prefix, print progress every 10
// prefixes, checkpoint every 25, then finalize. Stops early (after saving)
// when a shutdown signal was received.
string process_prefixes(const vector<string>& prefixes,
vector<WordData>& all_words,
unordered_set<string>& completed_prefixes,
unordered_set<string>& failed_prefixes,
int batch_size,
const string& mode,
size_t total_prefixes) {
size_t start_completed = completed_prefixes.size();
for (size_t i = 0; i < prefixes.size(); ++i) {
if (g_shutdown.load()) {
cout << "\n안전한 종료 요청됨. 진행 상황을 저장합니다..." << endl;
break;
}
const string& p = prefixes[i];
try {
auto words = collect_words_by_prefix(p, batch_size);
if (!words.empty()) {
all_words.insert(all_words.end(), words.begin(), words.end());
completed_prefixes.insert(p);
} else {
failed_prefixes.insert(p);
}
// NOTE(review): done_now counts failed prefixes as processed too, so the
// percentage tracks prefixes attempted, not prefixes succeeded.
size_t done_now = start_completed + (i+1);
if ((i+1) % 10 == 0) {
double avg = (done_now>0) ? (double)all_words.size() / (double)done_now : 0.0;
cout << "  진행률: " << done_now << "/" << total_prefixes
<< " (" << fixed << setprecision(1)
<< (100.0 * done_now / (double)total_prefixes) << "%)"
<< " - 총 " << all_words.size() << "개 단어"
<< " (평균 " << setprecision(1) << avg << "개/접두사)\n";
}
if ((i+1) % 25 == 0) {
save_progress(all_words, completed_prefixes, failed_prefixes);
}
this_thread::sleep_for(chrono::milliseconds(500)); // be gentle on the server
} catch (const std::exception& e) {
cerr << "오류 (" << p << "): " << e.what() << "\n";
failed_prefixes.insert(p);
}
}
save_progress(all_words, completed_prefixes, failed_prefixes);
return finalize_collection(all_words, mode);
}
// Resume from a saved checkpoint (after an interactive confirmation); falls
// back to a fresh run when no checkpoint exists or the user declines.
string resume_collection(const string& mode, int batch_size) {
cout << "=== 중단된 수집 재개 ===\n";
auto pj = load_progress();
if (!pj.has_value()) {
cout << "저장된 진행 상황을 찾을 수 없습니다. 새로 시작합니다.\n";
return collect_massive_wordlist(mode, batch_size, false);
}
auto& progress = *pj;
cout << "저장된 진행 상황 발견:\n";
cout << "  이전 수집 시간: " << progress["timestamp"].get<string>() << "\n";
cout << "  수집된 단어 수: " << progress["total_collected"].get<int>() << "개\n";
cout << "  완료된 접두사: " << progress["completed_prefixes"].size() << "개\n";
cout << "  실패한 접두사: " << progress["failed_prefixes"].size() << "개\n";
cout << "\n저장된 지점부터 재개하시겠습니까? (y/N): ";
string ans; getline(cin, ans);
if (to_lower_copy(ans) != "y") {
return collect_massive_wordlist(mode, batch_size, false);
}
vector<string> all_prefixes = generate_prefixes(mode);
unordered_set<string> completed_prefixes, failed_prefixes;
for (auto& p : progress["completed_prefixes"]) completed_prefixes.insert(p.get<string>());
for (auto& p : progress["failed_prefixes"]) failed_prefixes.insert(p.get<string>());
vector<WordData> all_words;
for (auto& w : progress["collected_words"]) {
all_words.push_back(WordData{
w["word"].get<string>(),
w["prefix"].get<string>(),
w["collected_at"].get<string>(),
w["length"].get<int>()
});
}
// Only prefixes not marked completed are attempted (failed ones retry).
vector<string> remaining;
remaining.reserve(all_prefixes.size());
for (auto& p : all_prefixes) if (!completed_prefixes.count(p)) remaining.push_back(p);
cout << "\n재개 정보:\n";
cout << "  남은 접두사: " << remaining.size() << "개\n";
cout << "  예상 추가 단어: " << (remaining.size() * (size_t)batch_size) << "개\n";
cout << string(50, '-') << "\n";
return process_prefixes(remaining, all_words, completed_prefixes, failed_prefixes,
batch_size, mode, all_prefixes.size());
}
// Entry point for stage 1. When `resume` is true and a checkpoint file
// exists, delegates to resume_collection(); otherwise starts from scratch.
string collect_massive_wordlist(const string& mode, int batch_size, bool resume=true) {
if (resume && fs::exists(progress_file)) {
return resume_collection(mode, batch_size);
}
cout << "=== 새로운 대용량 영어 단어 수집 시작 ===\n";
cout << "모드: " << mode << "\n";
cout << "배치 크기: " << batch_size << "\n";
cout << "모델: " << model << "\n";
cout << string(50, '-') << "\n";
auto prefixes = generate_prefixes(mode);
cout << "총 접두사 수: " << prefixes.size() << "\n";
cout << "예상 최대 단어 수: " << (size_t)prefixes.size() * (size_t)batch_size << "\n";
vector<WordData> all_words;
unordered_set<string> completed_prefixes, failed_prefixes;
return process_prefixes(prefixes, all_words, completed_prefixes, failed_prefixes,
batch_size, mode, prefixes.size());
}
};
// Command-line options with their defaults. Field names are part of the
// interface (read by main()); do not rename.
struct Args {
int stage = 1;                      // pipeline stage (only 1 is supported)
std::string model = "gpt-oss:120b"; // Ollama model name
std::string mode = "2letter";       // prefix strategy: "2letter" or "adaptive"
int batch = 35;                     // words requested per prefix
bool resume = false;                // accepted for compatibility (resume is auto-detected)
bool clean = false;                 // delete the progress file and start over
int min_len = 3;                    // minimum accepted word length
int max_len = 20;                   // maximum accepted word length
std::string host = "localhost";     // Ollama host
int port = 11434;                   // Ollama port
};
// Print the usage text to stdout.
static void print_help() {
std::cout << "사용법: robust_dict [옵션]\n"
<< "  --stage <int>            (지원: 1)\n"
<< "  --model <name>           (기본: gpt-oss:120b)\n"
<< "  --mode <2letter|adaptive>(기본: 2letter)\n"
<< "  --batch <int>            (기본: 35)\n"
<< "  --resume                 (중단 지점부터 재개)\n"
<< "  --clean                  (진행 파일 삭제 후 새로 시작)\n"
<< "  --min-length <int>       (기본: 3)\n"
<< "  --max-length <int>       (기본: 20)\n"
<< "  --host <str>             (기본: localhost)\n"
<< "  --port <int>             (기본: 11434)\n";
}
// Parse the text of a numeric option into `out`. Returns false (after
// printing an error) when the text is not a valid integer. Replaces the
// previous bare std::stoi calls, which threw an uncaught exception and
// crashed the program on input like "--port abc".
static bool parse_int_opt(const char* text, const std::string& opt, int& out) {
try {
std::size_t pos = 0;
const int v = std::stoi(text, &pos);
if (pos != std::string(text).size()) throw std::invalid_argument("trailing characters");
out = v;
return true;
} catch (const std::exception&) {
std::cerr << "옵션 " << opt << " 값이 올바른 정수가 아닙니다: " << text << "\n";
return false;
}
}
// Fill `a` from argv. Returns false on any unknown option or invalid numeric
// value (an error and/or usage text has already been printed). "--help"/"-h"
// prints usage and exits the process with status 0, as before.
static bool parse_args(int argc, char** argv, Args& a) {
for (int i=1; i<argc; ++i) {
const std::string k = argv[i];
// true when `left` more value arguments are available after position i
auto need = [&](int left){ return i+left < argc; };
if (k == "--help" || k == "-h") { print_help(); std::exit(0); }
else if (k == "--stage" && need(1)) { if (!parse_int_opt(argv[++i], k, a.stage)) return false; }
else if (k == "--model" && need(1)) { a.model = argv[++i]; }
else if (k == "--mode" && need(1)) { a.mode = argv[++i]; }
else if (k == "--batch" && need(1)) { if (!parse_int_opt(argv[++i], k, a.batch)) return false; }
else if (k == "--resume") { a.resume = true; }
else if (k == "--clean") { a.clean = true; }
else if (k == "--min-length" && need(1)) { if (!parse_int_opt(argv[++i], k, a.min_len)) return false; }
else if (k == "--max-length" && need(1)) { if (!parse_int_opt(argv[++i], k, a.max_len)) return false; }
else if (k == "--host" && need(1)) { a.host = argv[++i]; }
else if (k == "--port" && need(1)) { if (!parse_int_opt(argv[++i], k, a.port)) return false; }
else {
// also reached when an option that needs a value is last on the line
std::cerr << "알 수 없는 옵션: " << k << "\n";
print_help();
return false;
}
}
return true;
}
// Entry point: installs signal handlers, parses CLI arguments, validates
// them, then runs the stage-1 word collection. Returns 0 on success, 1 on
// argument/configuration errors or an unhandled exception from the collector.
int main(int argc, char** argv) {
// CTRL+C / SIGTERM request a graceful shutdown; the collector polls
// g_shutdown and checkpoints progress before exiting.
signal(SIGINT, signal_handler);
#ifdef SIGTERM
signal(SIGTERM, signal_handler);
#endif
Args args;
if (!parse_args(argc, argv, args)) return 1;
if (args.stage != 1) {
cerr << "현재는 stage 1만 지원합니다.\n";
return 1;
}
// Sanity-check numeric options (previously unvalidated: --batch 0 would
// silently collect nothing per prefix, and min > max rejected every word).
if (args.batch < 1) {
cerr << "--batch 값은 1 이상이어야 합니다.\n";
return 1;
}
if (args.min_len < 1 || args.max_len < args.min_len) {
cerr << "--min-length/--max-length 값이 올바르지 않습니다.\n";
return 1;
}
if (args.clean && fs::exists("collection_progress.json")) {
fs::remove("collection_progress.json");
cout << "진행 상황 파일을 삭제했습니다. 새로 시작합니다.\n";
}
curl_global_init(CURL_GLOBAL_DEFAULT);
RobustDictionaryGenerator gen(args.model, args.host, args.port);
gen.min_word_length = args.min_len;
gen.max_word_length = args.max_len;
int rc = 0;
try {
// Resume is auto-detected: a progress file triggers the resume path
// unless --clean was given, so the --resume flag is effectively implicit.
string result_file = gen.collect_massive_wordlist(args.mode, args.batch, !args.clean);
if (!result_file.empty()) {
cout << "\n다음 단계는 기존 011.py를 사용하세요:\n";
cout << "python3 011.py --stage 2 --file " << result_file << "\n";
}
} catch (const std::exception& e) {
cerr << "프로그램 오류: " << e.what() << "\n";
rc = 1;
}
// Single cleanup path for both success and failure (was duplicated before).
curl_global_cleanup();
return rc;
}
필요시, Windows에선 vcpkg 등으로 curl과 nlohmann-json을 설치한 뒤 동일하게 빌드할 수 있습니다.
답글 남기기