- HTTP는 libcurl, JSON은 nlohmann/json(헤더 온리)를 사용합니다.
- 시그널 처리(CTRL+C), 진행 상황 저장/복구, 접두사 기반 수집,
  Ollama /api/generate 호출·재시도, 통계 계산까지 그대로 구현했습니다.
빌드 예시
# (Ubuntu/Mac) libcurl과 nlohmann/json 설치 후 컴파일
# nlohmann/json은 단일 헤더만 있으면 됩니다. (/usr/include/nlohmann/json.hpp 경로 또는 로컬 포함)
g++ -std=c++17 -O2 robust_dictionary_generator.cpp -o robust_dict -lcurl
실행 예시
./robust_dict --mode 2letter --batch 35 --model gpt-oss:120b --resume
# 처음부터 새로: ./robust_dict --clean
// robust_dictionary_generator.cpp
// C++17 / libcurl / nlohmann::json 필요
// 기능: Ollama에 접두사별 단어 요청 -> 필터링 -> 진행상황 저장/재개 -> 최종 JSON 산출
#include <bits/stdc++.h>
#include <curl/curl.h>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace fs = std::filesystem;
using namespace std;
// Set when SIGINT/SIGTERM is received; polled by the collection loop so the
// program can checkpoint progress and exit cleanly.
static std::atomic<bool> g_shutdown{false};

// Async-signal-safe handler: only stores into the atomic flag (lock-free for
// bool on mainstream platforms). The previous version also wrote to std::cerr
// here, which is not async-signal-safe and can deadlock or corrupt stream
// state; the shutdown notice is printed by the main loop when it sees the flag.
extern "C" void signal_handler(int) {
g_shutdown.store(true);
}
// libcurl write callback: appends the received chunk to the std::string
// pointed to by `userp`. Must return the number of bytes consumed —
// returning anything else makes libcurl abort the transfer.
// (C-style casts replaced with static_cast; behavior unchanged.)
static size_t curlWriteCB(void* contents, size_t size, size_t nmemb, void* userp) {
const size_t total = size * nmemb;
static_cast<std::string*>(userp)->append(static_cast<const char*>(contents), total);
return total;
}
// Return a lowercased copy of `s` (byte-wise, C-locale tolower semantics).
static std::string to_lower_copy(std::string s) {
for (char& ch : s) {
ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
}
return s;
}
// Current local time formatted as ISO-8601 "YYYY-MM-DDTHH:MM:SS" (no
// timezone suffix). Uses the platform's thread-safe localtime variant.
static std::string now_iso8601() {
const std::time_t now =
std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
std::tm parts{};
#ifdef _WIN32
localtime_s(&parts, &now);
#else
localtime_r(&now, &parts);
#endif
char text[64];
std::strftime(text, sizeof(text), "%Y-%m-%dT%H:%M:%S", &parts);
return text;
}
// Current local time as a compact "YYYYMMDD_HHMMSS" string, suitable for
// embedding in file names.
static std::string timestamp_compact() {
const std::time_t now =
std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
std::tm parts{};
#ifdef _WIN32
localtime_s(&parts, &now);
#else
localtime_r(&now, &parts);
#endif
char text[64];
std::strftime(text, sizeof(text), "%Y%m%d_%H%M%S", &parts);
return text;
}
// One collected dictionary entry.
// NOTE: member order is load-bearing — this struct is aggregate-initialized
// positionally (word, prefix, collected_at, length) elsewhere in the file,
// so do not reorder fields.
struct WordData {
string word; // the word itself (stored lowercased by the parser)
string prefix; // prefix under which the word was collected
string collected_at; // ISO-8601 local timestamp of collection
int length{}; // cached word length in bytes
};
// Stage-1 collector: asks an Ollama server (/api/generate) for common English
// words per prefix, validates the responses, checkpoints progress to a JSON
// file so a run can be resumed after CTRL+C, and finally writes a
// deduplicated wordlist with basic statistics.
class RobustDictionaryGenerator {
public:
// --- configuration (public so main() can tweak fields after construction) ---
string model = "gpt-oss:120b";
string host = "localhost";
int port = 11434;
string base_url; // "http://<host>:<port>/api/generate"; built in the ctor
string progress_file = "collection_progress.json"; // checkpoint file name
int min_word_length = 3;
int max_word_length = 20;
vector<regex> invalid_patterns; // reject patterns used by is_valid_word()
regex english_regex; // accept pattern: plausible English word shapes
RobustDictionaryGenerator(const string& model_, const string& host_, int port_)
: model(model_), host(host_), port(port_),
english_regex(R"(^[a-z]+[a-z\'-]*[a-z]+$|^[a-z]{3,}$)", std::regex::icase)
{
base_url = "http://" + host + ":" + to_string(port) + "/api/generate";
invalid_patterns = {
regex(R"(^[a-z]$)", std::regex::icase), // single letter
regex(R"(^\d)", std::regex::icase), // starts with a digit
regex(R"([^\w\'-])", std::regex::icase), // contains a disallowed character
regex(R"(^(the|a|an|and|or|but|in|on|at|to|for|of|with|by)$)", std::regex::icase) // common stop words
};
}
// True when `word_raw` (case-insensitively) starts with `prefix`, has a
// length in [min_word_length, max_word_length], matches none of the reject
// patterns, and matches the accept regex.
bool is_valid_word(const string& word_raw, const string& prefix) {
if (word_raw.empty()) return false;
string word = to_lower_copy(word_raw);
if ((int)word.size() < min_word_length || (int)word.size() > max_word_length) return false;
string pl = to_lower_copy(prefix);
if (word.rfind(pl, 0) != 0) return false; // starts_with (C++17 lacks std::string::starts_with)
for (auto& pat : invalid_patterns) {
if (regex_search(word, pat)) return false;
}
if (!regex_match(word, english_regex)) return false;
return true;
}
// POST `prompt` to the Ollama /api/generate endpoint and return the model's
// text response, or "" on failure/shutdown. Retries up to `retries` times;
// the per-request timeout grows by 15s after each timeout.
string ask_ollama(const string& prompt, long timeout_sec = 45, int retries = 3) {
for (int attempt = 0; attempt < retries; ++attempt) {
if (g_shutdown.load()) return "";
CURL* curl = curl_easy_init();
if (!curl) {
cerr << "CURL 초기화 실패\n";
this_thread::sleep_for(chrono::seconds(2));
continue;
}
string response_str;
// Non-streaming request; low temperature for more deterministic lists.
string payload = json{
{"model", model},
{"prompt", prompt},
{"stream", false},
{"options", {{"temperature", 0.1}, {"top_p", 0.9}, {"repeat_penalty", 1.1}}}
}.dump();
struct curl_slist* headers = nullptr;
headers = curl_slist_append(headers, "Content-Type: application/json");
curl_easy_setopt(curl, CURLOPT_URL, base_url.c_str());
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, payload.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curlWriteCB);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_str);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout_sec);
CURLcode res = curl_easy_perform(curl);
long http_code = 0;
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
curl_slist_free_all(headers);
curl_easy_cleanup(curl);
if (res == CURLE_OPERATION_TIMEDOUT) {
cerr << "타임아웃 (시도 " << (attempt+1) << "/" << retries << ")\n";
if (attempt < retries - 1) {
this_thread::sleep_for(chrono::seconds(5));
timeout_sec += 15; // give the model more time on the next attempt
}
continue;
}
if (res != CURLE_OK) {
cerr << "연결/요청 오류 (시도 " << (attempt+1) << "/" << retries
<< "): " << curl_easy_strerror(res) << "\n";
if (attempt < retries - 1) this_thread::sleep_for(chrono::seconds(10));
continue;
}
if (http_code < 200 || http_code >= 300) {
cerr << "HTTP 코드 " << http_code << " (시도 " << (attempt+1) << "/" << retries << ")\n";
if (attempt < retries - 1) this_thread::sleep_for(chrono::seconds(3));
continue;
}
try {
auto j = json::parse(response_str);
if (j.contains("response") && j["response"].is_string())
return j["response"].get<string>();
// NOTE(review): a 2xx body without a "response" string falls through to
// the next attempt without any delay — confirm this is intentional.
} catch (const std::exception& e) {
cerr << "API 응답 파싱 오류: " << e.what() << "\n";
if (attempt < retries - 1) this_thread::sleep_for(chrono::seconds(3));
continue;
}
}
return "";
}
// Persist the current state to `progress_file`. The old file is renamed to
// "<progress_file>.backup" first and restored if writing the new one throws.
void save_progress(const vector<WordData>& collected_words,
const unordered_set<string>& completed_prefixes,
const unordered_set<string>& failed_prefixes) {
json progress;
progress["timestamp"] = now_iso8601();
progress["collected_words"] = json::array();
for (const auto& w : collected_words) {
progress["collected_words"].push_back({
{"word", w.word},
{"prefix", w.prefix},
{"collected_at", w.collected_at},
{"length", w.length}
});
}
progress["completed_prefixes"] = json::array();
for (auto& p : completed_prefixes) progress["completed_prefixes"].push_back(p);
progress["failed_prefixes"] = json::array();
for (auto& p : failed_prefixes) progress["failed_prefixes"].push_back(p);
progress["total_collected"] = (int)collected_words.size();
progress["model_used"] = model; // checked on resume to warn on model mismatch
string backup = progress_file + ".backup";
try {
if (fs::exists(progress_file)) {
error_code ec;
fs::rename(progress_file, backup, ec);
}
} catch (...) {} // best effort: a failed backup must not block saving
try {
ofstream ofs(progress_file);
ofs << setw(2) << progress;
ofs.close();
if (fs::exists(backup)) fs::remove(backup);
// NOTE(review): ofstream does not throw by default, so a failed write is
// not detected here and the backup is still deleted — consider checking
// ofs.good() before removing the backup.
} catch (const std::exception& e) {
cerr << "진행 상황 저장 실패: " << e.what() << "\n";
if (fs::exists(backup)) {
error_code ec;
fs::rename(backup, progress_file, ec); // roll back to the previous state
}
}
}
// Load the checkpoint JSON, or nullopt when it is missing/unreadable or the
// user declines (interactive prompt) to reuse data collected with a
// different model.
optional<json> load_progress() {
if (!fs::exists(progress_file)) return nullopt;
try {
ifstream ifs(progress_file);
json j; ifs >> j; ifs.close();
if (j.contains("model_used") && j["model_used"].is_string()) {
string prev_model = j["model_used"].get<string>();
if (prev_model != model) {
cerr << "경고: 다른 모델로 수집된 데이터입니다. (" << prev_model << " -> " << model << ")\n";
cerr << "계속하시겠습니까? (y/N): ";
string ans; getline(cin, ans);
if (to_lower_copy(ans) != "y") return nullopt;
}
}
return j;
} catch (const std::exception& e) {
cerr << "진행 상황 로드 실패: " << e.what() << "\n";
return nullopt;
}
}
// Build the prefix worklist. "2letter": every two-letter combination, with
// statistically common letter pairs first; "adaptive": single letters plus
// common bigrams and derivational prefixes. Any other mode yields an empty
// list.
vector<string> generate_prefixes(const string& mode) {
vector<string> prefixes;
if (mode == "2letter") {
// Frequency-ordered first/second letters so productive prefixes are
// attempted before the rare ones.
string common_first = "stpbcmdrhlfgwyvnkjqxz";
string common_second= "aeiouhrlnstmdcpgbykvwfjqxz";
for (char a : common_first)
for (char b : common_second)
prefixes.push_back(string(1,a) + string(1,b));
// Append whatever aa..zz combinations are still missing.
for (char a = 'a'; a <= 'z'; ++a) {
for (char b = 'a'; b <= 'z'; ++b) {
string combo{a,b};
if (find(prefixes.begin(), prefixes.end(), combo) == prefixes.end())
prefixes.push_back(combo);
}
}
} else if (mode == "adaptive") {
for (char c = 'a'; c <= 'z'; ++c) prefixes.push_back(string(1,c));
vector<string> common_combinations = {
"th","he","in","er","an","re","ed","nd","ou","ea","ti","to","it","st","io","le",
"is","on","al","ar","at","se","ng","me","de","of","te","en","ty","ch","co","di",
"ho","li","ma","ne","pe","ro","so","tr"
};
for (auto& c : common_combinations)
if (find(prefixes.begin(), prefixes.end(), c) == prefixes.end())
prefixes.push_back(c);
vector<string> common_prefixes = {"un","pre","dis","con","pro","anti","sub","inter"};
prefixes.insert(prefixes.end(), common_prefixes.begin(), common_prefixes.end());
}
return prefixes;
}
// Parse the raw model response (one candidate per line) into validated,
// de-duplicated WordData entries for `prefix`.
vector<WordData> parse_prefix_words(const string& response, const string& prefix) {
vector<WordData> out;
if (response.empty()) return out;
unordered_set<string> seen; // per-response dedup; global dedup happens later
istringstream iss(response);
string line;
regex leading(R"(^[\d\-\*\.\s]*)"); // list numbering / bullet symbols
regex after_nonword(R"([^A-Za-z'\-].*$)"); // everything after the word itself
while (getline(iss, line)) {
string s = line;
// strip leading numbering/bullet symbols
s = regex_replace(s, leading, "");
// normalize case
s = to_lower_copy(s);
// drop trailing explanations/punctuation
s = regex_replace(s, after_nonword, "");
// right-trim whitespace
while (!s.empty() && isspace((unsigned char)s.back())) s.pop_back();
if (s.empty()) continue;
if (seen.count(s)) continue;
if (!is_valid_word(s, prefix)) continue;
seen.insert(s);
out.push_back(WordData{
s,
prefix,
now_iso8601(),
(int)s.size()
});
}
return out;
}
// Ask the model for up to `batch_size` words starting with `prefix` and
// return the validated results (empty on shutdown or failure).
vector<WordData> collect_words_by_prefix(const string& prefix, int batch_size) {
ostringstream oss;
oss << "List exactly " << batch_size << " common English words that start with '" << prefix << "'.\n"
<< "Requirements:\n"
<< "- One word per line\n"
<< "- Only real English words\n"
<< "- No proper nouns or abbreviations\n"
<< "- Words must be at least 3 letters long\n"
<< "- No explanations or numbers\n\n"
<< "Example format:\n"
<< prefix << "ace\n"
<< prefix << "ample\n"
<< prefix << "ound\n";
cout << "'" << prefix << "' 접두사 단어 수집 중..." << flush;
if (g_shutdown.load()) { cout << " 0개\n"; return {}; }
string resp = ask_ollama(oss.str(), 35, 3);
auto words = parse_prefix_words(resp, prefix);
cout << " " << words.size() << "개" << endl;
return words;
}
// Deduplicate the collected words, compute statistics, write the final
// "robust_wordlist_<mode>_<timestamp>.json", archive the progress file, and
// print a summary. Returns the output filename ("" if nothing was collected).
string finalize_collection(const vector<WordData>& all_words, const string& mode) {
if (all_words.empty()) {
cout << "수집된 단어가 없습니다." << endl;
return "";
}
// keep the first occurrence of each word
unordered_map<string, WordData> uniq;
for (const auto& w : all_words) {
if (!uniq.count(w.word)) uniq[w.word] = w;
}
vector<WordData> final_words;
final_words.reserve(uniq.size());
for (auto& kv : uniq) final_words.push_back(kv.second);
sort(final_words.begin(), final_words.end(),
[](const WordData& a, const WordData& b){ return a.word < b.word; });
// statistics: length list, first-letter counts, per-prefix counts
vector<int> lens; lens.reserve(final_words.size());
map<char,int> letter_stats;
unordered_map<string,int> prefix_stats;
for (auto& w : final_words) {
lens.push_back(w.length);
if (!w.word.empty()) letter_stats[toupper((unsigned char)w.word[0])] += 1;
prefix_stats[w.prefix] += 1;
}
vector<pair<string,int>> prefix_sorted(prefix_stats.begin(), prefix_stats.end());
sort(prefix_sorted.begin(), prefix_sorted.end(),
[](auto& a, auto& b){ return a.second > b.second; });
json letter_dist = json::object();
for (auto& kv : letter_stats) letter_dist[string(1,kv.first)] = kv.second;
json top_prefixes = json::object();
for (size_t i=0; i<min<size_t>(10, prefix_sorted.size()); ++i)
top_prefixes[prefix_sorted[i].first] = prefix_sorted[i].second;
int minv = *min_element(lens.begin(), lens.end());
int maxv = *max_element(lens.begin(), lens.end());
double avgv = accumulate(lens.begin(), lens.end(), 0.0) / (double)lens.size();
string ts = timestamp_compact();
string filename = "robust_wordlist_" + mode + "_" + ts + ".json";
json out;
out["metadata"] = {
{"stage", 1},
{"type", "robust_wordlist"},
{"title", string("Robust English Wordlist (") + mode + " mode)"},
{"model_used", model},
{"collection_mode", mode},
{"total_words", (int)final_words.size()},
{"duplicates_removed", (int)all_words.size() - (int)final_words.size()},
{"statistics", {
{"letter_distribution", letter_dist},
{"word_length_stats", {
{"min", minv},
{"max", maxv},
{"avg", avgv}
}},
{"top_prefixes", top_prefixes}
}},
{"created_at", now_iso8601()}
};
out["words"] = json::array();
for (auto& w : final_words) {
out["words"].push_back({
{"word", w.word},
{"prefix", w.prefix},
{"collected_at", w.collected_at},
{"length", w.length}
});
}
ofstream ofs(filename);
ofs << setw(2) << out;
ofs.close();
// archive the checkpoint so a later run starts fresh
if (fs::exists(progress_file)) {
string completed = "completed_" + ts + "_" + progress_file;
error_code ec;
fs::rename(progress_file, completed, ec);
if (!ec) {
cout << "진행 상황 파일을 " << completed << " 로 이동했습니다." << endl;
}
}
// print summary
auto top_letter = max_element(letter_stats.begin(), letter_stats.end(),
[](auto& a, auto& b){ return a.second < b.second; });
cout << "\n수집 완료!\n";
cout << "파일: " << filename << "\n";
cout << "총 단어 수: " << final_words.size() << "개\n";
cout << "중복 제거: " << (all_words.size() - final_words.size()) << "개\n";
cout << "평균 단어 길이: " << fixed << setprecision(1) << avgv << "자\n";
if (top_letter != letter_stats.end()) {
cout << "가장 많은 글자: " << top_letter->first << " (" << top_letter->second << "개)\n";
}
return filename;
}
// Main worker loop: collect words for each prefix, print progress every 10
// prefixes, checkpoint every 25, then finalize. Stops early (after saving)
// when a shutdown signal was received.
string process_prefixes(const vector<string>& prefixes,
vector<WordData>& all_words,
unordered_set<string>& completed_prefixes,
unordered_set<string>& failed_prefixes,
int batch_size,
const string& mode,
size_t total_prefixes) {
size_t start_completed = completed_prefixes.size();
for (size_t i = 0; i < prefixes.size(); ++i) {
if (g_shutdown.load()) {
cout << "\n안전한 종료 요청됨. 진행 상황을 저장합니다..." << endl;
break;
}
const string& p = prefixes[i];
try {
auto words = collect_words_by_prefix(p, batch_size);
if (!words.empty()) {
all_words.insert(all_words.end(), words.begin(), words.end());
completed_prefixes.insert(p);
} else {
failed_prefixes.insert(p);
}
// NOTE(review): done_now counts failed prefixes as processed too, so the
// percentage tracks prefixes attempted, not prefixes succeeded.
size_t done_now = start_completed + (i+1);
if ((i+1) % 10 == 0) {
double avg = (done_now>0) ? (double)all_words.size() / (double)done_now : 0.0;
cout << "  진행률: " << done_now << "/" << total_prefixes
<< " (" << fixed << setprecision(1)
<< (100.0 * done_now / (double)total_prefixes) << "%)"
<< " - 총 " << all_words.size() << "개 단어"
<< " (평균 " << setprecision(1) << avg << "개/접두사)\n";
}
if ((i+1) % 25 == 0) {
save_progress(all_words, completed_prefixes, failed_prefixes);
}
this_thread::sleep_for(chrono::milliseconds(500)); // be gentle on the server
} catch (const std::exception& e) {
cerr << "오류 (" << p << "): " << e.what() << "\n";
failed_prefixes.insert(p);
}
}
save_progress(all_words, completed_prefixes, failed_prefixes);
return finalize_collection(all_words, mode);
}
// Resume from a saved checkpoint (after an interactive confirmation); falls
// back to a fresh run when no checkpoint exists or the user declines.
string resume_collection(const string& mode, int batch_size) {
cout << "=== 중단된 수집 재개 ===\n";
auto pj = load_progress();
if (!pj.has_value()) {
cout << "저장된 진행 상황을 찾을 수 없습니다. 새로 시작합니다.\n";
return collect_massive_wordlist(mode, batch_size, false);
}
auto& progress = *pj;
cout << "저장된 진행 상황 발견:\n";
cout << "  이전 수집 시간: " << progress["timestamp"].get<string>() << "\n";
cout << "  수집된 단어 수: " << progress["total_collected"].get<int>() << "개\n";
cout << "  완료된 접두사: " << progress["completed_prefixes"].size() << "개\n";
cout << "  실패한 접두사: " << progress["failed_prefixes"].size() << "개\n";
cout << "\n저장된 지점부터 재개하시겠습니까? (y/N): ";
string ans; getline(cin, ans);
if (to_lower_copy(ans) != "y") {
return collect_massive_wordlist(mode, batch_size, false);
}
vector<string> all_prefixes = generate_prefixes(mode);
unordered_set<string> completed_prefixes, failed_prefixes;
for (auto& p : progress["completed_prefixes"]) completed_prefixes.insert(p.get<string>());
for (auto& p : progress["failed_prefixes"]) failed_prefixes.insert(p.get<string>());
vector<WordData> all_words;
for (auto& w : progress["collected_words"]) {
all_words.push_back(WordData{
w["word"].get<string>(),
w["prefix"].get<string>(),
w["collected_at"].get<string>(),
w["length"].get<int>()
});
}
// Only prefixes not marked completed are attempted (failed ones retry).
vector<string> remaining;
remaining.reserve(all_prefixes.size());
for (auto& p : all_prefixes) if (!completed_prefixes.count(p)) remaining.push_back(p);
cout << "\n재개 정보:\n";
cout << "  남은 접두사: " << remaining.size() << "개\n";
cout << "  예상 추가 단어: " << (remaining.size() * (size_t)batch_size) << "개\n";
cout << string(50, '-') << "\n";
return process_prefixes(remaining, all_words, completed_prefixes, failed_prefixes,
batch_size, mode, all_prefixes.size());
}
// Entry point for stage 1. When `resume` is true and a checkpoint file
// exists, delegates to resume_collection(); otherwise starts from scratch.
string collect_massive_wordlist(const string& mode, int batch_size, bool resume=true) {
if (resume && fs::exists(progress_file)) {
return resume_collection(mode, batch_size);
}
cout << "=== 새로운 대용량 영어 단어 수집 시작 ===\n";
cout << "모드: " << mode << "\n";
cout << "배치 크기: " << batch_size << "\n";
cout << "모델: " << model << "\n";
cout << string(50, '-') << "\n";
auto prefixes = generate_prefixes(mode);
cout << "총 접두사 수: " << prefixes.size() << "\n";
cout << "예상 최대 단어 수: " << (size_t)prefixes.size() * (size_t)batch_size << "\n";
vector<WordData> all_words;
unordered_set<string> completed_prefixes, failed_prefixes;
return process_prefixes(prefixes, all_words, completed_prefixes, failed_prefixes,
batch_size, mode, prefixes.size());
}
};
// Command-line options with their defaults. Field names are part of the
// interface (read by main()); do not rename.
struct Args {
int stage = 1;                      // pipeline stage (only 1 is supported)
std::string model = "gpt-oss:120b"; // Ollama model name
std::string mode = "2letter";       // prefix strategy: "2letter" or "adaptive"
int batch = 35;                     // words requested per prefix
bool resume = false;                // accepted for compatibility (resume is auto-detected)
bool clean = false;                 // delete the progress file and start over
int min_len = 3;                    // minimum accepted word length
int max_len = 20;                   // maximum accepted word length
std::string host = "localhost";     // Ollama host
int port = 11434;                   // Ollama port
};
// Print the usage text to stdout.
static void print_help() {
std::cout << "사용법: robust_dict [옵션]\n"
<< "  --stage <int>            (지원: 1)\n"
<< "  --model <name>           (기본: gpt-oss:120b)\n"
<< "  --mode <2letter|adaptive>(기본: 2letter)\n"
<< "  --batch <int>            (기본: 35)\n"
<< "  --resume                 (중단 지점부터 재개)\n"
<< "  --clean                  (진행 파일 삭제 후 새로 시작)\n"
<< "  --min-length <int>       (기본: 3)\n"
<< "  --max-length <int>       (기본: 20)\n"
<< "  --host <str>             (기본: localhost)\n"
<< "  --port <int>             (기본: 11434)\n";
}
// Parse the text of a numeric option into `out`. Returns false (after
// printing an error) when the text is not a valid integer. Replaces the
// previous bare std::stoi calls, which threw an uncaught exception and
// crashed the program on input like "--port abc".
static bool parse_int_opt(const char* text, const std::string& opt, int& out) {
try {
std::size_t pos = 0;
const int v = std::stoi(text, &pos);
if (pos != std::string(text).size()) throw std::invalid_argument("trailing characters");
out = v;
return true;
} catch (const std::exception&) {
std::cerr << "옵션 " << opt << " 값이 올바른 정수가 아닙니다: " << text << "\n";
return false;
}
}
// Fill `a` from argv. Returns false on any unknown option or invalid numeric
// value (an error and/or usage text has already been printed). "--help"/"-h"
// prints usage and exits the process with status 0, as before.
static bool parse_args(int argc, char** argv, Args& a) {
for (int i=1; i<argc; ++i) {
const std::string k = argv[i];
// true when `left` more value arguments are available after position i
auto need = [&](int left){ return i+left < argc; };
if (k == "--help" || k == "-h") { print_help(); std::exit(0); }
else if (k == "--stage" && need(1)) { if (!parse_int_opt(argv[++i], k, a.stage)) return false; }
else if (k == "--model" && need(1)) { a.model = argv[++i]; }
else if (k == "--mode" && need(1)) { a.mode = argv[++i]; }
else if (k == "--batch" && need(1)) { if (!parse_int_opt(argv[++i], k, a.batch)) return false; }
else if (k == "--resume") { a.resume = true; }
else if (k == "--clean") { a.clean = true; }
else if (k == "--min-length" && need(1)) { if (!parse_int_opt(argv[++i], k, a.min_len)) return false; }
else if (k == "--max-length" && need(1)) { if (!parse_int_opt(argv[++i], k, a.max_len)) return false; }
else if (k == "--host" && need(1)) { a.host = argv[++i]; }
else if (k == "--port" && need(1)) { if (!parse_int_opt(argv[++i], k, a.port)) return false; }
else {
// also reached when an option that needs a value is last on the line
std::cerr << "알 수 없는 옵션: " << k << "\n";
print_help();
return false;
}
}
return true;
}
// Entry point: installs signal handlers, parses CLI arguments, validates
// them, then runs the stage-1 word collection. Returns 0 on success, 1 on
// argument/configuration errors or an unhandled exception from the collector.
int main(int argc, char** argv) {
// CTRL+C / SIGTERM request a graceful shutdown; the collector polls
// g_shutdown and checkpoints progress before exiting.
signal(SIGINT, signal_handler);
#ifdef SIGTERM
signal(SIGTERM, signal_handler);
#endif
Args args;
if (!parse_args(argc, argv, args)) return 1;
if (args.stage != 1) {
cerr << "현재는 stage 1만 지원합니다.\n";
return 1;
}
// Sanity-check numeric options (previously unvalidated: --batch 0 would
// silently collect nothing per prefix, and min > max rejected every word).
if (args.batch < 1) {
cerr << "--batch 값은 1 이상이어야 합니다.\n";
return 1;
}
if (args.min_len < 1 || args.max_len < args.min_len) {
cerr << "--min-length/--max-length 값이 올바르지 않습니다.\n";
return 1;
}
if (args.clean && fs::exists("collection_progress.json")) {
fs::remove("collection_progress.json");
cout << "진행 상황 파일을 삭제했습니다. 새로 시작합니다.\n";
}
curl_global_init(CURL_GLOBAL_DEFAULT);
RobustDictionaryGenerator gen(args.model, args.host, args.port);
gen.min_word_length = args.min_len;
gen.max_word_length = args.max_len;
int rc = 0;
try {
// Resume is auto-detected: a progress file triggers the resume path
// unless --clean was given, so the --resume flag is effectively implicit.
string result_file = gen.collect_massive_wordlist(args.mode, args.batch, !args.clean);
if (!result_file.empty()) {
cout << "\n다음 단계는 기존 011.py를 사용하세요:\n";
cout << "python3 011.py --stage 2 --file " << result_file << "\n";
}
} catch (const std::exception& e) {
cerr << "프로그램 오류: " << e.what() << "\n";
rc = 1;
}
// Single cleanup path for both success and failure (was duplicated before).
curl_global_cleanup();
return rc;
}
필요시, Windows에선 vcpkg 등으로 curl과 nlohmann-json을 설치한 뒤 동일하게 빌드할 수 있습니다.
답글 남기기