2#ifdef USE_RUNTIME_DETECTION
10 std::string model_path_;
12 int num_threads_ = -1;
13 int num_GPU_layers_ = 0;
14 bool flash_attention_ =
false;
15 int context_size_ = 4096;
16 int batch_size_ = 2048;
17 bool embedding_only_ =
false;
18 std::vector<std::string> lora_paths_ = {};
37 num_GPU_layers_ = val;
42 flash_attention_ = val;
57 embedding_only_ = val;
Runtime loading and management of LLM libraries.
LLM service implementation with server capabilities.
LLM service implementation.
Runtime loader for LLM libraries.