LlamaLib v2.0.2
Cross-platform library for local LLMs
LLM_service.cpp
#include "LLM_service.h"

#include "arg.h"
#include "common.h"
#include "llama-chat.h"
#include "log.h"

#ifndef SERVER_H
#define SERVER_H
#include "server-context.cpp"
#endif // SERVER_H

//============================= LLMService IMPLEMENTATION =============================//

LLMService::LLMService(const std::string &model_path, int num_slots, int num_threads, int num_GPU_layers, bool flash_attention, int context_size, int batch_size, bool embedding_only, const std::vector<std::string> &lora_paths)
{
    init(LLM::LLM_args_to_command(model_path, num_slots, num_threads, num_GPU_layers, flash_attention, context_size, batch_size, embedding_only, lora_paths));
}

LLMService *LLMService::from_params(const json &params_json)
{
    std::vector<char *> argv = LLMService::jsonToArguments(params_json);
    LLMService *llmService = new LLMService();
    llmService->init(argv.size(), argv.data());
    return llmService;
}

LLMService *LLMService::from_command(const std::string &command)
{
    LLMService *llmService = new LLMService();
    llmService->init(command);
    return llmService;
}

LLMService *LLMService::from_command(int argc, char **argv)
{
    LLMService *llmService = new LLMService();
    llmService->init(argc, argv);
    return llmService;
}
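
// Example (illustrative sketch, not part of the library): the typical lifecycle is to
// create a service with one of the factory functions above, start the task loop, and
// optionally expose it over HTTP. The model path and flags below are placeholders; any
// llama.cpp server flag understood by common_params_parse can be used.
//
//     LLMService *svc = LLMService::from_command("-m models/my-model.gguf -ngl 99 -c 4096");
//     svc->start();                        // run the task loop (see start() below)
//     svc->start_server("0.0.0.0", 8080);  // optional: serve the HTTP API
//     // ... issue requests ...
//     svc->stop();
//     delete svc;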

LLMService::~LLMService()
{
    if (ctx_server != nullptr)
    {
        if (ctx_http != nullptr)
        {
            stop();
        }
        delete ctx_server;
        ctx_server = nullptr;
    }
}

std::vector<char *> LLMService::jsonToArguments(const json &params_json)
{
    common_params default_params;
    common_params_context ctx = common_params_parser_init(default_params, LLAMA_EXAMPLE_SERVER);

    std::vector<std::string> args_str = {"llm"};
    std::set<std::string> used_keys;

    for (const auto &opt : ctx.options)
    {
        for (const auto &name : opt.args)
        {
            std::string key = name;
            if (key.rfind("--", 0) == 0)
                key = key.substr(2); // strip leading "--"
            else if (key.rfind("-", 0) == 0)
                continue; // skip short options

            std::string json_key = key;
            std::replace(json_key.begin(), json_key.end(), '-', '_');

            if (!params_json.contains(json_key))
                continue;

            used_keys.insert(json_key);
            const auto &value = params_json[json_key];
            args_str.push_back(name);

            if (opt.handler_void != nullptr)
            {
                break;
            }
            else if (opt.handler_string != nullptr || opt.handler_int != nullptr)
            {
                args_str.push_back(value.is_string() ? value.get<std::string>() : value.dump());
                break;
            }
            else if (opt.handler_str_str != nullptr)
            {
                if (!value.is_array() || value.size() != 2)
                {
                    std::string err = "Expected array of 2 values for: " + json_key;
                    LOG_WRN("%s\n", err.c_str());
                    continue;
                }
                args_str.push_back(value[0].is_string() ? value[0].get<std::string>() : value[0].dump());
                args_str.push_back(value[1].is_string() ? value[1].get<std::string>() : value[1].dump());
                break;
            }
        }
    }

    // Report unused keys
    for (const auto &[key, _] : params_json.items())
    {
        if (used_keys.find(key) == used_keys.end())
        {
            std::string err = "Unused parameter in JSON: " + key;
            LOG_WRN("%s\n", err.c_str());
        }
    }

    // Convert to argv. The buffers are allocated with new[] and not freed here so the
    // returned pointers remain valid for the caller (mirrors init(const std::string &)).
    std::vector<char *> argv;
    for (const auto &arg : args_str)
    {
        char *buf = new char[arg.size() + 1];
        std::memcpy(buf, arg.c_str(), arg.size() + 1);
        argv.push_back(buf);
    }

    return argv;
}
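
// Example (illustrative): JSON keys mirror the long flag names with '-' replaced by '_',
// so the exact set of accepted keys depends on the llama.cpp argument parser linked in.
// A config such as
//
//     {"model": "models/my-model.gguf", "n_gpu_layers": 99, "ctx_size": 4096, "embedding": true}
//
// is converted to roughly
//
//     {"llm", "--model", "models/my-model.gguf", "--n-gpu-layers", "99", "--ctx-size", "4096", "--embedding"}
//
// Flag-only options (handler_void) contribute just the flag, valued options contribute
// flag + value, and two-value options (handler_str_str) expect a JSON array of two items.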

std::vector<std::string> LLMService::splitArguments(const std::string &inputString)
{
    std::vector<std::string> arguments;

    unsigned counter = 0;
    std::string segment;
    std::istringstream stream_input(inputString);
    while (std::getline(stream_input, segment, '"'))
    {
        ++counter;
        if (counter % 2 == 0)
        {
            if (!segment.empty())
                arguments.push_back(segment);
        }
        else
        {
            std::istringstream stream_segment(segment);
            while (std::getline(stream_segment, segment, ' '))
                if (!segment.empty())
                    arguments.push_back(segment);
        }
    }
    return arguments;
}
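
// Example (illustrative): segments outside double quotes are split on spaces, segments
// inside double quotes are kept verbatim, so
//
//     -m "models/my model.gguf" -ngl 99
//
// becomes {"-m", "models/my model.gguf", "-ngl", "99"}.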

void LLMService::init(const std::string &params_string)
{
    std::vector<std::string> arguments = splitArguments("llm " + params_string);

    // Convert vector of strings to argc and argv
    int argc = static_cast<int>(arguments.size());
    char **argv = new char *[argc];
    for (int i = 0; i < argc; ++i)
    {
        argv[i] = new char[arguments[i].size() + 1];
        std::strcpy(argv[i], arguments[i].c_str());
    }
    init(argc, argv);
}

void LLMService::init(const char *params_string)
{
    init(std::string(params_string));
}

void LLMService::init(int argc, char **argv)
{
    if (setjmp(get_jump_point()) != 0)
        return;
    try
    {
        command = args_to_command(argc, argv);

        registry.register_instance(this);
        debug(registry.get_debug_level());

        ctx_server = new server_context();
        // ctx_server->impl->batch = {0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};

        params = new common_params();
        params->port = 0;
        params->verbosity = common_log_verbosity_thold;
        if (!common_params_parse(argc, argv, *params, LLAMA_EXAMPLE_SERVER))
        {
            throw std::runtime_error("Invalid parameters!");
        }

        // validate batch size for embeddings
        // embeddings require all tokens to be processed in a single ubatch
        // see https://github.com/ggml-org/llama.cpp/issues/12836
        if (params->embedding && params->n_batch > params->n_ubatch) {
            LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params->n_batch, params->n_ubatch);
            LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params->n_ubatch);
            params->n_batch = params->n_ubatch;
        }

        if (params->n_parallel < 0) {
            LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__);

            params->n_parallel = 4;
            params->kv_unified = true;
        }

        // for consistency between server router mode and single-model mode, we set the same model name as alias
        if (params->model_alias.empty() && !params->model.name.empty()) {
            params->model_alias = params->model.name;
        }

        common_init();

        llama_backend_init();
        llama_backend_has_init = true;
        llama_numa_init(params->numa);

        LLAMALIB_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params->cpuparams.n_threads, params->cpuparams_batch.n_threads, std::thread::hardware_concurrency());

        // load the model
        params->use_jinja = true;
        if (!ctx_server->load_model(*params))
        {
            throw std::runtime_error("Error loading the model!");
        }
        LLAMALIB_INF("model loaded\n");

        ctx_http = new server_http_context();
        routes = new server_routes(*params, *ctx_server);
        routes->update_meta(*ctx_server);

        params->chat_template = detect_chat_template();
        LOG_INF("chat_template: %s\n", params->chat_template.c_str());

        ctx_server->impl->queue_tasks.on_new_task([this](server_task && task)
                                                  { this->ctx_server->impl->process_single_task(std::move(task)); });
        ctx_server->impl->queue_tasks.on_update_slots([this]()
                                                      { this->ctx_server->impl->update_slots(); });
    }
    catch (...)
    {
        handle_exception(1);
    }
}

void LLMService::enable_reasoning(bool reasoning) {
    reasoning_enabled = reasoning;
    if (ctx_server != nullptr) ctx_server->impl->oai_parser_opt.enable_thinking = reasoning_enabled;
}

const std::string LLMService::detect_chat_template()
{
    const char *chat_template_jinja = common_chat_templates_source(ctx_server->impl->chat_templates.get());
    int chat_template_value = llm_chat_detect_template(chat_template_jinja);
    std::vector<const char *> supported_tmpl;
    int res = llama_chat_builtin_templates(nullptr, 0);
    if (res > 0)
    {
        supported_tmpl.resize(res);
        llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
        for (const auto &key : supported_tmpl)
        {
            llm_chat_template val = llm_chat_template_from_str(key);
            if ((int)val == chat_template_value)
            {
                return key;
            }
        }
    }
    return "";
}
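
// Note: detect_chat_template() maps the Jinja template reported by the loaded model back to
// the name of the matching built-in llama.cpp template (for example, a ChatML-style template
// resolves to "chatml"), or returns an empty string when no built-in template matches.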

void LLMService::debug(int debug_level)
{
    common_log_set_verbosity_thold(debug_level - 2);
}

void LLMService::logging_callback(CharArrayFn callback)
{
    log_callback = callback;
}

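// Forces a slot to wind down: for in-flight completion tasks the slot is told to stop at the
// next token (so the task loop can finish it cleanly), otherwise the slot is released
// immediately. Used by stop() and cancel() below.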
void release_slot(server_slot &slot)
{
    if (slot.task && slot.task->type == SERVER_TASK_TYPE_COMPLETION)
    {
        slot.i_batch = -1;
        slot.task->params.n_predict = 0;
        slot.stop = STOP_TYPE_LIMIT;
        slot.has_next_token = false;
    }
    else
    {
        slot.release();
    }
}

int LLMService::get_next_available_slot()
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return -1;
    if (ctx_server->impl->slots.size() == 0)
        return -1;
    return next_available_slot++ % ctx_server->impl->slots.size();
}

// wrapper function that handles exceptions and logs errors
// this is to make sure handler_t never throws exceptions; instead, it returns an error response
static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
    return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr {
        std::string message;
        error_type error;
        try {
            return func(req);
        } catch (const std::invalid_argument & e) {
            // treat invalid_argument as invalid request (400)
            error = ERROR_TYPE_INVALID_REQUEST;
            message = e.what();
        } catch (const std::exception & e) {
            // treat other exceptions as server error (500)
            error = ERROR_TYPE_SERVER;
            message = e.what();
        } catch (...) {
            error = ERROR_TYPE_SERVER;
            message = "unknown error";
        }

        auto res = std::make_unique<server_http_res>();
        res->status = 500;
        try {
            json error_data = format_error_response(message, error);
            res->status = json_value(error_data, "code", 500);
            res->data = safe_json_to_str({{ "error", error_data }});
            SRV_WRN("got exception: %s\n", res->data.c_str());
        } catch (const std::exception & e) {
            SRV_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str());
            res->data = "Internal Server Error";
        }
        return res;
    };
}

void LLMService::start_server(const std::string &host, int port, const std::string &API_key)
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return;

    try
    {
        params->hostname = host.empty() ? "0.0.0.0" : host;
        if (port >= 0)
            params->port = port;
        params->api_keys.clear();
        if (!API_key.empty())
            params->api_keys.push_back(API_key);

        std::lock_guard<std::mutex> lock(start_stop_mutex);

        if (!ctx_http->init(*params)) {
            throw std::runtime_error("Failed to initialize HTTP server!");
        }

        // register API routes
        ctx_http->post("/health", ex_wrapper(routes->get_health));            // public endpoint (no API key check)
        ctx_http->post("/v1/health", ex_wrapper(routes->get_health));         // public endpoint (no API key check)
        ctx_http->post("/completion", ex_wrapper(routes->post_completions));  // legacy
        ctx_http->post("/completions", ex_wrapper(routes->post_completions));
        ctx_http->post("/chat/completions", ex_wrapper(routes->post_chat_completions));
        ctx_http->post("/v1/chat/completions", ex_wrapper(routes->post_chat_completions));
        ctx_http->post("/tokenize", ex_wrapper(routes->post_tokenize));
        ctx_http->post("/detokenize", ex_wrapper(routes->post_detokenize));
        ctx_http->post("/apply-template", ex_wrapper(routes->post_apply_template));
        ctx_http->post("/embedding", ex_wrapper(routes->post_embeddings));    // legacy
        ctx_http->post("/embeddings", ex_wrapper(routes->post_embeddings));

        // start the HTTP server
        if (!ctx_http->start()) {
            stop();
            throw std::runtime_error("Exiting due to HTTP server error\n");
        }

        ctx_http->is_ready.store(true);
    }
    catch (...)
    {
        handle_exception();
    }
}
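
// Example (illustrative): once start_server() has returned, the registered routes accept the
// usual llama.cpp server payloads. Host, port, and request fields below are placeholders.
//
//     curl http://localhost:8080/completion -d '{"prompt": "Hello", "n_predict": 32}'
//     curl http://localhost:8080/v1/chat/completions \
//          -H "Content-Type: application/json" \
//          -d '{"messages": [{"role": "user", "content": "Hello"}]}'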

void LLMService::stop_server()
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return;
    if (ctx_http == nullptr)
        return;
    std::lock_guard<std::mutex> lock(start_stop_mutex);
    LLAMALIB_INF("stopping server\n");
    ctx_http->stop();
    if (ctx_http->thread.joinable()) ctx_http->thread.join();
    server_stopped = true;
    server_stopped_cv.notify_all();
    LLAMALIB_INF("stopped server\n");
}

void LLMService::join_server()
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return;
    std::unique_lock<std::mutex> lock(start_stop_mutex);
    server_stopped_cv.wait(lock, [this]
                           { return server_stopped; });
}

void LLMService::start()
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return;
    std::lock_guard<std::mutex> lock(start_stop_mutex);
    service_thread = std::thread([&]()
                                 {
        LLAMALIB_INF("starting service\n");
        ctx_server->impl->queue_tasks.start_loop();
        LLAMALIB_INF("stopped service loop\n");
        return 1; });
    while (!started())
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
}

void LLMService::stop()
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return;
    try
    {
        std::lock_guard<std::mutex> lock(start_stop_mutex);
        if (!started())
            return;
        LLAMALIB_INF("shutting down tasks\n");

        // hack completion slots to think task is completed
        for (server_slot &slot : ctx_server->impl->slots)
        {
            release_slot(slot);
        }

        if (!ctx_server->impl->queue_tasks.is_empty())
        {
            LLAMALIB_INF("Wait until tasks have finished\n");
            int grace = 20;
            while (!ctx_server->impl->queue_tasks.is_empty() && grace-- > 0)
            {
                std::this_thread::sleep_for(std::chrono::milliseconds(50));
            }
            LLAMALIB_INF("Tasks have finished\n");
        }

        ctx_http->stop();
        ctx_server->terminate();

        if (llama_backend_has_init)
            llama_backend_free();

        if (service_thread.joinable())
        {
            service_thread.join();
        }
        service_stopped = true;
        service_stopped_cv.notify_all();
        LLAMALIB_INF("service stopped\n");
    }
    catch (...)
    {
        handle_exception();
    }
}

void LLMService::join_service()
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return;
    std::unique_lock<std::mutex> lock(start_stop_mutex);
    service_stopped_cv.wait(lock, [this]
                            { return service_stopped; });
}

bool LLMService::started()
{
    return ctx_server != nullptr && ctx_server->impl->queue_tasks.is_running();
}

void LLMService::set_SSL(const std::string &SSL_cert_str, const std::string &SSL_key_str)
{
    params->ssl_cert = SSL_cert_str;
    params->ssl_key = SSL_key_str;
}

std::string LLMService::encapsulate_route(const json &body, server_http_context::handler_t route_handler)
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return "";

    try
    {
        server_http_req req{ {}, {}, "", body.dump(), always_false };
        return route_handler(req)->data;
    }
    catch (...)
    {
        handle_exception();
    }
    return "";
}

std::string LLMService::apply_template_json(const json &body)
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return "";
    std::vector<raw_buffer> files; // dummy, unused
    json copy = body;
    json data = oaicompat_chat_params_parse(
        copy,
        ctx_server->impl->oai_parser_opt,
        files);
    return safe_json_to_str({{"prompt", std::move(data.at("prompt"))}});
}
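
// Example (illustrative): apply_template_json() only renders the prompt, it does not run
// inference. A body such as
//
//     {"messages": [{"role": "user", "content": "Hello"}]}
//
// returns {"prompt": "<messages rendered through the chat template detected at load time>"}.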

std::string LLMService::tokenize_json(const json &body)
{
    return encapsulate_route(body, routes->post_tokenize);
}

std::string LLMService::detokenize_json(const json &body)
{
    return encapsulate_route(body, routes->post_detokenize);
}

std::string LLMService::embeddings_json(const json &body)
{
    return encapsulate_route(body, routes->post_embeddings);
}

std::string LLMService::lora_weight_json(const json &body)
{
    return safe_json_to_str(encapsulate_route(body, routes->post_lora_adapters));
}

std::string LLMService::lora_list_json()
{
    return encapsulate_route({}, routes->get_lora_adapters);
}

std::string LLMService::completion_json(const json &data_in, CharArrayFn callback, bool callbackWithJSON)
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return "";

    try
    {
        bool stream = json_value(data_in, "stream", callback != nullptr);
        json data = data_in;
        data["stream"] = stream;

        server_http_req req{ {}, {}, "", data.dump(), always_false };
        auto result = routes->post_completions(req);
        if (result->status != 200)
        {
            return result->data;
        }

        if (stream)
        {
            ResponseConcatenator concatenator;
            if (callback) concatenator.set_callback(callback, callbackWithJSON);
            while (!concatenator.is_complete()) {
                std::string chunk;
                bool has_next = result->next(chunk);
                if (!chunk.empty()) {
                    if (!concatenator.process_chunk(chunk)) break;
                }
                if (!has_next) break;
            }
            return concatenator.get_result_json();
        } else {
            return result->data;
        }
    }
    catch (...)
    {
        handle_exception();
    }
    return "";
}
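
// Example (illustrative sketch): completion_json() accepts the same JSON body as the
// /completion route. With no callback and no explicit "stream" flag it returns the full
// response in one piece; passing a callback enables streaming by default, invoking the
// callback per chunk and returning the concatenated result at the end. "svc" and
// "my_callback" below are placeholders (my_callback being any CharArrayFn).
//
//     json req = {{"prompt", "Hello"}, {"n_predict", 32}};
//     std::string full     = svc->completion_json(req);               // non-streaming
//     std::string streamed = svc->completion_json(req, my_callback);  // streaming via callback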

std::string LLMService::slot_json(const json &data)
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return "";
    std::string result_data = "";
    try
    {
        server_task_type task_type;
        std::string action = data.at("action");
        if (action == "save")
        {
            task_type = SERVER_TASK_TYPE_SLOT_SAVE;
        }
        else if (action == "restore")
        {
            task_type = SERVER_TASK_TYPE_SLOT_RESTORE;
        }
        else if (action == "erase")
        {
            task_type = SERVER_TASK_TYPE_SLOT_ERASE;
        }
        else
        {
            throw std::runtime_error("Invalid action: " + action);
        }

        int id_slot = json_value(data, "id_slot", 0);

        server_task task(task_type);
        task.id = ctx_server->impl->queue_tasks.get_new_id();
        task.slot_action.slot_id = id_slot;

        if (action == "save" || action == "restore")
        {
            std::string filepath = data.at("filepath");
            task.slot_action.filename = filepath.substr(filepath.find_last_of("/\\") + 1);
            task.slot_action.filepath = filepath;
        }

        // keep the id before the task is moved into the queue
        const int task_id = task.id;
        ctx_server->impl->queue_results.add_waiting_task_id(task_id);
        ctx_server->impl->queue_tasks.post(std::move(task));

        server_task_result_ptr result = ctx_server->impl->queue_results.recv(task_id);
        ctx_server->impl->queue_results.remove_waiting_task_id(task_id);

        json result_json = result->to_json();
        result_data = result_json.dump();
    }
    catch (...)
    {
        handle_exception();
    }
    return result_data;
}
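
// Example (illustrative): slot_json() drives the save/restore/erase slot tasks; "filepath"
// is only required for "save" and "restore". The path and the "svc" instance are placeholders.
//
//     svc->slot_json({{"action", "save"},    {"id_slot", 0}, {"filepath", "/tmp/slot0.bin"}});
//     svc->slot_json({{"action", "restore"}, {"id_slot", 0}, {"filepath", "/tmp/slot0.bin"}});
//     svc->slot_json({{"action", "erase"},   {"id_slot", 0}});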

void LLMService::cancel(int id_slot)
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return;
    try
    {
        for (auto &slot : ctx_server->impl->slots)
        {
            if (slot.id == id_slot)
            {
                release_slot(slot);
                break;
            }
        }
    }
    catch (...)
    {
        handle_exception();
    }
}

int LLMService::embedding_size()
{
    if (get_status_code() > 0 || setjmp(get_jump_point()) != 0)
        return 0;

    if (ctx_server == nullptr) return 0;
    return ctx_server->get_meta().model_n_embd_inp;
}

//=========================== API ===========================//

void LLMService_Registry(LLMProviderRegistry *existing_instance)
{
    LLMProviderRegistry::inject_registry(existing_instance);
}

LLMService *LLMService_Construct(const char *model_path, int num_slots, int num_threads, int num_GPU_layers, bool flash_attention, int context_size, int batch_size, bool embedding_only, int lora_count, const char **lora_paths)
{
    std::vector<std::string> lora_paths_vector;
    if (lora_paths != nullptr && lora_count > 0)
    {
        for (int i = 0; i < lora_count; ++i)
        {
            lora_paths_vector.push_back(std::string(lora_paths[i]));
        }
    }
    LLMService *llmService = new LLMService(model_path, num_slots, num_threads, num_GPU_layers, flash_attention, context_size, batch_size, embedding_only, lora_paths_vector);
    if (get_status_code() != 0)
    {
        if (llmService != nullptr) delete llmService;
        return nullptr;
    }
    return llmService;
}
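
// Example (illustrative, C API): the C entry points mirror the C++ factories and return
// nullptr when construction fails. The model path and parameter values are placeholders.
//
//     LLMService *svc = LLMService_Construct("models/my-model.gguf", /*num_slots*/ 1,
//                                            /*num_threads*/ -1, /*num_GPU_layers*/ 99,
//                                            /*flash_attention*/ false, /*context_size*/ 4096,
//                                            /*batch_size*/ 2048, /*embedding_only*/ false,
//                                            /*lora_count*/ 0, /*lora_paths*/ nullptr);
//     if (svc == nullptr) { /* construction failed, check the error state */ }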

LLMService *LLMService_From_Command(const char *params_string_arr)
{
    LLMService *llmService = nullptr;
    std::string params_string(params_string_arr);
    try
    {
        json j = json::parse(params_string);
        llmService = LLMService::from_params(j);
    }
    catch (const json::parse_error &)
    {
        llmService = LLMService::from_command(params_string);
    }

    if (get_status_code() != 0)
    {
        if (llmService != nullptr) delete llmService;
        return nullptr;
    }
    return llmService;
}

const char *LLMService_Command(LLMService *llm_service)
{
    return stringToCharArray(llm_service->get_command());
}

void LLMService_InjectErrorState(ErrorState *error_state)
{
}