LlamaLib v2.0.5
Cross-platform library for local LLMs
LLM_service.cpp
#include "LLM_service.h"

#include "arg.h"
#include "common.h"
#include "llama-chat.h"
#include "log.h"

#ifndef SERVER_H
#define SERVER_H
#include "server-context.cpp"
#endif // SERVER_H

//============================= LLMService IMPLEMENTATION =============================//

LLMService::LLMService(const std::string &model_path, int num_slots, int num_threads, int num_GPU_layers, bool flash_attention, int context_size, int batch_size, bool embedding_only, const std::vector<std::string> &lora_paths)
{
    init(LLM::LLM_args_to_command(model_path, num_slots, num_threads, num_GPU_layers, flash_attention, context_size, batch_size, embedding_only, lora_paths));
}

LLMService *LLMService::from_params(const json &params_json)
{
    std::vector<char *> argv = LLMService::jsonToArguments(params_json);
    LLMService *llmService = new LLMService();
    llmService->init(argv.size(), argv.data());
    return llmService;
}

LLMService *LLMService::from_command(const std::string &command)
{
    LLMService *llmService = new LLMService();
    llmService->init(command);
    return llmService;
}

LLMService *LLMService::from_command(int argc, char **argv)
{
    LLMService *llmService = new LLMService();
    llmService->init(argc, argv);
    return llmService;
}

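// Usage sketch (not part of this file; model path and parameter values are
// hypothetical placeholders): the three construction paths above are
// equivalent ways to obtain a service.
//
//   // 1. direct constructor with typed parameters
//   LLMService svc("models/llama.gguf", /*num_slots*/ 1, /*num_threads*/ -1,
//                  /*num_GPU_layers*/ 0, /*flash_attention*/ false,
//                  /*context_size*/ 4096, /*batch_size*/ 2048,
//                  /*embedding_only*/ false, {});
//
//   // 2. from a JSON object (keys mirror the server's long options, '-' -> '_')
//   LLMService *a = LLMService::from_params(json{{"model", "models/llama.gguf"}, {"ctx_size", 4096}});
//
//   // 3. from a raw command-line string
//   LLMService *b = LLMService::from_command("-m models/llama.gguf -c 4096");
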
LLMService::~LLMService()
{
    if (ctx_server != nullptr)
    {
        if (ctx_http != nullptr)
        {
            stop();
        }
        delete ctx_server;
        ctx_server = nullptr;
    }
}

std::vector<char *> LLMService::jsonToArguments(const json &params_json)
{
    common_params default_params;
    common_params_context ctx = common_params_parser_init(default_params, LLAMA_EXAMPLE_SERVER);

    std::vector<std::string> args_str = {"llm"};
    std::set<std::string> used_keys;

    for (const auto &opt : ctx.options)
    {
        for (const auto &name : opt.args)
        {
            std::string key = name;
            if (key.rfind("--", 0) == 0)
                key = key.substr(2); // strip leading "--"
            else if (key.rfind("-", 0) == 0)
                continue; // skip short options

            std::string json_key = key;
            std::replace(json_key.begin(), json_key.end(), '-', '_');

            if (!params_json.contains(json_key))
                continue;

            used_keys.insert(json_key);
            const auto &value = params_json[json_key];
            args_str.push_back(name);

            if (opt.handler_void != nullptr)
            {
                break;
            }
            else if (opt.handler_string != nullptr || opt.handler_int != nullptr)
            {
                args_str.push_back(value.is_string() ? value.get<std::string>() : value.dump());
                break;
            }
            else if (opt.handler_str_str != nullptr)
            {
                if (!value.is_array() || value.size() != 2)
                {
                    std::string err = "Expected array of 2 values for: " + json_key;
                    LOG_WRN("%s\n", err.c_str());
                    continue;
                }
                args_str.push_back(value[0].is_string() ? value[0].get<std::string>() : value[0].dump());
                args_str.push_back(value[1].is_string() ? value[1].get<std::string>() : value[1].dump());
                break;
            }
        }
    }

    // Report unused keys
    for (const auto &[key, _] : params_json.items())
    {
        if (used_keys.find(key) == used_keys.end())
        {
            std::string err = "Unused parameter in JSON: " + key;
            LOG_WRN("%s\n", err.c_str());
        }
    }

    // Convert to argv; the buffers are heap-allocated and intentionally not freed,
    // since the caller (and llama.cpp's parser) keep pointers into them after return
    std::vector<char *> argv;
    for (const auto &arg : args_str)
    {
        char *buf = new char[arg.size() + 1];
        std::memcpy(buf, arg.c_str(), arg.size() + 1);
        argv.push_back(buf);
    }

    return argv;
}
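
// Example (illustrative, hypothetical values): jsonToArguments maps JSON keys back
// to the server's long options by replacing '_' with '-', so
//
//   {"model": "models/llama.gguf", "ctx_size": 4096, "n_gpu_layers": 99}
//
// becomes the argv sequence
//
//   llm --model models/llama.gguf --ctx-size 4096 --n-gpu-layers 99
//
// Keys that match no known option are reported via LOG_WRN and ignored.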

std::vector<std::string> LLMService::splitArguments(const std::string &inputString)
{
    std::vector<std::string> arguments;

    unsigned counter = 0;
    std::string segment;
    std::istringstream stream_input(inputString);
    while (std::getline(stream_input, segment, '"'))
    {
        ++counter;
        if (counter % 2 == 0)
        {
            if (!segment.empty())
                arguments.push_back(segment);
        }
        else
        {
            std::istringstream stream_segment(segment);
            while (std::getline(stream_segment, segment, ' '))
                if (!segment.empty())
                    arguments.push_back(segment);
        }
    }
    return arguments;
}
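
// Example (illustrative): splitArguments tokenizes on spaces but keeps
// double-quoted segments intact, so
//
//   -m "path with spaces/model.gguf" -c 2048
//
// yields the tokens [-m] [path with spaces/model.gguf] [-c] [2048].
// The quote characters themselves are consumed and never appear in the tokens.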

void LLMService::init(const std::string &params_string)
{
    std::vector<std::string> arguments = splitArguments("llm " + params_string);

    // Convert vector of strings to argc and argv
    int argc = static_cast<int>(arguments.size());
    char **argv = new char *[argc];
    for (int i = 0; i < argc; ++i)
    {
        argv[i] = new char[arguments[i].size() + 1];
        std::strcpy(argv[i], arguments[i].c_str());
    }
    init(argc, argv);
}

void LLMService::init(const char *params_string)
{
    init(std::string(params_string));
}

void LLMService::init(int argc, char **argv)
{
    if (setjmp(get_jump_point()) != 0)
        return;
    try
    {
        command = args_to_command(argc, argv);

        registry.register_instance(this);
        debug(registry.get_debug_level());

        ctx_server = new server_context();
        // ctx_server->impl->batch = {0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};

        params = new common_params();
        params->port = 0;
        params->verbosity = common_log_verbosity_thold;
        if (!common_params_parse(argc, argv, *params, LLAMA_EXAMPLE_SERVER))
        {
            throw std::runtime_error("Invalid parameters!");
        }

        // validate batch size for embeddings
        // embeddings require all tokens to be processed in a single ubatch
        // see https://github.com/ggml-org/llama.cpp/issues/12836
        if (params->embedding && params->n_batch > params->n_ubatch) {
            LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params->n_batch, params->n_ubatch);
            LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params->n_ubatch);
            params->n_batch = params->n_ubatch;
        }

        if (params->n_parallel < 0) {
            LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__);

            params->n_parallel = 4;
            params->kv_unified = true;
        }

        // for consistency between server router mode and single-model mode, we set the same model name as alias
        if (params->model_alias.empty() && !params->model.name.empty()) {
            params->model_alias = params->model.name;
        }

        common_init();

        llama_backend_init();
        llama_backend_has_init = true;
        llama_numa_init(params->numa);

        LLAMALIB_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params->cpuparams.n_threads, params->cpuparams_batch.n_threads, std::thread::hardware_concurrency());

        // load the model
        params->use_jinja = true;
        if (!ctx_server->load_model(*params))
        {
            throw std::runtime_error("Error loading the model!");
        }

        LLAMALIB_INF("model loaded\n");

        ctx_http = new server_http_context();
        routes = new server_routes(*params, *ctx_server);
        routes->update_meta(*ctx_server);

        // params->chat_template = detect_chat_template();
        // LOG_INF("chat_template: %s\n", params->chat_template.c_str());

        ctx_server->impl->queue_tasks.on_new_task([this](server_task && task)
                                                  { this->ctx_server->impl->process_single_task(std::move(task)); });
        ctx_server->impl->queue_tasks.on_update_slots([this]()
                                                      { this->ctx_server->impl->update_slots(); });
    }
    catch (...)
    {
        handle_exception(-1);
    }
}

void LLMService::enable_reasoning(bool reasoning) {
    reasoning_enabled = reasoning;
    if (ctx_server != nullptr) ctx_server->impl->chat_params.enable_thinking = reasoning_enabled;
}

// const std::string LLMService::detect_chat_template()
// {
//     const char *chat_template_jinja = common_chat_templates_source(ctx_server->impl->chat_templates.get());
//     int chat_template_value = llm_chat_detect_template(chat_template_jinja);
//     std::vector<const char *> supported_tmpl;
//     int res = llama_chat_builtin_templates(nullptr, 0);
//     if (res > 0)
//     {
//         supported_tmpl.resize(res);
//         llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
//         for (const auto &key : supported_tmpl)
//         {
//             llm_chat_template val = llm_chat_template_from_str(key);
//             if ((int)val == chat_template_value)
//             {
//                 return key;
//             }
//         }
//     }
//     return "";
// }

void LLMService::debug(int debug_level)
{
    common_log_set_verbosity_thold(debug_level - 2);
}

void LLMService::logging_callback(CharArrayFn callback)
{
    log_callback = callback;
}

void release_slot(server_slot &slot)
{
    if (slot.task && slot.task->type == SERVER_TASK_TYPE_COMPLETION)
    {
        slot.i_batch = -1;
        slot.task->params.n_predict = 0;
        slot.stop = STOP_TYPE_LIMIT;
        slot.has_next_token = false;
    }
    else
    {
        slot.release();
    }
}

int LLMService::get_next_available_slot()
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return -1;
    if (ctx_server->impl->slots.size() == 0)
        return -1;
    return next_available_slot++ % ctx_server->impl->slots.size();
}

int LLMService::get_slot_context_size()
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return -1;
    return ctx_server->impl->get_slot_n_ctx();
}

// wrapper function that handles exceptions and logs errors
// this is to make sure handler_t never throws exceptions; instead, it returns an error response
static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
    return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr {
        std::string message;
        error_type error;
        try {
            return func(req);
        } catch (const std::invalid_argument & e) {
            // treat invalid_argument as invalid request (400)
            error = ERROR_TYPE_INVALID_REQUEST;
            message = e.what();
        } catch (const std::exception & e) {
            // treat other exceptions as server error (500)
            error = ERROR_TYPE_SERVER;
            message = e.what();
        } catch (...) {
            error = ERROR_TYPE_SERVER;
            message = "unknown error";
        }

        auto res = std::make_unique<server_http_res>();
        res->status = 500;
        try {
            json error_data = format_error_response(message, error);
            res->status = json_value(error_data, "code", 500);
            res->data = safe_json_to_str({{ "error", error_data }});
            SRV_WRN("got exception: %s\n", res->data.c_str());
        } catch (const std::exception & e) {
            SRV_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str());
            res->data = "Internal Server Error";
        }
        return res;
    };
}
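
// Example (illustrative): a handler wrapped with ex_wrapper never propagates
// exceptions to the HTTP layer. A handler that executes
//
//   throw std::invalid_argument("missing field: prompt");
//
// is turned into a JSON error response (status taken from format_error_response,
// 400 for invalid requests), while any other std::exception becomes a 500 with
// its what() message as the error text.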

void LLMService::start_server(const std::string &host, int port, const std::string &API_key)
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return;

    try
    {
        params->hostname = host.empty() ? "0.0.0.0" : host;
        if (port >= 0)
            params->port = port;
        params->api_keys.clear();
        if (!API_key.empty())
            params->api_keys.push_back(API_key);

        std::lock_guard<std::mutex> lock(start_stop_mutex);

        if (!ctx_http->init(*params)) {
            throw std::runtime_error("Failed to initialize HTTP server!");
        }

        // register API routes
        ctx_http->post("/health", ex_wrapper(routes->get_health)); // public endpoint (no API key check)
        ctx_http->post("/v1/health", ex_wrapper(routes->get_health)); // public endpoint (no API key check)
        ctx_http->post("/props", ex_wrapper([this](const server_http_req &) { return get_props(); }));
        ctx_http->post("/completion", ex_wrapper(routes->post_completions)); // legacy
        ctx_http->post("/completions", ex_wrapper(routes->post_completions));
        ctx_http->post("/chat/completions", ex_wrapper(routes->post_chat_completions));
        ctx_http->post("/v1/chat/completions", ex_wrapper(routes->post_chat_completions));
        ctx_http->post("/tokenize", ex_wrapper(routes->post_tokenize));
        ctx_http->post("/detokenize", ex_wrapper(routes->post_detokenize));
        ctx_http->post("/apply-template", ex_wrapper(routes->post_apply_template));
        ctx_http->post("/embedding", ex_wrapper(routes->post_embeddings)); // legacy
        ctx_http->post("/embeddings", ex_wrapper(routes->post_embeddings));

        // start the HTTP server before loading the model to be able to serve /health requests
        if (!ctx_http->start()) {
            stop();
            throw std::runtime_error("Exiting due to HTTP server error\n");
        }

        ctx_http->is_ready.store(true);
    }
    catch (...)
    {
        handle_exception();
    }
}
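
// Usage sketch (hypothetical host, port and key): once the model is loaded, the
// routes registered above can be exercised with any OpenAI-compatible client:
//
//   svc.start();                                      // start the task loop
//   svc.start_server("127.0.0.1", 8080, "secret");    // serve the routes
//
//   // curl http://127.0.0.1:8080/v1/chat/completions \
//   //      -H "Authorization: Bearer secret" \
//   //      -d '{"messages":[{"role":"user","content":"Hi"}]}'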

void LLMService::stop_server()
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return;
    if (ctx_http == nullptr)
        return;
    std::lock_guard<std::mutex> lock(start_stop_mutex);
    LLAMALIB_INF("stopping server\n");
    ctx_http->stop();
    if (ctx_http->thread.joinable()) ctx_http->thread.join();
    server_stopped = true;
    server_stopped_cv.notify_all();
    LLAMALIB_INF("stopped server\n");
}

void LLMService::join_server()
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return;
    std::unique_lock<std::mutex> lock(start_stop_mutex);
    server_stopped_cv.wait(lock, [this]
                           { return server_stopped; });
}

void LLMService::start()
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return;
    std::lock_guard<std::mutex> lock(start_stop_mutex);
    service_thread = std::thread([&]()
                                 {
                                     LLAMALIB_INF("starting service\n");
                                     ctx_server->impl->queue_tasks.start_loop();
                                     LLAMALIB_INF("stopped service loop\n");
                                     return 1;
                                 });
    while (!started())
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
}

void LLMService::stop()
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return;
    try
    {
        std::lock_guard<std::mutex> lock(start_stop_mutex);
        if (!started())
            return;
        LLAMALIB_INF("shutting down tasks\n");

        // hack: make completion slots think their task is completed
        for (server_slot &slot : ctx_server->impl->slots)
        {
            release_slot(slot);
        }

        if (!ctx_server->impl->queue_tasks.is_empty())
        {
            LLAMALIB_INF("Wait until tasks have finished\n");
            int grace = 20;
            while (!ctx_server->impl->queue_tasks.is_empty() && grace-- > 0)
            {
                std::this_thread::sleep_for(std::chrono::milliseconds(50));
            }
            LLAMALIB_INF("Tasks have finished\n");
        }

        ctx_http->stop();
        ctx_server->terminate();

        if (llama_backend_has_init)
            llama_backend_free();

        if (service_thread.joinable())
        {
            service_thread.join();
        }
        service_stopped = true;
        service_stopped_cv.notify_all();
        LLAMALIB_INF("service stopped\n");
    }
    catch (...)
    {
        handle_exception();
    }
}

void LLMService::join_service()
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return;
    std::unique_lock<std::mutex> lock(start_stop_mutex);
    service_stopped_cv.wait(lock, [this]
                            { return service_stopped; });
}

bool LLMService::started()
{
    return ctx_server != nullptr && ctx_server->impl->queue_tasks.is_running();
}

void LLMService::set_SSL(const std::string &SSL_cert_str, const std::string &SSL_key_str)
{
    params->ssl_cert = SSL_cert_str;
    params->ssl_key = SSL_key_str;
}

std::string LLMService::encapsulate_route(const json &body, server_http_context::handler_t route_handler)
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return "";

    try
    {
        server_http_req req{ {}, {}, "", "", body.dump(), always_false };
        return route_handler(req)->data;
    }
    catch (...)
    {
        handle_exception();
    }
    return "";
}

std::string LLMService::apply_template_json(const json &body)
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return "";
    std::vector<raw_buffer> files; // dummy, unused
    json copy = body;
    json data = oaicompat_chat_params_parse(
        copy,
        ctx_server->impl->chat_params,
        files);
    return safe_json_to_str({{"prompt", std::move(data.at("prompt"))}});
}
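
// Example (illustrative): apply_template_json renders chat messages into the
// model's prompt format without generating anything. An input such as
//
//   {"messages": [{"role": "user", "content": "Hello"}]}
//
// returns {"prompt": "..."} where the value is the chat-template expansion
// (its exact shape depends on the loaded model's template).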

std::string LLMService::tokenize_json(const json &body)
{
    return encapsulate_route(body, routes->post_tokenize);
}

std::string LLMService::detokenize_json(const json &body)
{
    return encapsulate_route(body, routes->post_detokenize);
}

std::string LLMService::embeddings_json(const json &body)
{
    return encapsulate_route(body, routes->post_embeddings);
}

std::string LLMService::lora_weight_json(const json &body)
{
    // encapsulate_route already returns a JSON string; wrapping it in
    // safe_json_to_str again would double-encode it
    return encapsulate_route(body, routes->post_lora_adapters);
}

std::string LLMService::lora_list_json()
{
    return encapsulate_route({}, routes->get_lora_adapters);
}

std::string LLMService::completion_json(const json &data_in, CharArrayFn callback, bool callbackWithJSON)
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return "";

    try
    {
        bool stream = json_value(data_in, "stream", callback != nullptr);
        json data = data_in;
        data["stream"] = stream;

        server_http_req req{ {}, {}, "", "", data.dump(), always_false };
        auto result = routes->post_completions(req);
        if (result->status != 200)
        {
            return result->data;
        }

        if (stream)
        {
            ResponseConcatenator concatenator;
            if (callback) concatenator.set_callback(callback, callbackWithJSON);
            while (!concatenator.is_complete()) {
                std::string chunk;
                bool has_next = result->next(chunk);
                if (!chunk.empty()) {
                    if (!concatenator.process_chunk(chunk)) break;
                }
                if (!has_next) break;
            }
            return concatenator.get_result_json();
        } else {
            return result->data;
        }
    }
    catch (...)
    {
        handle_exception();
    }
    return "";
}
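
// Usage sketch (hypothetical callback; assumes CharArrayFn is a plain function
// pointer receiving the chunk text): when a callback is supplied, streaming is
// enabled by default, each chunk is forwarded as it arrives, and the full
// accumulated result is still returned at the end.
//
//   static void on_chunk(const char *text) { fputs(text, stdout); }
//
//   json req = {{"prompt", "Once upon a time"}, {"n_predict", 64}};
//   std::string full = svc.completion_json(req, on_chunk, /*callbackWithJSON*/ false);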

std::string LLMService::slot_json(const json &data)
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return "";
    std::string result_data = "";
    try
    {
        server_task_type task_type;
        std::string action = data.at("action");
        if (action == "save")
        {
            task_type = SERVER_TASK_TYPE_SLOT_SAVE;
        }
        else if (action == "restore")
        {
            task_type = SERVER_TASK_TYPE_SLOT_RESTORE;
        }
        else if (action == "erase")
        {
            task_type = SERVER_TASK_TYPE_SLOT_ERASE;
        }
        else
        {
            throw std::runtime_error("Invalid action: " + action);
        }

        int id_slot = json_value(data, "id_slot", 0);

        server_task task(task_type);
        task.id = ctx_server->impl->queue_tasks.get_new_id();
        task.slot_action.id_slot = id_slot;

        if (action == "save" || action == "restore")
        {
            std::string filepath = data.at("filepath");
            task.slot_action.filename = filepath.substr(filepath.find_last_of("/\\") + 1);
            task.slot_action.filepath = filepath;
        }

        ctx_server->impl->queue_results.add_waiting_task_id(task.id);
        ctx_server->impl->queue_tasks.post(std::move(task));

        server_task_result_ptr result = ctx_server->impl->queue_results.recv(task.id);
        ctx_server->impl->queue_results.remove_waiting_task_id(task.id);

        json result_json = result->to_json();
        result_data = result_json.dump();
    }
    catch (...)
    {
        handle_exception();
    }
    return result_data;
}
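
// Example (illustrative, hypothetical file path): saving and later restoring the
// state of slot 0:
//
//   svc.slot_json({{"action", "save"},    {"id_slot", 0}, {"filepath", "cache/slot0.bin"}});
//   svc.slot_json({{"action", "restore"}, {"id_slot", 0}, {"filepath", "cache/slot0.bin"}});
//
// "erase" needs only "action" and "id_slot"; the filename component of "filepath"
// is derived from the last path separator, as shown above.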

void LLMService::cancel(int id_slot)
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return;
    try
    {
        for (auto &slot : ctx_server->impl->slots)
        {
            if (slot.id == id_slot)
            {
                release_slot(slot);
                break;
            }
        }
    }
    catch (...)
    {
        handle_exception();
    }
}

int LLMService::embedding_size()
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return 0;
    if (ctx_server == nullptr)
        return 0;
    return ctx_server->get_meta().model_n_embd_inp;
}

std::unique_ptr<server_http_res> LLMService::get_props()
{
    if (get_status_code() < 0 || setjmp(get_jump_point()) != 0)
        return nullptr;

    server_http_req req{ {}, {}, "", "", "", always_false };
    auto result = routes->get_props(req);

    json data = json::parse(result->data);
    int n_ctx = -1;
    try
    {
        n_ctx = data.at("default_generation_settings").at("n_ctx").get<int>();
    }
    catch (...) {}

    result->data = safe_json_to_str(json {
        { "default_generation_settings", {
            { "n_ctx", n_ctx }
        }}
    });
    return result;
}
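
// Example (illustrative): get_props reduces the full /props payload to the one
// field the bindings consume, e.g.
//
//   {"default_generation_settings": {"n_ctx": 4096}}
//
// with n_ctx = -1 when the field cannot be parsed from the underlying response.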

//=========================== API ===========================//

void LLMService_Registry(LLMProviderRegistry *existing_instance)
{
    LLMProviderRegistry::inject_registry(existing_instance);
}

bool LLMService_Supports_GPU()
{
    return llama_supports_gpu_offload();
}

LLMService *LLMService_Construct(const char *model_path, int num_slots, int num_threads, int num_GPU_layers, bool flash_attention, int context_size, int batch_size, bool embedding_only, int lora_count, const char **lora_paths)
{
    std::vector<std::string> lora_paths_vector;
    if (lora_paths != nullptr && lora_count > 0)
    {
        for (int i = 0; i < lora_count; ++i)
        {
            lora_paths_vector.push_back(std::string(lora_paths[i]));
        }
    }
    LLMService *llmService = new LLMService(model_path, num_slots, num_threads, num_GPU_layers, flash_attention, context_size, batch_size, embedding_only, lora_paths_vector);
    if (get_status_code() != 0)
    {
        delete llmService;
        return nullptr;
    }
    return llmService;
}

LLMService *LLMService_From_Command(const char *params_string_arr)
{
    LLMService *llmService = nullptr;
    std::string params_string(params_string_arr);
    try
    {
        json j = json::parse(params_string);
        llmService = LLMService::from_params(j);
    }
    catch (const json::parse_error &)
    {
        llmService = LLMService::from_command(params_string);
    }

    if (get_status_code() != 0)
    {
        delete llmService;
        return nullptr;
    }
    return llmService;
}

const char *LLMService_Command(LLMService *llm_service)
{
    return stringToCharArray(llm_service->get_command());
}

void LLMService_InjectErrorState(ErrorState *error_state)
{
    ErrorState::inject_error_state(error_state);
}
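
// Usage sketch for the C API (hypothetical model path): the same service can be
// created from a command string or from a JSON string, and the result must be
// checked against nullptr, since construction errors are reported through the
// shared status code rather than exceptions.
//
//   LLMService *svc = LLMService_From_Command("-m models/llama.gguf -c 4096");
//   if (svc != nullptr) {
//       const char *cmd = LLMService_Command(svc); // echo back the construct command
//   }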