3#if !(TARGET_OS_IOS || TARGET_OS_VISION)
4std::atomic_flag sigint_terminating = ATOMIC_FLAG_INIT;
6void llm_sigint_signal_handler(
int sig)
8 if (sigint_terminating.test_and_set())
12 fprintf(stderr,
"Received second interrupt, terminating immediately.\n");
29 static std::once_flag initialized;
30 std::call_once(initialized, [](){
32#if !(TARGET_OS_IOS || TARGET_OS_VISION)
33 register_sigint_hook(llm_sigint_signal_handler);
46std::string
LLM::LLM_args_to_command(
const std::string &model_path,
int num_slots,
int num_threads,
int num_GPU_layers,
bool flash_attention,
int context_size,
int batch_size,
bool embedding_only,
const std::vector<std::string> &lora_paths)
48 std::string command =
"-m \"" + model_path +
"\"" +
49 " -t " + std::to_string(num_threads) +
50 " -np " + std::to_string(num_slots) +
51 " -c " + std::to_string(context_size) +
52 " -b " + std::to_string(batch_size);
54 if (num_GPU_layers > 0)
55 command +=
" -ngl " + std::to_string(num_GPU_layers);
56 command +=
" --context-shift";
58 command += flash_attention ?
"on" :
"off";
60 command +=
" --embedding";
61 for (
const auto &lora_path : lora_paths)
62 command +=
" --lora \"" + lora_path +
"\"";
68 std::istringstream iss(command);
69 std::vector<std::string> args;
75 args.push_back(token);
78 for (
size_t i = 0; i < args.size(); ++i)
80 const std::string &arg = args[i];
83 if (arg ==
"-ngl" || arg ==
"--gpu-layers" || arg ==
"--n-gpu-layers")
85 if (i + 1 < args.size())
89 int val = std::stoi(args[i + 1]);
100 size_t eqPos = arg.find(
'=');
101 if (eqPos != std::string::npos)
103 std::string key = arg.substr(0, eqPos);
104 std::string value = arg.substr(eqPos + 1);
106 if (key ==
"-ngl" || key ==
"--gpu-layers" || key ==
"--n-gpu-layers")
110 int val = std::stoi(value);
129 j[
"messages"] = messages;
137 return result.at(
"prompt").get<std::string>();
139 catch (
const std::exception &)
155 j[
"content"] = query;
163 return result.at(
"tokens").get<std::vector<int>>();
165 catch (
const std::exception &)
182 j[
"tokens"] = tokens;
190 return result.at(
"content").get<std::string>();
192 catch (
const std::exception &)
208 j[
"content"] = query;
216 const json& emb = result.at(0).at(
"embedding");
217 if (emb.is_array() && !emb.empty())
219 if (emb[0].is_number())
return emb.get<std::vector<float>>();
220 if (emb[0].is_array())
return emb.at(0).get<std::vector<float>>();
223 catch (
const std::exception &)
239 j[
"prompt"] = prompt;
240 j[
"id_slot"] = id_slot;
247 j[
"json_schema"] = json::parse(
grammar);
249 catch (
const json::parse_error &)
259 j[it.key()] = it.value();
269 if (result.contains(
"error")) {
270 json error = result.at(
"error");
271 int code = error.at(
"code").get<
int>();
272 std::string message = error.at(
"message").get<std::string>();
276 return result.at(
"content").get<std::string>();
278 catch (
const std::exception &)
284std::string
LLM::completion(
const std::string &prompt, CharArrayFn callback,
int id_slot,
bool return_response_json)
290 if (return_response_json)
300 j[
"id_slot"] = id_slot;
301 j[
"action"] = action;
302 j[
"filepath"] = filepath;
310 return result.at(
"filename").get<std::string>();
312 catch (
const std::exception &)
318std::string
LLMLocal::slot(
int id_slot,
const std::string &action,
const std::string &filepath)
334 json j = json::array();
335 for (
const auto &lora : loras)
337 j.push_back({{
"id", lora.id},
338 {
"scale", lora.scale}});
347 return result.at(
"success").get<
bool>();
349 catch (
const std::exception &)
364 json j = json::array();
365 for (
const auto &lora : loras)
367 j.push_back({{
"id", lora.id},
368 {
"scale", lora.scale},
369 {
"path", lora.path}});
376 std::vector<LoraIdScalePath> loras;
379 for (
const auto &lora : result)
381 loras.push_back({lora[
"id"].get<
int>(),
382 lora[
"scale"].get<float>(),
383 lora[
"path"].get<std::string>()});
386 catch (
const std::exception &)
410 inst->debug(debug_level);
420 inst->logging_callback(callback);
430const bool IsDebuggerAttached(
void)
433 return ::IsDebuggerPresent();
435 return AmIBeingDebugged();
437 return debuggerIsAttached();
447 return stringToCharArray(result.dump());
452 return stringToCharArray(llm->
detokenize(json::parse(tokens_as_json)));
458 return stringToCharArray(result.dump());
461const char *
LLM_Completion(
LLM *llm,
const char *prompt, CharArrayFn callback,
int id_slot,
bool return_response_json)
463 return stringToCharArray(llm->
completion(prompt, callback, id_slot, return_response_json));
468 json params = json::parse(params_json ? params_json :
"{}");
484 return stringToCharArray(llm->
grammar);
489 return stringToCharArray(llm->
apply_template(json::parse(messages_as_json)));
499 return stringToCharArray(llm->
save_slot(id_slot, filepath));
504 return stringToCharArray(llm->
load_slot(id_slot, filepath));
516 json loras_arr = json::array();
517 loras_arr = json::parse(loras_as_json);
518 std::vector<LoraIdScale> loras;
519 for (
const auto &lora : loras_arr)
521 loras.push_back({lora[
"id"].get<
int>(), lora[
"scale"].get<float>()});
525 catch (
const std::exception &)
533 std::vector<LoraIdScalePath> loras = llm->
lora_list();
534 json j = json::array();
535 for (
const auto &lora : loras)
537 j.push_back({{
"id", lora.id},
538 {
"scale", lora.scale}});
540 return stringToCharArray(j.dump());
588 llm->
set_SSL(SSL_cert, SSL_key);
593 return get_status_code();
598 std::string result = get_status_message();
599 return stringToCharArray(result);
Core LLM functionality interface and base classes.
void ensure_error_handlers_initialized()
Ensures error handlers are properly initialized.
Abstract class for local LLM operations with slot management.
virtual std::string slot_json(const json &data)=0
Manage slots with HTTP response support.
virtual std::string load_slot(int id_slot, const std::string &filepath)
Load slot state from file.
virtual std::string save_slot(int id_slot, const std::string &filepath)
Save slot state to file.
virtual std::string slot(int id_slot, const std::string &action, const std::string &filepath)
Perform slot operation.
virtual void cancel(int id_slot)=0
Cancel request.
virtual json build_slot_json(int id_slot, const std::string &action, const std::string &filepath)
Build JSON for slot operations.
virtual std::string parse_slot_json(const json &result)
Parse slot operation result.
Registry for managing LLM provider instances.
std::vector< LLMProvider * > get_instances()
Get all registered provider instances.
void set_debug_level(int level)
Set global debug level.
static bool initialised
Whether the registry has been initialized.
void set_log_callback(CharArrayFn callback)
Set global log callback.
static LLMProviderRegistry & instance()
Get the singleton registry instance.
Abstract class for LLM service providers.
virtual void logging_callback(CharArrayFn callback)=0
Set logging callback function.
virtual bool started()=0
Check if service is started.
virtual void start_server(const std::string &host="0.0.0.0", int port=-1, const std::string &API_key="")=0
Start HTTP server.
virtual void logging_stop()
Stop logging.
virtual void join_service()=0
Wait for service thread to complete.
virtual void stop_server()=0
Stop HTTP server.
virtual bool parse_lora_weight_json(const json &result)
Parse LoRA weight configuration result.
virtual void join_server()=0
Wait for server thread to complete.
virtual void enable_reasoning(bool reasoning)
Enable reasoning.
virtual json build_lora_weight_json(const std::vector< LoraIdScale > &loras)
Build JSON for LoRA weight configuration.
virtual bool lora_weight(const std::vector< LoraIdScale > &loras)
Configure LoRA weights.
virtual std::vector< LoraIdScalePath > lora_list()
List available LoRA adapters.
virtual void stop()=0
Stop the LLM service.
virtual void set_SSL(const std::string &SSL_cert, const std::string &SSL_key)=0
Configure SSL certificates.
virtual std::string lora_list_json()=0
List available LoRA adapters.
virtual ~LLMProvider()
Virtual destructor.
virtual std::string lora_weight_json(const json &data)=0
Configure LoRA weights with HTTP response support.
virtual void start()=0
Start the LLM service.
virtual std::vector< LoraIdScalePath > parse_lora_list_json(const json &result)
Parse LoRA list result.
virtual json build_lora_list_json(const std::vector< LoraIdScalePath > &loras)
Build JSON for LoRA list result.
virtual int embedding_size()=0
Get embedding vector size.
Abstract base class for Large Language Model operations.
virtual json build_tokenize_json(const std::string &query)
Build JSON for tokenization.
virtual std::string embeddings_json(const json &data)=0
Generate embeddings with HTTP response support.
int32_t n_keep
Number of tokens to keep from the beginning of the context.
virtual json build_apply_template_json(const json &messages)
Build JSON for template application.
virtual std::string parse_apply_template_json(const json &result)
Parse template application result.
virtual std::vector< int > parse_tokenize_json(const json &result)
Parse tokenization result.
virtual std::string apply_template(const json &messages)
Apply template to messages.
virtual json build_detokenize_json(const std::vector< int32_t > &tokens)
Build JSON for detokenization.
virtual std::string parse_completion_json(const json &result)
Parse completion result.
virtual std::string apply_template_json(const json &data)=0
Apply a chat template to message data.
virtual json build_completion_json(const std::string &prompt, int id_slot=-1)
Build JSON for completion generation.
virtual std::string tokenize_json(const json &data)=0
Tokenize input (override)
virtual void set_completion_params(json completion_params_)
Set completion parameters.
virtual std::string detokenize(const std::vector< int32_t > &tokens)
Convert tokens to text.
json completion_params
JSON object containing completion parameters.
virtual std::vector< int > tokenize(const std::string &query)
Tokenize text.
virtual std::string completion(const std::string &prompt, CharArrayFn callback=nullptr, int id_slot=-1, bool return_response_json=false)
Generate completion.
virtual void set_grammar(std::string grammar_)
Set grammar for constrained generation.
static std::string LLM_args_to_command(const std::string &model_path, int num_slots=1, int num_threads=-1, int num_GPU_layers=0, bool flash_attention=false, int context_size=4096, int batch_size=2048, bool embedding_only=false, const std::vector< std::string > &lora_paths={})
Convert LLM parameters to command line arguments.
virtual std::vector< float > embeddings(const std::string &query)
Generate embeddings.
static bool has_gpu_layers(const std::string &command)
Check if command line arguments specify GPU layers.
virtual std::string completion_json(const json &data, CharArrayFn callback, bool callbackWithJSON)=0
Generate text completion.
std::string grammar
Grammar specification in GBNF format or JSON schema.
virtual std::string detokenize_json(const json &data)=0
Convert tokens back to text.
virtual std::vector< float > parse_embeddings_json(const json &result)
Parse embeddings result.
virtual json build_embeddings_json(const std::string &query)
Build JSON for embeddings generation.
virtual std::string parse_detokenize_json(const json &result)
Parse detokenization result.
const char * LLM_Lora_List(LLMProvider *llm)
List LoRA adapters (C API)
const char * LLM_Get_Grammar(LLM *llm)
Get grammar (C API)
void LLM_Stop(LLMProvider *llm)
Stop LLM service (C API)
void LLM_Enable_Reasoning(LLMProvider *llm, bool enable_reasoning)
Enable reasoning (C API)
const char * LLM_Save_Slot(LLMLocal *llm, int id_slot, const char *filepath)
Save slot state (C API)
void LLM_Logging_Callback(CharArrayFn callback)
Set global logging callback (C API)
const char * LLM_Load_Slot(LLMLocal *llm, int id_slot, const char *filepath)
Load slot state (C API)
void LLM_Join_Service(LLMProvider *llm)
Wait for service to complete (C API)
void LLM_Set_SSL(LLMProvider *llm, const char *SSL_cert, const char *SSL_key)
Set SSL configuration (C API)
bool Has_GPU_Layers(const char *command)
Check if command has GPU layers (C API)
const char * LLM_Status_Message()
Get last operation status message (C API)
void LLM_Set_Grammar(LLM *llm, const char *grammar="")
Set grammar (C API)
const char * LLM_Apply_Template(LLM *llm, const char *messages_as_json)
Apply chat template (C API)
void LLM_Cancel(LLMLocal *llm, int id_slot)
Cancel request (C API)
void LLM_Logging_Stop()
Stop global logging (C API)
void LLM_Start(LLMProvider *llm)
Start LLM service (C API)
void LLM_Set_Completion_Parameters(LLM *llm, const char *params_json="{}")
Set completion parameters (C API)
const int LLM_Embedding_Size(LLMProvider *llm)
Get embedding vector size (C API)
void LLM_Delete(LLMProvider *llm)
Delete LLM provider (C API)
void LLM_Debug(int debug_level)
Set global debug level (C API)
bool LLM_Lora_Weight(LLMProvider *llm, const char *loras_as_json)
Configure LoRA weights (C API)
const char * LLM_Completion(LLM *llm, const char *prompt, CharArrayFn callback=nullptr, int id_slot=-1, bool return_response_json=false)
Generate completion (C API)
const char * LLM_Tokenize(LLM *llm, const char *query)
Tokenize text (C API)
void LLM_Join_Server(LLMProvider *llm)
Wait for server to complete (C API)
const bool LLM_Started(LLMProvider *llm)
Check if service is started (C API)
const char * LLM_Get_Completion_Parameters(LLM *llm)
Get completion parameters (C API)
const char * LLM_Embeddings(LLM *llm, const char *query)
Generate embeddings (C API)
const int LLM_Status_Code()
Get last operation status code (C API)
void LLM_Stop_Server(LLMProvider *llm)
Stop HTTP server (C API)
const char * LLM_Detokenize(LLM *llm, const char *tokens_as_json)
Detokenize tokens (C API)
void LLM_Start_Server(LLMProvider *llm, const char *host="0.0.0.0", int port=-1, const char *API_key="")
Start HTTP server (C API)