#include "error_handling.h"
#include <TargetConditionals.h>

class UNDREAMAI_API LLM
std::string grammar = "";
virtual std::vector<int> tokenize(const std::string &query);
virtual std::string detokenize(const std::vector<int32_t> &tokens);
virtual std::vector<float> embeddings(const std::string &query);
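A minimal sketch of the tokenize/detokenize round trip against these declarations; the LLM reference is assumed to come from a concrete subclass created elsewhere:

// Sketch only: "llm" refers to a concrete LLM subclass instance.
void roundtrip_example(LLM &llm) {
    std::vector<int> tokens = llm.tokenize("Hello world");
    // detokenize expects int32_t tokens, so convert explicitly.
    std::vector<int32_t> tokens32(tokens.begin(), tokens.end());
    std::string text = llm.detokenize(tokens32);    // expected: "Hello world"
    std::vector<float> vec = llm.embeddings(text);  // one embedding vector
}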
virtual std::string completion(const std::string &prompt, CharArrayFn callback = nullptr, int id_slot = -1, bool return_response_json = false);
virtual std::string completion_json(const json &data, CharArrayFn callback, bool callbackWithJSON) = 0;
virtual void set_grammar(std::string grammar_) { grammar = grammar_; }
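One possible streaming completion with a constrained grammar; this assumes CharArrayFn is compatible with a plain void(*)(const char *) function pointer and that the grammar string uses a GBNF-like format (both assumptions, not confirmed by this listing):

#include <cstdio>
// Sketch: print each streamed chunk as it arrives.
void on_chunk(const char *chunk) { std::fputs(chunk, stdout); }

void constrained_completion(LLM &llm) {
    llm.set_grammar("root ::= \"yes\" | \"no\"");  // assumed GBNF-style grammar
    std::string reply = llm.completion("Is water wet?", on_chunk);
}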
virtual std::string apply_template(const json &messages);
static bool has_gpu_layers(const std::string &command);
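For apply_template, the messages are assumed to follow the usual chat-completion shape (role/content pairs); a short sketch:

// Sketch: build a chat prompt from role/content messages (assumed shape).
void template_example(LLM &llm) {
    json messages = json::array({
        {{"role", "system"}, {"content", "You are helpful."}},
        {{"role", "user"},   {"content", "Hi!"}}
    });
    std::string prompt = llm.apply_template(messages);
}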
static std::string LLM_args_to_command(const std::string &model_path,
                                       int num_slots = 1,
                                       int num_threads = -1,
                                       int num_GPU_layers = 0,
                                       bool flash_attention = false,
                                       int context_size = 4096,
                                       int batch_size = 2048,
                                       bool embedding_only = false,
                                       const std::vector<std::string> &lora_paths = {});
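These two statics pair naturally: LLM_args_to_command builds a server command line from typed arguments, and has_gpu_layers inspects such a command. A short sketch with an illustrative model path:

// Sketch: the model path is illustrative.
std::string cmd = LLM::LLM_args_to_command("models/model.gguf",
                                           /*num_slots=*/2,
                                           /*num_threads=*/-1,
                                           /*num_GPU_layers=*/32);
bool uses_gpu = LLM::has_gpu_layers(cmd);  // expected true: GPU layers requested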
virtual json build_apply_template_json(const json &messages);
virtual std::string parse_apply_template_json(const json &result);
virtual json build_tokenize_json(const std::string &query);
virtual std::vector<int> parse_tokenize_json(const json &result);
virtual json build_detokenize_json(const std::vector<int32_t> &tokens);
virtual std::string parse_detokenize_json(const json &result);
virtual json build_embeddings_json(const std::string &query);
virtual std::vector<float> parse_embeddings_json(const json &result);
virtual json build_completion_json(const std::string &prompt, int id_slot = -1);
virtual std::string parse_completion_json(const json &result);
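Each high-level operation is split into a build_*_json step that constructs the request payload and a parse_*_json step that extracts the typed result, leaving only the JSON transport to the subclass. A sketch of that pattern using the tokenize pair (how the library itself composes the two halves is an assumption):

// Sketch: compose the build/parse halves around the subclass transport.
void tokenize_via_json(LLM &llm) {
    json request = llm.build_tokenize_json("Hello world");
    json result = json::parse(llm.tokenize_json(request));  // transport step
    std::vector<int> tokens = llm.parse_tokenize_json(result);
}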
virtual std::string save_slot(int id_slot, const std::string &filepath) { return slot(id_slot, "save", filepath); }
virtual std::string load_slot(int id_slot, const std::string &filepath) { return slot(id_slot, "restore", filepath); }
virtual std::string slot(int id_slot, const std::string &action, const std::string &filepath);
virtual json build_slot_json(int id_slot, const std::string &action, const std::string &filepath);
virtual std::string parse_slot_json(const json &result);
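save_slot and load_slot are thin wrappers that forward to slot() with the "save" and "restore" actions. A sketch of caching a slot's state to disk, assuming an LLMLocal-style object that combines completion with the slot API (get_next_available_slot is declared further below):

// Sketch: persist a slot's state and restore it later; the path is illustrative.
void slot_cache_example(LLMLocal &llm) {
    int id = llm.get_next_available_slot();
    llm.completion("Long shared prefix ...", nullptr, id);
    llm.save_slot(id, "cache/prefix.bin");  // forwards to slot(id, "save", ...)
    // ... other work in the same slot ...
    llm.load_slot(id, "cache/prefix.bin");  // forwards to slot(id, "restore", ...)
}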
virtual bool lora_weight(const std::vector<LoraIdScale> &loras);
virtual std::vector<LoraIdScalePath> lora_list();
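A sketch of rescaling a loaded LoRA adapter; members are set by name because the structs' declaration order is not shown in this listing:

// Sketch: halve the strength of the first listed adapter.
void lora_example(LLM &llm) {
    std::vector<LoraIdScalePath> loaded = llm.lora_list();
    if (!loaded.empty()) {
        LoraIdScale update;
        update.id = loaded[0].id;
        update.scale = 0.5f;
        bool ok = llm.lora_weight({update});
    }
}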
virtual void debug(int debug_level) = 0;
virtual void logging_stop();
virtual void start_server(const std::string &host = "0.0.0.0", int port = -1, const std::string &API_key = "") = 0;
virtual void set_SSL(const std::string &SSL_cert, const std::string &SSL_key) = 0;
bool reasoning_enabled = false;
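Bringing up the HTTP server with SSL might look as follows; whether set_SSL takes file paths or PEM contents is not stated here, so the paths are an assumption:

// Sketch: serve on all interfaces with an API key; stop_server is declared below.
void server_example(LLMProvider &provider) {
    provider.set_SSL("certs/server.crt", "certs/server.key");  // assumed paths
    provider.start_server("0.0.0.0", 8080, "my-api-key");
    // ... serve requests ...
    provider.stop_server();
}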
virtual bool parse_lora_weight_json(const json &result);
virtual json build_lora_weight_json(const std::vector<LoraIdScale> &loras);
virtual std::vector<LoraIdScalePath> parse_lora_list_json(const json &result);
virtual json build_lora_list_json(const std::vector<LoraIdScalePath> &loras);
if (custom_instance_)
    return *custom_instance_;

std::lock_guard<std::mutex> lock(mutex_);
instances_.push_back(service);

std::lock_guard<std::mutex> lock(mutex_);
instances_.erase(std::remove(instances_.begin(), instances_.end(), service), instances_.end());

std::lock_guard<std::mutex> lock(mutex_);

debug_level_ = level;

log_callback_ = callback;

return log_callback_;

std::vector<LLMProvider *> instances_;
int debug_level_ = 0;
CharArrayFn log_callback_ = nullptr;
UNDREAMAI_API void LLM_Debug(int debug_level);
UNDREAMAI_API const bool IsDebuggerAttached(void);
UNDREAMAI_API const char *LLM_Completion(LLM *llm, const char *prompt, CharArrayFn callback = nullptr, int id_slot = -1, bool return_response_json = false);
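A sketch of the C-level completion call with status checking; no constructor function appears in this listing, so the LLM pointer is assumed to come from elsewhere, and a non-zero status code is assumed to signal an error:

#include <cstdio>
// Sketch: llm is an LLM * obtained elsewhere.
void c_api_example(LLM *llm) {
    const char *out = LLM_Completion(llm, "Hello");
    if (LLM_Status_Code() != 0)  // non-zero assumed to mean failure
        std::fprintf(stderr, "error: %s\n", LLM_Status_Message());
    else
        std::puts(out);
}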
void ensure_error_handlers_initialized()
Ensure error handlers are initialized.
Abstract class for local LLM operations with slot management.
virtual std::string slot_json(const json &data)=0
Manage slots with HTTP response support.
virtual std::string load_slot(int id_slot, const std::string &filepath)
Load slot state from file.
virtual int get_next_available_slot()=0
Get an available processing slot.
virtual std::string save_slot(int id_slot, const std::string &filepath)
Save slot state to file.
virtual void cancel(int id_slot)=0
Cancel the request running in the given slot.
Registry for managing LLM provider instances.
void unregister_instance(LLMProvider *service)
Unregister an LLM provider instance.
std::vector<LLMProvider *> get_instances()
Get all registered provider instances.
void set_debug_level(int level)
Set global debug level.
const int get_debug_level()
Get current debug level.
static bool initialised
Whether the registry has been initialized.
void set_log_callback(CharArrayFn callback)
Set global log callback.
const CharArrayFn get_log_callback()
Get current log callback.
void register_instance(LLMProvider *service)
Register an LLM provider instance.
static LLMProviderRegistry &instance()
Get the singleton registry instance.
static void inject_registry(LLMProviderRegistry *instance)
Inject a custom registry instance.
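Configuring the singleton registry might look as follows; passing a captureless lambda assumes CharArrayFn is a plain function pointer type:

#include <cstdio>
// Sketch: route library logs to stderr and raise verbosity.
void registry_example() {
    LLMProviderRegistry &reg = LLMProviderRegistry::instance();
    reg.set_debug_level(2);
    reg.set_log_callback([](const char *msg) { std::fputs(msg, stderr); });
}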
Abstract class for LLM service providers.
virtual void logging_callback(CharArrayFn callback)=0
Set logging callback function.
virtual bool started()=0
Check if service is started.
virtual void start_server(const std::string &host="0.0.0.0", int port=-1, const std::string &API_key="")=0
Start HTTP server.
virtual std::string debug_implementation()=0
Return implementation-specific debug information.
virtual void join_service()=0
Wait for service thread to complete.
virtual void stop_server()=0
Stop HTTP server.
virtual void debug(int debug_level)=0
Set debug level.
virtual void join_server()=0
Wait for server thread to complete.
virtual void enable_reasoning(bool reasoning)
Enable or disable reasoning.
virtual void stop()=0
Stop the LLM service.
virtual void set_SSL(const std::string &SSL_cert, const std::string &SSL_key)=0
Configure SSL certificates.
virtual std::string lora_list_json()=0
List available LoRA adapters.
virtual std::string lora_weight_json(const json &data)=0
Configure LoRA weights with HTTP response support.
virtual void start()=0
Start the LLM service.
virtual int embedding_size()=0
Get embedding vector size.
Abstract base class for Large Language Model operations.
virtual std::string embeddings_json(const json &data)=0
Generate embeddings with HTTP response support.
virtual std::string get_completion_params()
Get current completion parameters.
virtual std::string apply_template_json(const json &data)=0
Apply a chat template to message data.
virtual std::string tokenize_json(const json &data)=0
Tokenize input with HTTP response support.
virtual void set_completion_params(json completion_params_)
Set completion parameters.
json completion_params
JSON object containing completion parameters.
virtual void set_grammar(std::string grammar_)
Set grammar for constrained generation.
virtual ~LLM()=default
Virtual destructor.
virtual std::string completion_json(const json &data, CharArrayFn callback, bool callbackWithJSON)=0
Generate text completion.
virtual std::string detokenize_json(const json &data)=0
Convert tokens back to text.
virtual std::string get_grammar()
Get current grammar specification.
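A sketch of adjusting completion parameters on the LLM base class; the parameter names shown (temperature, n_predict) are llama.cpp-style and are an assumption about the accepted keys:

// Sketch: set sampling parameters, then read state back.
void params_example(LLM &llm) {
    llm.set_completion_params(json{{"temperature", 0.7}, {"n_predict", 256}});
    std::string params = llm.get_completion_params();
    std::string grammar = llm.get_grammar();
}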
File with basic definitions.
const char * LLM_Lora_List(LLMProvider *llm)
List LoRA adapters (C API)
const char * LLM_Get_Grammar(LLM *llm)
Get grammar (C API)
void LLM_Stop(LLMProvider *llm)
Stop LLM service (C API)
void LLM_Enable_Reasoning(LLMProvider *llm, bool enable_reasoning)
Enable reasoning (C API)
const char * LLM_Save_Slot(LLMLocal *llm, int id_slot, const char *filepath)
Save slot state (C API)
void LLM_Logging_Callback(CharArrayFn callback)
Set global logging callback (C API)
const char * LLM_Load_Slot(LLMLocal *llm, int id_slot, const char *filepath)
Load slot state (C API)
void LLM_Join_Service(LLMProvider *llm)
Wait for service to complete (C API)
void LLM_Set_SSL(LLMProvider *llm, const char *SSL_cert, const char *SSL_key)
Set SSL configuration (C API)
bool Has_GPU_Layers(const char *command)
Check if command has GPU layers (C API)
const char * LLM_Status_Message()
Get last operation status message (C API)
void LLM_Set_Grammar(LLM *llm, const char *grammar="")
Set grammar (C API)
const char * LLM_Apply_Template(LLM *llm, const char *messages_as_json)
Apply chat template (C API)
void LLM_Cancel(LLMLocal *llm, int id_slot)
Cancel request (C API)
void LLM_Logging_Stop()
Stop global logging (C API)
void LLM_Start(LLMProvider *llm)
Start LLM service (C API)
void LLM_Set_Completion_Parameters(LLM *llm, const char *params_json="{}")
Set completion parameters (C API)
const int LLM_Embedding_Size(LLMProvider *llm)
Get embedding vector size (C API)
void LLM_Delete(LLMProvider *llm)
Delete LLM provider (C API)
void LLM_Debug(int debug_level)
Set global debug level (C API)
bool LLM_Lora_Weight(LLMProvider *llm, const char *loras_as_json)
Configure LoRA weights (C API)
const char * LLM_Completion(LLM *llm, const char *prompt, CharArrayFn callback=nullptr, int id_slot=-1, bool return_response_json=false)
Generate completion (C API)
const char * LLM_Tokenize(LLM *llm, const char *query)
Tokenize text (C API)
void LLM_Join_Server(LLMProvider *llm)
Wait for server to complete (C API)
const bool LLM_Started(LLMProvider *llm)
Check if service is started (C API)
const char * LLM_Get_Completion_Parameters(LLM *llm)
Get completion parameters (C API)
const char * LLM_Embeddings(LLM *llm, const char *query)
Generate embeddings (C API)
const int LLM_Status_Code()
Get last operation status code (C API)
void LLM_Stop_Server(LLMProvider *llm)
Stop HTTP server (C API)
const char * LLM_Detokenize(LLM *llm, const char *tokens_as_json)
Detokenize tokens (C API)
void LLM_Start_Server(LLMProvider *llm, const char *host="0.0.0.0", int port=-1, const char *API_key="")
Start HTTP server (C API)
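A sketch of the service and server lifecycle through the C API, assuming the LLMProvider pointer is created elsewhere:

// Sketch: start the service, expose it over HTTP, then tear everything down.
void c_lifecycle_example(LLMProvider *provider) {
    LLM_Start(provider);
    LLM_Start_Server(provider, "0.0.0.0", 8080);
    // ... serve requests ...
    LLM_Stop_Server(provider);
    LLM_Stop(provider);
    LLM_Delete(provider);
}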
Structure representing a LoRA adapter with ID, scale, and file path.
bool operator==(const LoraIdScalePath &other) const
Equality comparison operator.
std::string path
Filesystem path to the LoRA adapter file.
int id
Unique identifier for the LoRA adapter.
float scale
Scale factor for the LoRA adapter.
Structure representing a LoRA adapter with ID and scale.
bool operator==(const LoraIdScale &other) const
Equality comparison operator.
float scale
Scale factor for the LoRA adapter (typically 0.0 to 1.0)
int id
Unique identifier for the LoRA adapter.
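For illustration, the equality operators compare adapter configurations; members are set by name since the declaration order is not visible here:

// Sketch: compare two adapter configurations via operator==.
LoraIdScale a; a.id = 0; a.scale = 1.0f;
LoraIdScale b; b.id = 0; b.scale = 0.5f;
bool same = (a == b);  // false: scales differ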