LlamaLib v2.0.2
Cross-platform library for local LLMs
Runtime loader for LLM libraries.
#include <LLM_runtime.h>
Public Member Functions | |
| LLMService () | |
| Default constructor. | |
| LLMService (const std::string &model_path, int num_slots=1, int num_threads=-1, int num_GPU_layers=0, bool flash_attention=false, int context_size=4096, int batch_size=2048, bool embedding_only=false, const std::vector< std::string > &lora_paths={}) | |
| Parameterized constructor. | |
| ~LLMService () | |
| Destructor. | |
| bool | create_LLM_library (const std::string &command) |
| Loads the LLM library dynamically according to the underlying architecture and creates an LLM based on the command. | |
| std::string | tokenize_json (const json &data) override |
| Tokenize input (override) | |
| std::string | detokenize_json (const json &data) override |
| Convert tokens back to text. | |
| std::string | embeddings_json (const json &data) override |
| Generate embeddings with HTTP response support. | |
| std::string | completion_json (const json &data, CharArrayFn callback=nullptr, bool callbackWithJSON=true) override |
| Generate completion (override - delegates to loaded library) | |
| std::string | apply_template_json (const json &data) override |
| Apply a chat template to message data. | |
| void | cancel (int id_slot) override |
| Cancel request (override - delegates to loaded library) | |
| std::string | lora_weight_json (const json &data) override |
| Configure LoRA weights with HTTP response support. | |
| std::string | lora_list_json () override |
| List available LoRA adapters. | |
| std::string | slot_json (const json &data) override |
| Manage slots with HTTP response support. | |
| void | start_server (const std::string &host="0.0.0.0", int port=-1, const std::string &API_key="") override |
| Start HTTP server (override - delegates to loaded library) | |
| void | stop_server () override |
| Stop HTTP server (override - delegates to loaded library) | |
| void | start () override |
| Start service (override - delegates to loaded library) | |
| bool | started () override |
| Check service status (override - delegates to loaded library) | |
| void | stop () override |
| Stop service (override - delegates to loaded library) | |
| void | join_service () override |
| Wait for service completion (override - delegates to loaded library) | |
| void | join_server () override |
| Wait for server completion (override - delegates to loaded library) | |
| void | set_SSL (const std::string &cert, const std::string &key) override |
| Set SSL configuration (override - delegates to loaded library) | |
| int | embedding_size () override |
| Get embedding size (override - delegates to loaded library) | |
| int | get_next_available_slot () override |
| Get available slot (override - delegates to loaded library) | |
| void | debug (int debug_level) override |
| Set debug level (override - delegates to loaded library) | |
| void | logging_callback (CharArrayFn callback) override |
| Set logging callback (override - delegates to loaded library) | |
| std::string | debug_implementation () override |
| Implementation debugging. | |
| LLMService () | |
| Default constructor. | |
| LLMService (const std::string &model_path, int num_slots=1, int num_threads=-1, int num_GPU_layers=0, bool flash_attention=false, int context_size=4096, int batch_size=2048, bool embedding_only=false, const std::vector< std::string > &lora_paths={}) | |
| Parameterized constructor. | |
| ~LLMService () | |
| Destructor. | |
| void | init (int argc, char **argv) |
| Initialize from argc/argv parameters. | |
| void | init (const std::string ¶ms_string) |
| Initialize from parameter string. | |
| void | init (const char *params_string) |
| Initialize from C-style parameter string. | |
| std::string | get_command () |
| Returns the construct command. | |
| std::string | encapsulate_route (const json &body, handler_t route_handler) |
| void | enable_reasoning (bool reasoning) override |
| Enable reasoning. | |
| std::string | tokenize_json (const json &data) override |
| Tokenize input (override) | |
| std::string | detokenize_json (const json &data) override |
| Convert tokens back to text. | |
| std::string | embeddings_json (const json &data) override |
| Generate embeddings with HTTP response support. | |
| std::string | apply_template_json (const json &data) override |
| Apply a chat template to message data. | |
| std::string | completion_json (const json &data, CharArrayFn callback=nullptr, bool callbackWithJSON=true) override |
| Generate completion (override) | |
| std::string | slot_json (const json &data) override |
| Manage slots with HTTP response support. | |
| std::string | lora_weight_json (const json &data) override |
| Configure LoRA weights with HTTP response support. | |
| std::string | lora_list_json () override |
| List available LoRA adapters. | |
| void | cancel (int id_slot) override |
| Cancel running request (override) | |
| void | start () override |
| Start the LLM service (override) | |
| bool | started () override |
| Check service status (override) | |
| void | stop () override |
| Stop the LLM service (override) | |
| void | start_server (const std::string &host="0.0.0.0", int port=-1, const std::string &API_key="") override |
| Start HTTP server (override) | |
| void | stop_server () override |
| Stop HTTP server (override) | |
| void | join_service () override |
| Wait for service thread completion (override) | |
| void | join_server () override |
| Wait for server thread completion (override) | |
| void | set_SSL (const std::string &SSL_cert, const std::string &SSL_key) override |
| Configure SSL certificates (override) | |
| int | embedding_size () override |
| Get embedding vector dimensions (override) | |
| int | get_next_available_slot () override |
| Get available processing slot (override) | |
| void | debug (int debug_level) override |
| Set debug level (override) | |
| void | logging_callback (CharArrayFn callback) override |
| Set logging callback (override) | |
| std::string | debug_implementation () override |
| Implementation debugging. | |
Public Member Functions inherited from LLMProvider | |
| virtual | ~LLMProvider () |
| Virtual destructor. | |
| virtual bool | lora_weight (const std::vector< LoraIdScale > &loras) |
| Configure LoRA weights. | |
| virtual std::vector< LoraIdScalePath > | lora_list () |
| List available LoRA adapters. | |
| virtual void | logging_stop () |
| Stop logging. | |
Public Member Functions inherited from LLMLocal | |
| virtual std::string | save_slot (int id_slot, const std::string &filepath) |
| Save slot state to file. | |
| virtual std::string | load_slot (int id_slot, const std::string &filepath) |
| Load slot state from file. | |
Public Member Functions inherited from LLM | |
| virtual | ~LLM ()=default |
| Virtual destructor. | |
| virtual std::vector< int > | tokenize (const std::string &query) |
| Tokenize text. | |
| virtual std::string | detokenize (const std::vector< int32_t > &tokens) |
| Convert tokens to text. | |
| virtual std::vector< float > | embeddings (const std::string &query) |
| Generate embeddings. | |
| virtual void | set_completion_params (json completion_params_) |
| Set completion parameters. | |
| virtual std::string | get_completion_params () |
| Get current completion parameters. | |
| virtual std::string | completion (const std::string &prompt, CharArrayFn callback=nullptr, int id_slot=-1, bool return_response_json=false) |
| Generate completion. | |
| virtual void | set_grammar (std::string grammar_) |
| Set grammar for constrained generation. | |
| virtual std::string | get_grammar () |
| Get current grammar specification. | |
| virtual std::string | apply_template (const json &messages) |
| Apply template to messages. | |
Static Public Member Functions | |
| static LLMService * | from_command (const std::string &command) |
| Create runtime from command line string. | |
| static LLMService * | from_command (int argc, char **argv) |
| Create runtime from argc/argv. | |
| static LLMService * | from_params (const json ¶ms_json) |
| Create LLMService from JSON parameters. | |
| static LLMService * | from_command (const std::string &command) |
| Create LLMService from command line string. | |
| static LLMService * | from_command (int argc, char **argv) |
| Create LLMService from argc/argv. | |
| static std::vector< char * > | jsonToArguments (const json ¶ms_json) |
| Convert JSON parameters to command line arguments. | |
Static Public Member Functions inherited from LLM | |
| static bool | has_gpu_layers (const std::string &command) |
| Check if command line arguments specify GPU layers. | |
| static std::string | LLM_args_to_command (const std::string &model_path, int num_slots=1, int num_threads=-1, int num_GPU_layers=0, bool flash_attention=false, int context_size=4096, int batch_size=2048, bool embedding_only=false, const std::vector< std::string > &lora_paths={}) |
| Convert LLM parameters to command line arguments. | |
Public Attributes | |
| LibHandle | handle = nullptr |
| Handle to loaded library. | |
| LLMProvider * | llm = nullptr |
| Pointer to loaded LLM provider instance. | |
Public Attributes inherited from LLM | |
| int32_t | n_keep = 0 |
| Number of tokens to keep from the beginning of the context. | |
| std::string | grammar = "" |
| Grammar specification in GBNF format or JSON schema. | |
| json | completion_params |
| JSON object containing completion parameters. | |
Protected Member Functions | |
| bool | create_LLM_library_backend (const std::string &command, const std::string &llm_lib_filename) |
| Load LLM library backend. | |
Protected Member Functions inherited from LLMProvider | |
| virtual bool | parse_lora_weight_json (const json &result) |
| Parse LoRA weight configuration result. | |
| virtual json | build_lora_weight_json (const std::vector< LoraIdScale > &loras) |
| Build JSON for LoRA weight configuration. | |
| virtual std::vector< LoraIdScalePath > | parse_lora_list_json (const json &result) |
| Parse LoRA list result. | |
| virtual json | build_lora_list_json (const std::vector< LoraIdScalePath > &loras) |
| Build JSON for LoRA list result. | |
Protected Member Functions inherited from LLMLocal | |
| virtual std::string | slot (int id_slot, const std::string &action, const std::string &filepath) |
| Perform slot operation. | |
| virtual json | build_slot_json (int id_slot, const std::string &action, const std::string &filepath) |
| Build JSON for slot operations. | |
| virtual std::string | parse_slot_json (const json &result) |
| Parse slot operation result. | |
Protected Member Functions inherited from LLM | |
| virtual json | build_apply_template_json (const json &messages) |
| Build JSON for template application. | |
| virtual std::string | parse_apply_template_json (const json &result) |
| Parse template application result. | |
| virtual json | build_tokenize_json (const std::string &query) |
| Build JSON for tokenization. | |
| virtual std::vector< int > | parse_tokenize_json (const json &result) |
| Parse tokenization result. | |
| virtual json | build_detokenize_json (const std::vector< int32_t > &tokens) |
| Build JSON for detokenization. | |
| virtual std::string | parse_detokenize_json (const json &result) |
| Parse detokenization result. | |
| virtual json | build_embeddings_json (const std::string &query) |
| Build JSON for embeddings generation. | |
| virtual std::vector< float > | parse_embeddings_json (const json &result) |
| Parse embeddings result. | |
| virtual json | build_completion_json (const std::string &prompt, int id_slot=-1) |
| Build JSON for completion generation. | |
| virtual std::string | parse_completion_json (const json &result) |
| Parse completion result. | |
Protected Attributes | |
| std::vector< std::string > | search_paths |
| Library search paths. | |
Protected Attributes inherited from LLMProvider | |
| bool | reasoning_enabled = false |
| Whether reasoning is enabled. | |
Runtime loader for LLM libraries. This class provides dynamic loading of LLM backend libraries, allowing for flexible deployment and architecture-specific optimizations.
Concrete implementation of LLMProvider with server capabilities. This class provides a full-featured LLM service with an HTTP server, parameter configuration, and backend integration with llama.cpp.
Definition at line 62 of file LLM_runtime.h.
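For orientation, here is a minimal life-cycle sketch (construct, start, generate via the inherited `completion`, stop) using only the signatures documented on this page; the model path is a placeholder.

```cpp
#include <LLM_runtime.h>
#include <iostream>

int main() {
    // Construct a service for a local model (placeholder path), offloading
    // 99 layers to the GPU and allowing two parallel slots.
    LLMService llm("model.gguf", /*num_slots=*/2, /*num_threads=*/-1,
                   /*num_GPU_layers=*/99);

    llm.start();                   // start the service
    if (!llm.started()) return 1;  // bail out if startup failed

    // Blocking completion via the inherited LLM interface.
    std::cout << llm.completion("What is the capital of France?") << std::endl;

    llm.stop();                    // shut the service down
    return 0;
}
```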
LLMService::LLMService()
Default constructor.
Creates an uninitialized runtime that must load a library before use
Definition at line 315 of file LLM_runtime.cpp.
LLMService::LLMService(const std::string &model_path, int num_slots = 1, int num_threads = -1, int num_GPU_layers = 0, bool flash_attention = false, int context_size = 4096, int batch_size = 2048, bool embedding_only = false, const std::vector<std::string> &lora_paths = {})
Parameterized constructor.
| model_path | Path to the model file |
| num_slots | Number of parallel slots |
| num_threads | Number of CPU threads (-1 for auto-detection) |
| num_GPU_layers | Number of layers to offload to GPU |
| flash_attention | Whether to enable flash attention optimization |
| context_size | Maximum context length in tokens |
| batch_size | Processing batch size |
| embedding_only | Whether to run in embedding-only mode |
| lora_paths | Vector of paths to LoRA adapter files |
Creates and initializes a runtime with the specified parameters
Definition at line 320 of file LLM_runtime.cpp.
LLMService::~LLMService()
Destructor.
Definition at line 339 of file LLM_runtime.cpp.
LLMService::LLMService()
Default constructor.
Creates an uninitialized LLMService that must be configured before use
LLMService::LLMService(const std::string &model_path, int num_slots = 1, int num_threads = -1, int num_GPU_layers = 0, bool flash_attention = false, int context_size = 4096, int batch_size = 2048, bool embedding_only = false, const std::vector<std::string> &lora_paths = {})
Parameterized constructor.
| model_path | Path to the model file |
| num_slots | Number of parallel processing sequences |
| num_threads | Number of CPU threads (-1 for auto-detection) |
| num_GPU_layers | Number of layers to offload to GPU |
| flash_attention | Whether to enable flash attention optimization |
| context_size | Maximum context length in tokens |
| batch_size | Processing batch size |
| embedding_only | Whether to run in embedding-only mode |
| lora_paths | Vector of paths to LoRA adapter files |
std::string LLMService::apply_template_json(const json &data)  [inline, override, virtual]
Apply a chat template to message data.
| data | JSON object containing messages to format |
Pure virtual method for applying chat templates to conversation data
Implements LLM.
Definition at line 137 of file LLM_runtime.h.
std::string LLMService::apply_template_json(const json &data)  [override, virtual]
Apply a chat template to message data.
| data | JSON object containing messages to format |
Pure virtual method for applying chat templates to conversation data
Implements LLM.
void LLMService::cancel(int id_slot)  [inline, override, virtual]
Cancel request (override - delegates to loaded library)
| id_slot | Slot ID of the request to cancel |
Implements LLMLocal.
Definition at line 141 of file LLM_runtime.h.
void LLMService::cancel(int id_slot)  [override, virtual]
Cancel running request (override)
| id_slot | Slot ID of the request to cancel |
Implements LLMLocal.
std::string LLMService::completion_json(const json &data, CharArrayFn callback = nullptr, bool callbackWithJSON = true)  [inline, override, virtual]
Generate completion (override - delegates to loaded library)
| data | JSON completion request |
| callback | Optional streaming callback |
| callbackWithJSON | Whether callback uses JSON |
Implements LLM.
Definition at line 131 of file LLM_runtime.h.
std::string LLMService::completion_json(const json &data, CharArrayFn callback = nullptr, bool callbackWithJSON = true)  [override, virtual]
Generate completion (override)
| data | JSON object with prompt and parameters |
| callback | Optional streaming callback function |
| callbackWithJSON | Whether callback receives JSON format |
Implements LLM.
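A hedged streaming sketch: `CharArrayFn` is assumed to be a plain C-string callback, and the request fields follow the llama.cpp server completion schema linked under `from_command`; both are assumptions, not documented guarantees.

```cpp
#include <LLM_runtime.h>
#include <iostream>

// Assumed CharArrayFn shape: a function receiving each streamed chunk
// as a C string (the exact typedef is not shown on this page).
static void on_chunk(const char *chunk) { std::cout << chunk << std::flush; }

int main() {
    LLMService llm("model.gguf");  // placeholder model path
    llm.start();

    // Hypothetical request body following the llama.cpp server schema.
    json request = {{"prompt", "Tell me a short story."}, {"n_predict", 128}};

    // callbackWithJSON=true would wrap each chunk in JSON; false is assumed
    // to deliver plain text chunks.
    std::string response = llm.completion_json(request, on_chunk, false);

    llm.stop();
    return 0;
}
```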
bool LLMService::create_LLM_library(const std::string &command)
Loads the LLM library dynamically according to the underlying architecture and creates an LLM based on the command.
| command | Command string containing model path and parameters |
Definition at line 296 of file LLM_runtime.cpp.
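A minimal sketch of the two-step pattern this enables: default-construct, then load a backend from a command string. The flags are illustrative llama.cpp server-style arguments (see the usage link under `from_command`).

```cpp
#include <LLM_runtime.h>

int main() {
    LLMService llm;  // default-constructed: no backend loaded yet

    // Illustrative llama.cpp server-style arguments; the loader picks the
    // library variant matching the host architecture.
    if (!llm.create_LLM_library("-m model.gguf -np 2 -ngl 99"))
        return 1;

    llm.start();
    // ... use the service ...
    llm.stop();
    return 0;
}
```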
bool LLMService::create_LLM_library_backend(const std::string &command, const std::string &llm_lib_filename)  [protected]
Load LLM library backend.
| command | Command string with parameters |
| llm_lib_filename | Specific library filename to load |
Internal method for loading specific library files
Definition at line 218 of file LLM_runtime.cpp.
void LLMService::debug(int debug_level)  [inline, override, virtual]
Set debug level (override - delegates to loaded library)
| debug_level | Debug verbosity level |
Implements LLMProvider.
Definition at line 202 of file LLM_runtime.h.
void LLMService::debug(int debug_level)  [override, virtual]
Set debug level (override)
| debug_level | Debug verbosity level |
Implements LLMProvider.
std::string LLMService::debug_implementation()  [inline, override, virtual]
Implementation debugging.
Implements LLMProvider.
Definition at line 208 of file LLM_runtime.h.
std::string LLMService::debug_implementation()  [inline, override, virtual]
Implementation debugging.
Implements LLMProvider.
Definition at line 201 of file LLM_service.h.
std::string LLMService::detokenize_json(const json &data)  [inline, override, virtual]
Convert tokens back to text.
| data | JSON object containing token IDs |
Pure virtual method for converting token sequences back to text
Implements LLM.
Definition at line 118 of file LLM_runtime.h.
std::string LLMService::detokenize_json(const json &data)  [override, virtual]
Convert tokens back to text.
| data | JSON object containing token IDs |
Pure virtual method for converting token sequences back to text
Implements LLM.
int LLMService::embedding_size()  [inline, override, virtual]
Get embedding size (override - delegates to loaded library)
Implements LLMProvider.
Definition at line 194 of file LLM_runtime.h.
int LLMService::embedding_size()  [override, virtual]
Get embedding vector dimensions (override)
Implements LLMProvider.
std::string LLMService::embeddings_json(const json &data)  [inline, override, virtual]
Generate embeddings with HTTP response support.
| data | JSON object containing embedding request |
Protected method used internally for server-based embedding generation
Implements LLM.
Definition at line 124 of file LLM_runtime.h.
std::string LLMService::embeddings_json(const json &data)  [override, virtual]
Generate embeddings with HTTP response support.
| data | JSON object containing embedding request |
Protected method used internally for server-based embedding generation
Implements LLM.
void LLMService::enable_reasoning(bool reasoning)  [override, virtual]
Enable reasoning.
| reasoning | Whether to enable reasoning |
Reimplemented from LLMProvider.
Definition at line 262 of file LLM_service.cpp.
std::string LLMService::encapsulate_route(const json &body, handler_t route_handler)
Definition at line 520 of file LLM_service.cpp.
LLMService * LLMService::from_command(const std::string &command)  [static]
Create runtime from command line string.
| command | Command line argument string |
Factory method for creating runtime instances from command arguments. See https://github.com/ggml-org/llama.cpp/tree/master/tools/server#usage for arguments.
Definition at line 327 of file LLM_runtime.cpp.
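Equivalent factory-style construction, sketched under the assumption that the caller owns the returned pointer (ownership is not stated on this page); the flags follow llama.cpp server usage and are illustrative only.

```cpp
#include <LLM_runtime.h>
#include <memory>

int main() {
    // Wrap the raw pointer for automatic cleanup (ownership assumed).
    std::unique_ptr<LLMService> llm(
        LLMService::from_command("-m model.gguf --ctx-size 8192 -ngl 99"));
    if (!llm) return 1;

    llm->start();
    // ...
    llm->stop();
    return 0;
}
```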
LLMService * LLMService::from_command(const std::string &command)  [static]
Create LLMService from command line string.
| command | Command line argument string |
Factory method for creating instances from command line arguments. See https://github.com/ggml-org/llama.cpp/tree/master/tools/server#usage for arguments.
LLMService * LLMService::from_command(int argc, char **argv)  [static]
Create runtime from argc/argv.
| argc | Argument count |
| argv | Argument vector |
Factory method for creating runtime instances from main() parameters
Definition at line 334 of file LLM_runtime.cpp.
LLMService * LLMService::from_command(int argc, char **argv)  [static]
Create LLMService from argc/argv.
| argc | Argument count |
| argv | Argument vector |
Factory method for creating instances from standard main() parameters
LLMService * LLMService::from_params(const json &params_json)  [static]
Create LLMService from JSON parameters.
| params_json | JSON object containing initialization parameters |
Factory method for creating instances from structured parameter data. See https://github.com/ggml-org/llama.cpp/tree/master/tools/server#usage for arguments.
Definition at line 22 of file LLM_service.cpp.
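A sketch with hypothetical parameter keys: since `jsonToArguments` converts the JSON into argv-style llama.cpp server arguments, the keys below mirror those argument names, but the exact schema accepted by `from_params` is not documented on this page.

```cpp
#include <LLM_runtime.h>
#include <memory>

int main() {
    // Hypothetical keys mirroring llama.cpp server argument names.
    json params = {
        {"model", "model.gguf"},
        {"ctx-size", 4096},
        {"n-gpu-layers", 99}
    };

    std::unique_ptr<LLMService> llm(LLMService::from_params(params));
    if (!llm) return 1;

    llm->start();
    // ...
    llm->stop();
    return 0;
}
```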
std::string LLMService::get_command()  [inline]
Returns the construct command.
Definition at line 97 of file LLM_service.h.
int LLMService::get_next_available_slot()  [inline, override, virtual]
Get available slot (override - delegates to loaded library)
Implements LLMLocal.
Definition at line 198 of file LLM_runtime.h.
int LLMService::get_next_available_slot()  [override, virtual]
Get available processing slot (override)
Implements LLMLocal.
void LLMService::init(const char *params_string)
Initialize from C-style parameter string.
| params_string | C-style string containing parameters |
C-compatible version of string parameter initialization
Definition at line 175 of file LLM_service.cpp.
void LLMService::init(const std::string &params_string)
Initialize from parameter string.
| params_string | String containing space-separated parameters |
Initialize the service by parsing a parameter string
Definition at line 160 of file LLM_service.cpp.
void LLMService::init(int argc, char **argv)
Initialize from argc/argv parameters.
| argc | Argument count |
| argv | Argument vector |
Initialize the service with command line style parameters
Definition at line 180 of file LLM_service.cpp.
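A sketch of forwarding a program's own command line straight into a default-constructed service, e.g. `./my_app -m model.gguf -ngl 99` (flags illustrative):

```cpp
#include <LLM_runtime.h>

int main(int argc, char **argv) {
    LLMService llm;          // default-constructed, not yet configured
    llm.init(argc, argv);    // parse command-line-style parameters
    llm.start();
    llm.join_service();      // block until the service finishes
    return 0;
}
```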
void LLMService::join_server()  [inline, override, virtual]
Wait for server completion (override - delegates to loaded library)
Implements LLMProvider.
Definition at line 185 of file LLM_runtime.h.
void LLMService::join_server()  [override, virtual]
Wait for server thread completion (override)
Implements LLMProvider.
void LLMService::join_service()  [inline, override, virtual]
Wait for service completion (override - delegates to loaded library)
Implements LLMProvider.
Definition at line 182 of file LLM_runtime.h.
void LLMService::join_service()  [override, virtual]
Wait for service thread completion (override)
Implements LLMProvider.
std::vector<char *> LLMService::jsonToArguments(const json &params_json)  [static]
Convert JSON parameters to command line arguments.
| params_json | JSON object with parameters |
Utility function for converting structured parameters to argv format
Definition at line 58 of file LLM_service.cpp.
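A small sketch of the conversion; the keys are hypothetical (see the note under `from_params`), and the trailing null pointer conventional for argv vectors is an assumption.

```cpp
#include <LLM_runtime.h>
#include <iostream>

int main() {
    json params = {{"model", "model.gguf"}, {"ctx-size", 4096}};  // hypothetical keys

    std::vector<char *> args = LLMService::jsonToArguments(params);
    for (char *arg : args)
        if (arg) std::cout << arg << ' ';  // skip a possible trailing nullptr
    std::cout << '\n';
    return 0;
}
```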
void LLMService::logging_callback(CharArrayFn callback)  [inline, override, virtual]
Set logging callback (override - delegates to loaded library)
| callback | Function to receive log messages |
Implements LLMProvider.
Definition at line 206 of file LLM_runtime.h.
void LLMService::logging_callback(CharArrayFn callback)  [override, virtual]
Set logging callback (override)
| callback | Function to receive log messages |
Implements LLMProvider.
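A sketch routing library logs to stderr, assuming `CharArrayFn` is a plain C-string callback (the typedef is not shown on this page):

```cpp
#include <LLM_runtime.h>
#include <cstdio>

// Assumed CharArrayFn shape: a free function taking a C string.
static void log_line(const char *msg) { std::fprintf(stderr, "[llm] %s\n", msg); }

int main() {
    LLMService llm("model.gguf");   // placeholder path
    llm.debug(1);                   // verbosity level is illustrative
    llm.logging_callback(log_line); // route log messages to our handler
    llm.start();
    // ...
    llm.stop();
    return 0;
}
```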
std::string LLMService::lora_list_json()  [inline, override, virtual]
List available LoRA adapters.
Implements LLMProvider.
Definition at line 151 of file LLM_runtime.h.
std::string LLMService::lora_list_json()  [override, virtual]
List available LoRA adapters.
Implements LLMProvider.
std::string LLMService::lora_weight_json(const json &data)  [inline, override, virtual]
Configure LoRA weights with HTTP response support.
| data | JSON object with LoRA configuration |
Protected method used internally for server-based LoRA configuration
Implements LLMProvider.
Definition at line 147 of file LLM_runtime.h.
std::string LLMService::lora_weight_json(const json &data)  [override, virtual]
Configure LoRA weights with HTTP response support.
| data | JSON object with LoRA configuration |
Protected method used internally for server-based LoRA configuration
Implements LLMProvider.
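A hedged sketch: the request shape below, an array of {id, scale} objects, is an assumption based on the `LoraIdScale` type name and the llama.cpp server LoRA endpoint, not a documented schema; adapter paths are placeholders.

```cpp
#include <LLM_runtime.h>
#include <iostream>

int main() {
    // Load the model with two placeholder LoRA adapters.
    LLMService llm("model.gguf", 1, -1, 0, false, 4096, 2048, false,
                   {"adapter_a.gguf", "adapter_b.gguf"});
    llm.start();

    // Assumed request shape: [{"id": ..., "scale": ...}, ...].
    json weights = json::array({{{"id", 0}, {"scale", 0.75}},
                                {{"id", 1}, {"scale", 0.25}}});
    std::cout << llm.lora_weight_json(weights) << std::endl;
    std::cout << llm.lora_list_json() << std::endl;  // inspect loaded adapters

    llm.stop();
    return 0;
}
```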
void LLMService::set_SSL(const std::string &cert, const std::string &key)  [inline, override, virtual]
Set SSL configuration (override - delegates to loaded library)
| cert | SSL certificate path |
| key | SSL private key path |
Implements LLMProvider.
Definition at line 190 of file LLM_runtime.h.
void LLMService::set_SSL(const std::string &SSL_cert, const std::string &SSL_key)  [override, virtual]
Configure SSL certificates (override)
| SSL_cert | Path to SSL certificate file |
| SSL_key | Path to SSL private key file |
Implements LLMProvider.
std::string LLMService::slot_json(const json &data)  [inline, override, virtual]
Manage slots with HTTP response support.
| data | JSON object with slot operation |
Protected method used internally for server-based slot management
Implements LLMLocal.
Definition at line 157 of file LLM_runtime.h.
std::string LLMService::slot_json(const json &data)  [override, virtual]
Manage slots with HTTP response support.
| data | JSON object with slot operation |
Protected method used internally for server-based slot management
Implements LLMLocal.
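Rather than guessing the JSON schema for `slot_json`, this sketch uses the inherited `save_slot`/`load_slot` convenience wrappers (documented under LLMLocal); the file path is a placeholder.

```cpp
#include <LLM_runtime.h>

int main() {
    LLMService llm("model.gguf", /*num_slots=*/2);  // placeholder path
    llm.start();

    int slot = llm.get_next_available_slot();
    llm.completion("Remember: the password is swordfish.", nullptr, slot);

    llm.save_slot(slot, "slot_state.bin");  // persist the slot's state
    llm.load_slot(slot, "slot_state.bin");  // ...and restore it later

    llm.stop();
    return 0;
}
```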
void LLMService::start()  [inline, override, virtual]
Start service (override - delegates to loaded library)
Implements LLMProvider.
Definition at line 169 of file LLM_runtime.h.
void LLMService::start()  [override, virtual]
Start the LLM service (override)
Implements LLMProvider.
void LLMService::start_server(const std::string &host = "0.0.0.0", int port = -1, const std::string &API_key = "")  [inline, override, virtual]
Start HTTP server (override - delegates to loaded library)
| host | Host address (default: "0.0.0.0") |
| port | Port number (0 for auto) |
| API_key | Optional API key |
Implements LLMProvider.
Definition at line 163 of file LLM_runtime.h.
void LLMService::start_server(const std::string &host = "0.0.0.0", int port = -1, const std::string &API_key = "")  [override, virtual]
Start HTTP server (override)
| host | Host address to bind (default: "0.0.0.0") |
| port | Port number (0 for auto-selection) |
| API_key | Optional API key for authentication |
Implements LLMProvider.
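A sketch of exposing the service over HTTP with optional TLS and an API key; the port, key, and certificate paths are placeholders.

```cpp
#include <LLM_runtime.h>

int main() {
    LLMService llm("model.gguf");   // placeholder path
    llm.start();

    llm.set_SSL("server.crt", "server.key");  // optional TLS (placeholder paths)
    llm.start_server("0.0.0.0", 8080, "my-secret-key");

    llm.join_server();   // block until the server shuts down
    llm.stop();
    return 0;
}
```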
bool LLMService::started()  [inline, override, virtual]
Check service status (override - delegates to loaded library)
Implements LLMProvider.
Definition at line 173 of file LLM_runtime.h.
bool LLMService::started()  [override, virtual]
Check service status (override)
Implements LLMProvider.
void LLMService::stop()  [inline, override, virtual]
Stop service (override - delegates to loaded library)
Implements LLMProvider.
Definition at line 176 of file LLM_runtime.h.
void LLMService::stop()  [override, virtual]
Stop the LLM service (override)
Implements LLMProvider.
void LLMService::stop_server()  [inline, override, virtual]
Stop HTTP server (override - delegates to loaded library)
Implements LLMProvider.
Definition at line 166 of file LLM_runtime.h.
void LLMService::stop_server()  [override, virtual]
Stop HTTP server (override)
Implements LLMProvider.
std::string LLMService::tokenize_json(const json &data)  [inline, override, virtual]
Tokenize input (override)
| data | JSON object containing text to tokenize |
Implements LLM.
Definition at line 112 of file LLM_runtime.h.
std::string LLMService::tokenize_json(const json &data)  [override, virtual]
Tokenize input (override)
| data | JSON object containing text to tokenize |
Implements LLM.
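A round-trip sketch via the inherited convenience methods `tokenize`/`detokenize`, which avoids hand-building the JSON bodies; note the int to int32_t copy bridging the two documented signatures.

```cpp
#include <LLM_runtime.h>
#include <iostream>
#include <vector>

int main() {
    LLMService llm("model.gguf");   // placeholder path
    llm.start();

    std::vector<int> tokens = llm.tokenize("Hello, world!");
    std::cout << tokens.size() << " tokens" << std::endl;

    // detokenize takes int32_t while tokenize returns int: copy to bridge.
    std::vector<int32_t> ids(tokens.begin(), tokens.end());
    std::cout << llm.detokenize(ids) << std::endl;

    llm.stop();
    return 0;
}
```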
LibHandle LLMService::handle = nullptr
Handle to loaded library.
Definition at line 99 of file LLM_runtime.h.
LLMProvider * LLMService::llm = nullptr
Pointer to loaded LLM provider instance.
Definition at line 100 of file LLM_runtime.h.
std::vector<std::string> LLMService::search_paths  [protected]
Library search paths.
Definition at line 219 of file LLM_runtime.h.