|
| virtual int | get_next_available_slot ()=0 |
| | Get an available processing slot.
|
| |
| virtual std::string | save_slot (int id_slot, const std::string &filepath) |
| | Save slot state to file.
|
| |
| virtual std::string | load_slot (int id_slot, const std::string &filepath) |
| | Load slot state from file.
|
| |
| virtual void | cancel (int id_slot)=0 |
| | Cancel request.
|
| |
| virtual std::string | slot_json (const json &data)=0 |
| | Manage slots with HTTP response support.
|
| |
|
| virtual | ~LLM ()=default |
| | Virtual destructor.
|
| |
| virtual std::vector< int > | tokenize (const std::string &query) |
| | Tokenize text.
|
| |
| virtual std::string | tokenize_json (const json &data)=0 |
| | Tokenize input (override).
|
| |
| virtual std::string | detokenize (const std::vector< int32_t > &tokens) |
| | Convert tokens to text.
|
| |
| virtual std::string | detokenize_json (const json &data)=0 |
| | Convert tokens back to text.
|
| |
| virtual std::vector< float > | embeddings (const std::string &query) |
| | Generate embeddings.
|
| |
| virtual std::string | embeddings_json (const json &data)=0 |
| | Generate embeddings with HTTP response support.
|
| |
| virtual void | set_completion_params (json completion_params_) |
| | Set completion parameters.
|
| |
| virtual std::string | get_completion_params () |
| | Get current completion parameters.
|
| |
| virtual std::string | completion (const std::string &prompt, CharArrayFn callback=nullptr, int id_slot=-1, bool return_response_json=false) |
| | Generate completion.
|
| |
| virtual std::string | completion_json (const json &data, CharArrayFn callback, bool callbackWithJSON)=0 |
| | Generate text completion.
|
| |
| virtual void | set_grammar (std::string grammar_) |
| | Set grammar for constrained generation.
|
| |
| virtual std::string | get_grammar () |
| | Get current grammar specification.
|
| |
| virtual std::string | apply_template (const json &messages) |
| | Apply template to messages.
|
| |
| virtual std::string | apply_template_json (const json &data)=0 |
| | Apply a chat template to message data.
|
| |
|
| virtual std::string | slot (int id_slot, const std::string &action, const std::string &filepath) |
| | Perform slot operation.
|
| |
| virtual json | build_slot_json (int id_slot, const std::string &action, const std::string &filepath) |
| | Build JSON for slot operations.
|
| |
| virtual std::string | parse_slot_json (const json &result) |
| | Parse slot operation result.
|
| |
| virtual json | build_apply_template_json (const json &messages) |
| | Build JSON for template application.
|
| |
| virtual std::string | parse_apply_template_json (const json &result) |
| | Parse template application result.
|
| |
| virtual json | build_tokenize_json (const std::string &query) |
| | Build JSON for tokenization.
|
| |
| virtual std::vector< int > | parse_tokenize_json (const json &result) |
| | Parse tokenization result.
|
| |
| virtual json | build_detokenize_json (const std::vector< int32_t > &tokens) |
| | Build JSON for detokenization.
|
| |
| virtual std::string | parse_detokenize_json (const json &result) |
| | Parse detokenization result.
|
| |
| virtual json | build_embeddings_json (const std::string &query) |
| | Build JSON for embeddings generation.
|
| |
| virtual std::vector< float > | parse_embeddings_json (const json &result) |
| | Parse embeddings result.
|
| |
| virtual json | build_completion_json (const std::string &prompt, int id_slot=-1) |
| | Build JSON for completion generation.
|
| |
| virtual std::string | parse_completion_json (const json &result) |
| | Parse completion result.
|
| |
|
| static bool | has_gpu_layers (const std::string &command) |
| | Check if command line arguments specify GPU layers.
|
| |
| static std::string | LLM_args_to_command (const std::string &model_path, int num_slots=1, int num_threads=-1, int num_GPU_layers=0, bool flash_attention=false, int context_size=4096, int batch_size=2048, bool embedding_only=false, const std::vector< std::string > &lora_paths={}) |
| | Convert LLM parameters to command line arguments.
|
| |
| int32_t | n_keep = 0 |
| | Number of tokens to keep from the beginning of the context.
|
| |
| std::string | grammar = "" |
| | Grammar specification in GBNF format or JSON schema.
|
| |
| json | completion_params |
| | JSON object containing completion parameters.
|
| |
Abstract class for local LLM operations with slot management.
Extends the base LLM class with local-specific functionality, including slot management for concurrent requests and state persistence.
Definition at line 221 of file LLM.h.