LlamaLib  v2.0.2
Cross-platform library for local LLMs
LLM.h
1
6
7#pragma once
8
9#include "defs.h"
10#include "error_handling.h"
11#include <sstream>
12#if defined(__APPLE__)
13#include <TargetConditionals.h>
14#endif
15
16
19struct LoraIdScale
20{
21 int id;
22 float scale;
23
27 bool operator==(const LoraIdScale &other) const
28 {
29 return id == other.id && scale == other.scale;
30 }
31};
32
35struct LoraIdScalePath
36{
37 int id;
38 float scale;
39 std::string path;
40
44 bool operator==(const LoraIdScalePath &other) const
45 {
46 return id == other.id && scale == other.scale && path == other.path;
47 }
48};
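Both structures are plain aggregates, so they can be brace-initialised directly. A minimal sketch, with purely illustrative ids and scales, of building the list that LLMProvider::lora_weight (declared further below) expects:

#include "LLM.h"
#include <vector>

// Illustrative only: real adapter ids come from LLMProvider::lora_list().
std::vector<LoraIdScale> make_example_lora_weights()
{
    std::vector<LoraIdScale> loras;
    loras.push_back({0, 1.0f}); // adapter 0 at full strength
    loras.push_back({1, 0.5f}); // adapter 1 at half strength
    return loras;
}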
49
54
59class UNDREAMAI_API LLM
60{
61public:
62 int32_t n_keep = 0;
63 std::string grammar = "";
64 json completion_params;
65
67 virtual ~LLM() = default;
68
72 virtual std::vector<int> tokenize(const std::string &query);
73
77 virtual std::string tokenize_json(const json &data) = 0;
78
82 virtual std::string detokenize(const std::vector<int32_t> &tokens);
83
88 virtual std::string detokenize_json(const json &data) = 0;
89
93 virtual std::vector<float> embeddings(const std::string &query);
94
99 virtual std::string embeddings_json(const json &data) = 0;
100
104 // See https://github.com/ggml-org/llama.cpp/tree/master/tools/server#post-completion-given-a-prompt-it-returns-the-predicted-completion for the different parameters
105 virtual void set_completion_params(json completion_params_) { completion_params = completion_params_; }
106
109 virtual std::string get_completion_params() { return completion_params; }
110
117 virtual std::string completion(const std::string &prompt, CharArrayFn callback = nullptr, int id_slot = -1, bool return_response_json = false);
118
125 virtual std::string completion_json(const json &data, CharArrayFn callback, bool callbackWithJSON) = 0;
126
130 virtual void set_grammar(std::string grammar_) { grammar = grammar_; }
131
134 virtual std::string get_grammar() { return grammar; }
135
139 virtual std::string apply_template(const json &messages);
140
145 virtual std::string apply_template_json(const json &data) = 0;
146
150 static bool has_gpu_layers(const std::string &command);
151
163 static std::string LLM_args_to_command(const std::string &model_path, int num_slots = 1, int num_threads = -1, int num_GPU_layers = 0, bool flash_attention = false, int context_size = 4096, int batch_size = 2048, bool embedding_only = false, const std::vector<std::string> &lora_paths = {});
164
165protected:
169 virtual json build_apply_template_json(const json &messages);
170
174 virtual std::string parse_apply_template_json(const json &result);
175
179 virtual json build_tokenize_json(const std::string &query);
180
184 virtual std::vector<int> parse_tokenize_json(const json &result);
185
189 virtual json build_detokenize_json(const std::vector<int32_t> &tokens);
190
194 virtual std::string parse_detokenize_json(const json &result);
195
199 virtual json build_embeddings_json(const std::string &query);
200
204 virtual std::vector<float> parse_embeddings_json(const json &result);
205
210 virtual json build_completion_json(const std::string &prompt, int id_slot = -1);
211
215 virtual std::string parse_completion_json(const json &result);
216};
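A minimal usage sketch of the interface above, assuming a concrete LLM instance is obtained elsewhere in the library and that the json alias is the nlohmann-style type used throughout this header; the helper name is hypothetical:

#include "LLM.h"
#include <iostream>
#include <string>
#include <vector>

// Hypothetical helper exercising the tokenize/detokenize/completion round trip.
void run_basic_llm_calls(LLM *llm)
{
    std::vector<int> tokens = llm->tokenize("Hello world");
    std::string text = llm->detokenize(std::vector<int32_t>(tokens.begin(), tokens.end()));

    llm->set_grammar("");                             // no grammar constraint
    llm->set_completion_params({{"n_predict", 64}});  // llama.cpp-server style parameter

    // Blocking completion on an auto-selected slot, plain-text response.
    std::string reply = llm->completion("Write a haiku about llamas.");
    std::cout << text << "\n" << reply << std::endl;
}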
217
221class UNDREAMAI_API LLMLocal : public LLM
222{
223public:
226 virtual int get_next_available_slot() = 0;
227
232 virtual std::string save_slot(int id_slot, const std::string &filepath) { return slot(id_slot, "save", filepath); }
233
238 virtual std::string load_slot(int id_slot, const std::string &filepath) { return slot(id_slot, "restore", filepath); }
239
242 virtual void cancel(int id_slot) = 0;
243
248 virtual std::string slot_json(const json &data) = 0;
249
250protected:
256 virtual std::string slot(int id_slot, const std::string &action, const std::string &filepath);
257
263 virtual json build_slot_json(int id_slot, const std::string &action, const std::string &filepath);
264
268 virtual std::string parse_slot_json(const json &result);
269};
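A sketch of the slot-management calls, assuming a concrete LLMLocal implementation is available; the helper name and checkpoint file are hypothetical:

#include "LLM.h"
#include <string>

// Hypothetical helper: save a slot's state to disk and restore it later.
void checkpoint_slot(LLMLocal *llm, const std::string &cache_file)
{
    int slot = llm->get_next_available_slot();
    llm->completion("Summarise this document ...", nullptr, slot);
    llm->save_slot(slot, cache_file);   // persist the slot state

    // Later: continue from the saved state instead of reprocessing the prompt.
    llm->load_slot(slot, cache_file);
    llm->cancel(slot);                  // abort any in-flight request on this slot
}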
270
274class UNDREAMAI_API LLMProvider : public LLMLocal
275{
276public:
278 virtual ~LLMProvider();
279
283 virtual bool lora_weight(const std::vector<LoraIdScale> &loras);
284
289 virtual std::string lora_weight_json(const json &data) = 0;
290
293 virtual std::vector<LoraIdScalePath> lora_list();
294
297 virtual std::string lora_list_json() = 0;
298
301 virtual void enable_reasoning(bool reasoning) { reasoning_enabled = reasoning; }
302
305 virtual void debug(int debug_level) = 0;
306
309 virtual void logging_callback(CharArrayFn callback) = 0;
310
312 virtual void logging_stop();
313
315 virtual void start() = 0;
316
319 virtual bool started() = 0;
320
322 virtual void stop() = 0;
323
328 virtual void start_server(const std::string &host = "0.0.0.0", int port = -1, const std::string &API_key = "") = 0;
329
331 virtual void stop_server() = 0;
332
334 virtual void join_service() = 0;
335
337 virtual void join_server() = 0;
338
342 virtual void set_SSL(const std::string &SSL_cert, const std::string &SSL_key) = 0;
343
346 virtual int embedding_size() = 0;
347
350 virtual std::string debug_implementation() = 0;
351
352protected:
353 bool reasoning_enabled = false;
354
358 virtual bool parse_lora_weight_json(const json &result);
359
363 virtual json build_lora_weight_json(const std::vector<LoraIdScale> &loras);
364
368 virtual std::vector<LoraIdScalePath> parse_lora_list_json(const json &result);
369
373 virtual json build_lora_list_json(const std::vector<LoraIdScalePath> &loras);
374};
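A lifecycle sketch for a provider created elsewhere in the library; the host, port, and LoRA scale below are illustrative, not defaults:

#include "LLM.h"
#include <string>
#include <vector>

// Hypothetical helper: start the service, optionally expose it over HTTP, adjust LoRAs, shut down.
void run_provider(LLMProvider *provider)
{
    provider->debug(1);
    provider->enable_reasoning(true);

    provider->start();
    if (!provider->started())
        return;

    provider->start_server("127.0.0.1", 8080);

    // Re-weight the first available LoRA adapter, if any.
    std::vector<LoraIdScalePath> available = provider->lora_list();
    if (!available.empty())
        provider->lora_weight({{available[0].id, 0.8f}});

    provider->stop_server();
    provider->stop();
}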
375
379class UNDREAMAI_API LLMProviderRegistry
380{
381public:
382 static bool initialised;
383
387 static void inject_registry(LLMProviderRegistry *instance)
388 {
389 custom_instance_ = instance;
390 initialised = true;
391 }
392
395 static LLMProviderRegistry &instance()
396 {
397 if (custom_instance_)
398 return *custom_instance_;
399
400 static LLMProviderRegistry registry;
401 initialised = true;
402 return registry;
403 }
404
408 void register_instance(LLMProvider *service)
409 {
410 std::lock_guard<std::mutex> lock(mutex_);
411 instances_.push_back(service);
412 }
413
417 void unregister_instance(LLMProvider *service)
418 {
419 std::lock_guard<std::mutex> lock(mutex_);
420 instances_.erase(std::remove(instances_.begin(), instances_.end(), service), instances_.end());
421 }
422
426 std::vector<LLMProvider *> get_instances()
427 {
428 std::lock_guard<std::mutex> lock(mutex_);
429 return instances_;
430 }
431
434 void set_debug_level(int level)
435 {
436 debug_level_ = level;
437 }
438
441 const int get_debug_level()
442 {
443 return debug_level_;
444 }
445
448 void set_log_callback(CharArrayFn callback)
449 {
450 log_callback_ = callback;
451 }
452
455 const CharArrayFn get_log_callback()
456 {
457 return log_callback_;
458 }
459
460private:
461 static LLMProviderRegistry *custom_instance_;
462
463 std::mutex mutex_;
464 std::vector<LLMProvider *> instances_;
465 int debug_level_ = 0;
466 CharArrayFn log_callback_ = nullptr;
467
469 LLMProviderRegistry() = default;
471 ~LLMProviderRegistry() = default;
475 LLMProviderRegistry &operator=(const LLMProviderRegistry &) = delete;
476};
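A sketch of how the registry's global settings and instance list might be used; CharArrayFn comes from defs.h and the helper name is hypothetical:

#include "LLM.h"

// Hypothetical helper: push a log callback and debug level to the registry and to registered providers.
void configure_global_logging(CharArrayFn callback)
{
    LLMProviderRegistry &registry = LLMProviderRegistry::instance();
    registry.set_debug_level(2);
    registry.set_log_callback(callback);

    // Forward the callback to any providers currently registered.
    for (LLMProvider *provider : registry.get_instances())
        provider->logging_callback(callback);
}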
477
480
481extern "C"
482{
486 UNDREAMAI_API bool Has_GPU_Layers(const char *command);
487
490 UNDREAMAI_API void LLM_Debug(int debug_level);
491
494 UNDREAMAI_API void LLM_Logging_Callback(CharArrayFn callback);
495
497 UNDREAMAI_API void LLM_Logging_Stop();
498
499#ifdef _DEBUG
502 UNDREAMAI_API const bool IsDebuggerAttached(void);
503#endif
504
508 UNDREAMAI_API void LLM_Set_Completion_Parameters(LLM *llm, const char *params_json = "{}");
509
513 UNDREAMAI_API const char *LLM_Get_Completion_Parameters(LLM *llm);
514
518 UNDREAMAI_API void LLM_Set_Grammar(LLM *llm, const char *grammar = "");
519
523 UNDREAMAI_API const char *LLM_Get_Grammar(LLM *llm);
524
529 UNDREAMAI_API const char *LLM_Apply_Template(LLM *llm, const char *messages_as_json);
530
535 UNDREAMAI_API const char *LLM_Tokenize(LLM *llm, const char *query);
536
541 UNDREAMAI_API const char *LLM_Detokenize(LLM *llm, const char *tokens_as_json);
542
547 UNDREAMAI_API const char *LLM_Embeddings(LLM *llm, const char *query);
548
556 UNDREAMAI_API const char *LLM_Completion(LLM *llm, const char *prompt, CharArrayFn callback = nullptr, int id_slot = -1, bool return_response_json = false);
557
563 UNDREAMAI_API const char *LLM_Save_Slot(LLMLocal *llm, int id_slot, const char *filepath);
564
570 UNDREAMAI_API const char *LLM_Load_Slot(LLMLocal *llm, int id_slot, const char *filepath);
571
575 UNDREAMAI_API void LLM_Cancel(LLMLocal *llm, int id_slot);
576
581 UNDREAMAI_API bool LLM_Lora_Weight(LLMProvider *llm, const char *loras_as_json);
582
586 UNDREAMAI_API void LLM_Enable_Reasoning(LLMProvider *llm, bool enable_reasoning);
587
591 UNDREAMAI_API const char *LLM_Lora_List(LLMProvider *llm);
592
595 UNDREAMAI_API void LLM_Delete(LLMProvider *llm);
596
599 UNDREAMAI_API void LLM_Start(LLMProvider *llm);
600
604 UNDREAMAI_API const bool LLM_Started(LLMProvider *llm);
605
608 UNDREAMAI_API void LLM_Stop(LLMProvider *llm);
609
615 UNDREAMAI_API void LLM_Start_Server(LLMProvider *llm, const char *host = "0.0.0.0", int port = -1, const char *API_key = "");
616
619 UNDREAMAI_API void LLM_Stop_Server(LLMProvider *llm);
620
623 UNDREAMAI_API void LLM_Join_Service(LLMProvider *llm);
624
627 UNDREAMAI_API void LLM_Join_Server(LLMProvider *llm);
628
633 UNDREAMAI_API void LLM_Set_SSL(LLMProvider *llm, const char *SSL_cert, const char *SSL_key);
634
637 UNDREAMAI_API const int LLM_Status_Code();
638
641 UNDREAMAI_API const char *LLM_Status_Message();
642
646 UNDREAMAI_API const int LLM_Embedding_Size(LLMProvider *llm);
647}
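A usage sketch of the flat C-style API above, written from C++ so the default arguments apply; the provider pointer is assumed to come from a factory declared elsewhere in the library:

#include "LLM.h"
#include <cstdio>

// Hypothetical caller of the exported C API.
void c_api_sketch(LLMProvider *provider)
{
    LLM_Debug(1);
    LLM_Start(provider);
    if (!LLM_Started(provider))
        return;

    const char *tokens = LLM_Tokenize(provider, "hello");
    const char *reply = LLM_Completion(provider, "Tell me a joke.");
    std::printf("status %d: %s\n", LLM_Status_Code(), LLM_Status_Message());
    std::printf("%s\n%s\n", tokens, reply);

    LLM_Stop(provider);
    LLM_Delete(provider);
}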
648
649
void ensure_error_handlers_initialized()
Ensures error handlers are properly initialized.
Definition LLM.cpp:25
Abstract class for local LLM operations with slot management.
Definition LLM.h:222
virtual std::string slot_json(const json &data)=0
Manage slots with HTTP response support.
virtual std::string load_slot(int id_slot, const std::string &filepath)
Load slot state from file.
Definition LLM.h:238
virtual int get_next_available_slot()=0
Get an available processing slot.
virtual std::string save_slot(int id_slot, const std::string &filepath)
Save slot state to file.
Definition LLM.h:232
virtual void cancel(int id_slot)=0
Cancel request.
Registry for managing LLM provider instances.
Definition LLM.h:380
void unregister_instance(LLMProvider *service)
Unregister an LLM provider instance.
Definition LLM.h:417
std::vector< LLMProvider * > get_instances()
Get all registered provider instances.
Definition LLM.h:426
void set_debug_level(int level)
Set global debug level.
Definition LLM.h:434
const int get_debug_level()
Get current debug level.
Definition LLM.h:441
static bool initialised
Whether the registry has been initialized.
Definition LLM.h:382
void set_log_callback(CharArrayFn callback)
Set global log callback.
Definition LLM.h:448
const CharArrayFn get_log_callback()
Get current log callback.
Definition LLM.h:455
void register_instance(LLMProvider *service)
Register an LLM provider instance.
Definition LLM.h:408
static LLMProviderRegistry & instance()
Get the singleton registry instance.
Definition LLM.h:395
static void inject_registry(LLMProviderRegistry *instance)
Inject a custom registry instance.
Definition LLM.h:387
Abstract class for LLM service providers.
Definition LLM.h:275
virtual void logging_callback(CharArrayFn callback)=0
Set logging callback function.
virtual bool started()=0
Check if service is started.
virtual void start_server(const std::string &host="0.0.0.0", int port=-1, const std::string &API_key="")=0
Start HTTP server.
virtual std::string debug_implementation()=0
Implementation debugging.
virtual void join_service()=0
Wait for service thread to complete.
virtual void stop_server()=0
Stop HTTP server.
virtual void debug(int debug_level)=0
Set debug level.
virtual void join_server()=0
Wait for server thread to complete.
virtual void enable_reasoning(bool reasoning)
Enable reasoning.
Definition LLM.h:301
virtual void stop()=0
Stop the LLM service.
virtual void set_SSL(const std::string &SSL_cert, const std::string &SSL_key)=0
Configure SSL certificates.
virtual std::string lora_list_json()=0
List available LoRA adapters.
virtual std::string lora_weight_json(const json &data)=0
Configure LoRA weights with HTTP response support.
virtual void start()=0
Start the LLM service.
virtual int embedding_size()=0
Get embedding vector size.
Abstract base class for Large Language Model operations.
Definition LLM.h:60
virtual std::string embeddings_json(const json &data)=0
Generate embeddings with HTTP response support.
virtual std::string get_completion_params()
Get current completion parameters.
Definition LLM.h:109
virtual std::string apply_template_json(const json &data)=0
Apply a chat template to message data.
virtual std::string tokenize_json(const json &data)=0
Tokenize input (override)
virtual void set_completion_params(json completion_params_)
Set completion parameters.
Definition LLM.h:105
json completion_params
JSON object containing completion parameters.
Definition LLM.h:64
virtual void set_grammar(std::string grammar_)
Set grammar for constrained generation.
Definition LLM.h:130
virtual ~LLM()=default
Virtual destructor.
virtual std::string completion_json(const json &data, CharArrayFn callback, bool callbackWithJSON)=0
Generate text completion.
virtual std::string detokenize_json(const json &data)=0
Convert tokens back to text.
virtual std::string get_grammar()
Get current grammar specification.
Definition LLM.h:134
File with basic definitions.
const char * LLM_Lora_List(LLMProvider *llm)
List LoRA adapters (C API)
Definition LLM.cpp:530
const char * LLM_Get_Grammar(LLM *llm)
Get grammar (C API)
Definition LLM.cpp:481
void LLM_Stop(LLMProvider *llm)
Stop LLM service (C API)
Definition LLM.cpp:580
void LLM_Enable_Reasoning(LLMProvider *llm, bool enable_reasoning)
Enable reasoning (C API)
Definition LLM.cpp:491
const char * LLM_Save_Slot(LLMLocal *llm, int id_slot, const char *filepath)
Save slot state (C API)
Definition LLM.cpp:496
void LLM_Logging_Callback(CharArrayFn callback)
Set global logging callback (C API)
Definition LLM.cpp:413
const char * LLM_Load_Slot(LLMLocal *llm, int id_slot, const char *filepath)
Load slot state (C API)
Definition LLM.cpp:501
void LLM_Join_Service(LLMProvider *llm)
Wait for service to complete (C API)
Definition LLM.cpp:560
void LLM_Set_SSL(LLMProvider *llm, const char *SSL_cert, const char *SSL_key)
Set SSL configuration (C API)
Definition LLM.cpp:585
bool Has_GPU_Layers(const char *command)
Check if command has GPU layers (C API)
Definition LLM.cpp:398
const char * LLM_Status_Message()
Get last operation status message (C API)
Definition LLM.cpp:595
void LLM_Set_Grammar(LLM *llm, const char *grammar="")
Set grammar (C API)
Definition LLM.cpp:476
const char * LLM_Apply_Template(LLM *llm, const char *messages_as_json)
Apply chat template (C API)
Definition LLM.cpp:486
void LLM_Cancel(LLMLocal *llm, int id_slot)
Cancel request (C API)
Definition LLM.cpp:506
void LLM_Logging_Stop()
Stop global logging (C API)
Definition LLM.cpp:423
void LLM_Start(LLMProvider *llm)
Start LLM service (C API)
Definition LLM.cpp:570
void LLM_Set_Completion_Parameters(LLM *llm, const char *params_json="{}")
Set completion parameters (C API)
Definition LLM.cpp:465
const int LLM_Embedding_Size(LLMProvider *llm)
Get embedding vector size (C API)
Definition LLM.cpp:601
void LLM_Delete(LLMProvider *llm)
Delete LLM provider (C API)
Definition LLM.cpp:542
void LLM_Debug(int debug_level)
Set global debug level (C API)
Definition LLM.cpp:403
bool LLM_Lora_Weight(LLMProvider *llm, const char *loras_as_json)
Configure LoRA weights (C API)
Definition LLM.cpp:511
const char * LLM_Completion(LLM *llm, const char *prompt, CharArrayFn callback=nullptr, int id_slot=-1, bool return_response_json=false)
Generate completion (C API)
Definition LLM.cpp:460
const char * LLM_Tokenize(LLM *llm, const char *query)
Tokenize text (C API)
Definition LLM.cpp:443
void LLM_Join_Server(LLMProvider *llm)
Wait for server to complete (C API)
Definition LLM.cpp:565
const bool LLM_Started(LLMProvider *llm)
Check if service is started (C API)
Definition LLM.cpp:575
const char * LLM_Get_Completion_Parameters(LLM *llm)
Get completion parameters (C API)
Definition LLM.cpp:471
const char * LLM_Embeddings(LLM *llm, const char *query)
Generate embeddings (C API)
Definition LLM.cpp:454
const int LLM_Status_Code()
Get last operation status code (C API)
Definition LLM.cpp:590
void LLM_Stop_Server(LLMProvider *llm)
Stop HTTP server (C API)
Definition LLM.cpp:555
const char * LLM_Detokenize(LLM *llm, const char *tokens_as_json)
Detokenize tokens (C API)
Definition LLM.cpp:449
void LLM_Start_Server(LLMProvider *llm, const char *host="0.0.0.0", int port=-1, const char *API_key="")
Start HTTP server (C API)
Definition LLM.cpp:550
Structure representing a LoRA adapter with ID, scale, and file path.
Definition LLM.h:36
bool operator==(const LoraIdScalePath &other) const
Equality comparison operator.
Definition LLM.h:44
std::string path
Filesystem path to the LoRA adapter file.
Definition LLM.h:39
int id
Unique identifier for the LoRA adapter.
Definition LLM.h:37
float scale
Scale factor for the LoRA adapter.
Definition LLM.h:38
Structure representing a LoRA adapter with ID and scale.
Definition LLM.h:20
bool operator==(const LoraIdScale &other) const
Equality comparison operator.
Definition LLM.h:27
float scale
Scale factor for the LoRA adapter (typically 0.0 to 1.0)
Definition LLM.h:22
int id
Unique identifier for the LoRA adapter.
Definition LLM.h:21