LlamaLib  v2.0.2
Cross-platform library for local LLMs
LLM.h
1
6
7#pragma once
8
9#include "defs.h"
10#include "error_handling.h"
11#include <sstream>
12#if defined(__APPLE__)
13#include <TargetConditionals.h>
14#endif
15
16
19struct LoraIdScale
20{
21 int id;
22 float scale;
23
27 bool operator==(const LoraIdScale &other) const
28 {
29 return id == other.id && scale == other.scale;
30 }
31};
32
35struct LoraIdScalePath
36{
37 int id;
38 float scale;
39 std::string path;
40
44 bool operator==(const LoraIdScalePath &other) const
45 {
46 return id == other.id && scale == other.scale && path == other.path;
47 }
48};
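Both structures are plain aggregates, so they can be brace-initialised directly. A minimal sketch, with purely illustrative ids and scales, of building the list that LLMProvider::lora_weight (declared further below) expects:

#include "LLM.h"
#include <vector>

// Illustrative only: real adapter ids come from LLMProvider::lora_list().
std::vector<LoraIdScale> make_example_lora_weights()
{
    std::vector<LoraIdScale> loras;
    loras.push_back({0, 1.0f}); // adapter 0 at full strength
    loras.push_back({1, 0.5f}); // adapter 1 at half strength
    return loras;
}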
49
54
59class UNDREAMAI_API LLM
60{
61public:
62 int32_t n_keep = 0;
63 std::string grammar = "";
64 json completion_params;
65
67 virtual ~LLM() = default;
68
72 virtual std::vector<int> tokenize(const std::string &query);
73
77 virtual std::string tokenize_json(const json &data) = 0;
78
82 virtual std::string detokenize(const std::vector<int32_t> &tokens);
83
88 virtual std::string detokenize_json(const json &data) = 0;
89
93 virtual std::vector<float> embeddings(const std::string &query);
94
99 virtual std::string embeddings_json(const json &data) = 0;
100
104 // See https://github.com/ggml-org/llama.cpp/tree/master/tools/server#post-completion-given-a-prompt-it-returns-the-predicted-completion for the different parameters
105 virtual void set_completion_params(json completion_params_) { completion_params = completion_params_; }
106
109 virtual std::string get_completion_params() { return completion_params; }
110
117 virtual std::string completion(const std::string &prompt, CharArrayFn callback = nullptr, int id_slot = -1, bool return_response_json = false);
118
125 virtual std::string completion_json(const json &data, CharArrayFn callback, bool callbackWithJSON) = 0;
126
130 virtual void set_grammar(std::string grammar_) { grammar = grammar_; }
131
134 virtual std::string get_grammar() { return grammar; }
135
139 virtual std::string apply_template(const json &messages);
140
145 virtual std::string apply_template_json(const json &data) = 0;
146
150 static bool has_gpu_layers(const std::string &command);
151
163 static std::string LLM_args_to_command(const std::string &model_path, int num_slots = 1, int num_threads = -1, int num_GPU_layers = 0, bool flash_attention = false, int context_size = 4096, int batch_size = 2048, bool embedding_only = false, const std::vector<std::string> &lora_paths = {});
164
165protected:
169 virtual json build_apply_template_json(const json &messages);
170
174 virtual std::string parse_apply_template_json(const json &result);
175
179 virtual json build_tokenize_json(const std::string &query);
180
184 virtual std::vector<int> parse_tokenize_json(const json &result);
185
189 virtual json build_detokenize_json(const std::vector<int32_t> &tokens);
190
194 virtual std::string parse_detokenize_json(const json &result);
195
199 virtual json build_embeddings_json(const std::string &query);
200
204 virtual std::vector<float> parse_embeddings_json(const json &result);
205
210 virtual json build_completion_json(const std::string &prompt, int id_slot = -1);
211
215 virtual std::string parse_completion_json(const json &result);
216};
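A minimal usage sketch of the interface above, assuming a concrete LLM instance is obtained elsewhere in the library and that the json alias is the nlohmann-style type used throughout this header; the helper name is hypothetical:

#include "LLM.h"
#include <iostream>
#include <string>
#include <vector>

// Hypothetical helper exercising the tokenize/detokenize/completion round trip.
void run_basic_llm_calls(LLM *llm)
{
    std::vector<int> tokens = llm->tokenize("Hello world");
    std::string text = llm->detokenize(std::vector<int32_t>(tokens.begin(), tokens.end()));

    llm->set_grammar("");                             // no grammar constraint
    llm->set_completion_params({{"n_predict", 64}});  // llama.cpp-server style parameter

    // Blocking completion on an auto-selected slot, plain-text response.
    std::string reply = llm->completion("Write a haiku about llamas.");
    std::cout << text << "\n" << reply << std::endl;
}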
217
221class UNDREAMAI_API LLMLocal : public LLM
222{
223public:
226 virtual int get_next_available_slot() = 0;
227
232 virtual std::string save_slot(int id_slot, const std::string &filepath) { return slot(id_slot, "save", filepath); }
233
238 virtual std::string load_slot(int id_slot, const std::string &filepath) { return slot(id_slot, "restore", filepath); }
239
242 virtual void cancel(int id_slot) = 0;
243
248 virtual std::string slot_json(const json &data) = 0;
249
250protected:
256 virtual std::string slot(int id_slot, const std::string &action, const std::string &filepath);
257
263 virtual json build_slot_json(int id_slot, const std::string &action, const std::string &filepath);
264
268 virtual std::string parse_slot_json(const json &result);
269};
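A sketch of the slot-management calls, assuming a concrete LLMLocal implementation is available; the helper name and checkpoint file are hypothetical:

#include "LLM.h"
#include <string>

// Hypothetical helper: save a slot's state to disk and restore it later.
void checkpoint_slot(LLMLocal *llm, const std::string &cache_file)
{
    int slot = llm->get_next_available_slot();
    llm->completion("Summarise this document ...", nullptr, slot);
    llm->save_slot(slot, cache_file);   // persist the slot state

    // Later: continue from the saved state instead of reprocessing the prompt.
    llm->load_slot(slot, cache_file);
    llm->cancel(slot);                  // abort any in-flight request on this slot
}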
270
274class UNDREAMAI_API LLMProvider : public LLMLocal
275{
276public:
278 virtual ~LLMProvider();
279
283 virtual bool lora_weight(const std::vector<LoraIdScale> &loras);
284
289 virtual std::string lora_weight_json(const json &data) = 0;
290
293 virtual std::vector<LoraIdScalePath> lora_list();
294
297 virtual std::string lora_list_json() = 0;
298
301 virtual void enable_reasoning(bool reasoning) { reasoning_enabled = reasoning; }
302
305 virtual void debug(int debug_level) = 0;
306
309 virtual void logging_callback(CharArrayFn callback) = 0;
310
312 virtual void logging_stop();
313
315 virtual void start() = 0;
316
319 virtual bool started() = 0;
320
322 virtual void stop() = 0;
323
328 virtual void start_server(const std::string &host = "0.0.0.0", int port = -1, const std::string &API_key = "") = 0;
329
331 virtual void stop_server() = 0;
332
334 virtual void join_service() = 0;
335
337 virtual void join_server() = 0;
338
342 virtual void set_SSL(const std::string &SSL_cert, const std::string &SSL_key) = 0;
343
346 virtual int embedding_size() = 0;
347
350 virtual std::string debug_implementation() = 0;
351
352protected:
353 bool reasoning_enabled = false;
354
358 virtual bool parse_lora_weight_json(const json &result);
359
363 virtual json build_lora_weight_json(const std::vector<LoraIdScale> &loras);
364
368 virtual std::vector<LoraIdScalePath> parse_lora_list_json(const json &result);
369
373 virtual json build_lora_list_json(const std::vector<LoraIdScalePath> &loras);
374};
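A lifecycle sketch for a provider created elsewhere in the library; the host, port, and LoRA scale below are illustrative, not defaults:

#include "LLM.h"
#include <string>
#include <vector>

// Hypothetical helper: start the service, optionally expose it over HTTP, adjust LoRAs, shut down.
void run_provider(LLMProvider *provider)
{
    provider->debug(1);
    provider->enable_reasoning(true);

    provider->start();
    if (!provider->started())
        return;

    provider->start_server("127.0.0.1", 8080);

    // Re-weight the first available LoRA adapter, if any.
    std::vector<LoraIdScalePath> available = provider->lora_list();
    if (!available.empty())
        provider->lora_weight({{available[0].id, 0.8f}});

    provider->stop_server();
    provider->stop();
}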
375
379class UNDREAMAI_API LLMProviderRegistry
380{
381public:
382 static bool initialised;
383
387 static void inject_registry(LLMProviderRegistry *instance)
388 {
389 custom_instance_ = instance;
390 initialised = true;
391 }
392
395 static LLMProviderRegistry &instance()
396 {
397 if (custom_instance_)
398 return *custom_instance_;
399
400 static LLMProviderRegistry registry;
401 initialised = true;
402 return registry;
403 }
404
408 void register_instance(LLMProvider *service)
409 {
410 std::lock_guard<std::mutex> lock(mutex_);
411 instances_.push_back(service);
412 }
413
417 void unregister_instance(LLMProvider *service)
418 {
419 std::lock_guard<std::mutex> lock(mutex_);
420 instances_.erase(std::remove(instances_.begin(), instances_.end(), service), instances_.end());
421 }
422
426 std::vector<LLMProvider *> get_instances()
427 {
428 std::lock_guard<std::mutex> lock(mutex_);
429 return instances_;
430 }
431
434 void set_debug_level(int level)
435 {
436 debug_level_ = level;
437 }
438
441 const int get_debug_level()
442 {
443 return debug_level_;
444 }
445
448 void set_log_callback(CharArrayFn callback)
449 {
450 log_callback_ = callback;
451 }
452
455 const CharArrayFn get_log_callback()
456 {
457 return log_callback_;
458 }
459
460private:
461 static LLMProviderRegistry *custom_instance_;
462
463 std::mutex mutex_;
464 std::vector<LLMProvider *> instances_;
465 int debug_level_ = 0;
466 CharArrayFn log_callback_ = nullptr;
467
469 LLMProviderRegistry() = default;
471 ~LLMProviderRegistry() = default;
475 LLMProviderRegistry &operator=(const LLMProviderRegistry &) = delete;
476};
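A sketch of how the registry's global settings and instance list might be used; CharArrayFn comes from defs.h and the helper name is hypothetical:

#include "LLM.h"

// Hypothetical helper: push a log callback and debug level to the registry and to registered providers.
void configure_global_logging(CharArrayFn callback)
{
    LLMProviderRegistry &registry = LLMProviderRegistry::instance();
    registry.set_debug_level(2);
    registry.set_log_callback(callback);

    // Forward the callback to any providers currently registered.
    for (LLMProvider *provider : registry.get_instances())
        provider->logging_callback(callback);
}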
477
480
481extern "C"
482{
486 UNDREAMAI_API bool Has_GPU_Layers(const char *command);
487
490 UNDREAMAI_API void LLM_Debug(int debug_level);
491
494 UNDREAMAI_API void LLM_Logging_Callback(CharArrayFn callback);
495
497 UNDREAMAI_API void LLM_Logging_Stop();
498
499#ifdef _DEBUG
502 UNDREAMAI_API const bool IsDebuggerAttached(void);
503#endif
504
508 UNDREAMAI_API void LLM_Set_Completion_Parameters(LLM *llm, const char *params_json = "{}");
509
513 UNDREAMAI_API const char *LLM_Get_Completion_Parameters(LLM *llm);
514
518 UNDREAMAI_API void LLM_Set_Grammar(LLM *llm, const char *grammar = "");
519
523 UNDREAMAI_API const char *LLM_Get_Grammar(LLM *llm);
524
529 UNDREAMAI_API const char *LLM_Apply_Template(LLM *llm, const char *messages_as_json);
530
535 UNDREAMAI_API const char *LLM_Tokenize(LLM *llm, const char *query);
536
541 UNDREAMAI_API const char *LLM_Detokenize(LLM *llm, const char *tokens_as_json);
542
547 UNDREAMAI_API const char *LLM_Embeddings(LLM *llm, const char *query);
548
556 UNDREAMAI_API const char *LLM_Completion(LLM *llm, const char *prompt, CharArrayFn callback = nullptr, int id_slot = -1, bool return_response_json = false);
557
563 UNDREAMAI_API const char *LLM_Save_Slot(LLMLocal *llm, int id_slot, const char *filepath);
564
570 UNDREAMAI_API const char *LLM_Load_Slot(LLMLocal *llm, int id_slot, const char *filepath);
571
575 UNDREAMAI_API void LLM_Cancel(LLMLocal *llm, int id_slot);
576
581 UNDREAMAI_API bool LLM_Lora_Weight(LLMProvider *llm, const char *loras_as_json);
582
586 UNDREAMAI_API void LLM_Enable_Reasoning(LLMProvider *llm, bool enable_reasoning);
587
591 UNDREAMAI_API const char *LLM_Lora_List(LLMProvider *llm);
592
595 UNDREAMAI_API void LLM_Delete(LLMProvider *llm);
596
599 UNDREAMAI_API void LLM_Start(LLMProvider *llm);
600
604 UNDREAMAI_API const bool LLM_Started(LLMProvider *llm);
605
608 UNDREAMAI_API void LLM_Stop(LLMProvider *llm);
609
615 UNDREAMAI_API void LLM_Start_Server(LLMProvider *llm, const char *host = "0.0.0.0", int port = -1, const char *API_key = "");
616
619 UNDREAMAI_API void LLM_Stop_Server(LLMProvider *llm);
620
623 UNDREAMAI_API void LLM_Join_Service(LLMProvider *llm);
624
627 UNDREAMAI_API void LLM_Join_Server(LLMProvider *llm);
628
633 UNDREAMAI_API void LLM_Set_SSL(LLMProvider *llm, const char *SSL_cert, const char *SSL_key);
634
637 UNDREAMAI_API const int LLM_Status_Code();
638
641 UNDREAMAI_API const char *LLM_Status_Message();
642
646 UNDREAMAI_API const int LLM_Embedding_Size(LLMProvider *llm);
647}
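A usage sketch of the flat C-style API above, written from C++ so the default arguments apply; the provider pointer is assumed to come from a factory declared elsewhere in the library:

#include "LLM.h"
#include <cstdio>

// Hypothetical caller of the exported C API.
void c_api_sketch(LLMProvider *provider)
{
    LLM_Debug(1);
    LLM_Start(provider);
    if (!LLM_Started(provider))
        return;

    const char *tokens = LLM_Tokenize(provider, "hello");
    const char *reply = LLM_Completion(provider, "Tell me a joke.");
    std::printf("status %d: %s\n", LLM_Status_Code(), LLM_Status_Message());
    std::printf("%s\n%s\n", tokens, reply);

    LLM_Stop(provider);
    LLM_Delete(provider);
}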
648
649
void ensure_error_handlers_initialized()
Ensures error handlers are properly initialized.
Definition LLM.cpp:25
Abstract class for local LLM operations with slot management.
Definition LLM.h:222
virtual std::string slot_json(const json &data)=0
Manage slots with HTTP response support.
virtual std::string load_slot(int id_slot, const std::string &filepath)
Load slot state from file.
Definition LLM.h:238
virtual int get_next_available_slot()=0
Get an available processing slot.
virtual std::string save_slot(int id_slot, const std::string &filepath)
Save slot state to file.
Definition LLM.h:232
virtual void cancel(int id_slot)=0
Cancel request.
Registry for managing LLM provider instances.
Definition LLM.h:380
void unregister_instance(LLMProvider *service)
Unregister an LLM provider instance.
Definition LLM.h:417
std::vector< LLMProvider * > get_instances()
Get all registered provider instances.
Definition LLM.h:426
void set_debug_level(int level)
Set global debug level.
Definition LLM.h:434
const int get_debug_level()
Get current debug level.
Definition LLM.h:441
static bool initialised
Whether the registry has been initialized.
Definition LLM.h:382
void set_log_callback(CharArrayFn callback)
Set global log callback.
Definition LLM.h:448
const CharArrayFn get_log_callback()
Get current log callback.
Definition LLM.h:455
void register_instance(LLMProvider *service)
Register an LLM provider instance.
Definition LLM.h:408
static LLMProviderRegistry & instance()
Get the singleton registry instance.
Definition LLM.h:395
static void inject_registry(LLMProviderRegistry *instance)
Inject a custom registry instance.
Definition LLM.h:387
Abstract class for LLM service providers.
Definition LLM.h:275
virtual void logging_callback(CharArrayFn callback)=0
Set logging callback function.
virtual bool started()=0
Check if service is started.
virtual void start_server(const std::string &host="0.0.0.0", int port=-1, const std::string &API_key="")=0
Start HTTP server.
virtual std::string debug_implementation()=0
Implementation debugging.
virtual void join_service()=0
Wait for service thread to complete.
virtual void stop_server()=0
Stop HTTP server.
virtual void debug(int debug_level)=0
Set debug level.
virtual void join_server()=0
Wait for server thread to complete.
virtual void enable_reasoning(bool reasoning)
Enable reasoning.
Definition LLM.h:301
virtual void stop()=0
Stop the LLM service.
virtual void set_SSL(const std::string &SSL_cert, const std::string &SSL_key)=0
Configure SSL certificates.
virtual std::string lora_list_json()=0
List available LoRA adapters.
virtual std::string lora_weight_json(const json &data)=0
Configure LoRA weights with HTTP response support.
virtual void start()=0
Start the LLM service.
virtual int embedding_size()=0
Get embedding vector size.
Abstract base class for Large Language Model operations.
Definition LLM.h:60
virtual std::string embeddings_json(const json &data)=0
Generate embeddings with HTTP response support.
virtual std::string get_completion_params()
Get current completion parameters.
Definition LLM.h:109
virtual std::string apply_template_json(const json &data)=0
Apply a chat template to message data.
virtual std::string tokenize_json(const json &data)=0
Tokenize input (override)
virtual void set_completion_params(json completion_params_)
Set completion parameters.
Definition LLM.h:105
json completion_params
JSON object containing completion parameters.
Definition LLM.h:64
virtual void set_grammar(std::string grammar_)
Set grammar for constrained generation.
Definition LLM.h:130
virtual ~LLM()=default
Virtual destructor.
virtual std::string completion_json(const json &data, CharArrayFn callback, bool callbackWithJSON)=0
Generate text completion.
virtual std::string detokenize_json(const json &data)=0
Convert tokens back to text.
virtual std::string get_grammar()
Get current grammar specification.
Definition LLM.h:134
File with basic definitions.
const char * LLM_Lora_List(LLMProvider *llm)
List LoRA adapters (C API)
Definition LLM.cpp:530
const char * LLM_Get_Grammar(LLM *llm)
Get grammar (C API)
Definition LLM.cpp:481
void LLM_Stop(LLMProvider *llm)
Stop LLM service (C API)
Definition LLM.cpp:580
void LLM_Enable_Reasoning(LLMProvider *llm, bool enable_reasoning)
Enable reasoning (C API)
Definition LLM.cpp:491
const char * LLM_Save_Slot(LLMLocal *llm, int id_slot, const char *filepath)
Save slot state (C API)
Definition LLM.cpp:496
void LLM_Logging_Callback(CharArrayFn callback)
Set global logging callback (C API)
Definition LLM.cpp:413
const char * LLM_Load_Slot(LLMLocal *llm, int id_slot, const char *filepath)
Load slot state (C API)
Definition LLM.cpp:501
void LLM_Join_Service(LLMProvider *llm)
Wait for service to complete (C API)
Definition LLM.cpp:560
void LLM_Set_SSL(LLMProvider *llm, const char *SSL_cert, const char *SSL_key)
Set SSL configuration (C API)
Definition LLM.cpp:585
bool Has_GPU_Layers(const char *command)
Check if command has GPU layers (C API)
Definition LLM.cpp:398
const char * LLM_Status_Message()
Get last operation status message (C API)
Definition LLM.cpp:595
void LLM_Set_Grammar(LLM *llm, const char *grammar="")
Set grammar (C API)
Definition LLM.cpp:476
const char * LLM_Apply_Template(LLM *llm, const char *messages_as_json)
Apply chat template (C API)
Definition LLM.cpp:486
void LLM_Cancel(LLMLocal *llm, int id_slot)
Cancel request (C API)
Definition LLM.cpp:506
void LLM_Logging_Stop()
Stop global logging (C API)
Definition LLM.cpp:423
void LLM_Start(LLMProvider *llm)
Start LLM service (C API)
Definition LLM.cpp:570
void LLM_Set_Completion_Parameters(LLM *llm, const char *params_json="{}")
Set completion parameters (C API)
Definition LLM.cpp:465
const int LLM_Embedding_Size(LLMProvider *llm)
Get embedding vector size (C API)
Definition LLM.cpp:601
void LLM_Delete(LLMProvider *llm)
Delete LLM provider (C API)
Definition LLM.cpp:542
void LLM_Debug(int debug_level)
Set global debug level (C API)
Definition LLM.cpp:403
bool LLM_Lora_Weight(LLMProvider *llm, const char *loras_as_json)
Configure LoRA weights (C API)
Definition LLM.cpp:511
const char * LLM_Completion(LLM *llm, const char *prompt, CharArrayFn callback=nullptr, int id_slot=-1, bool return_response_json=false)
Generate completion (C API)
Definition LLM.cpp:460
const char * LLM_Tokenize(LLM *llm, const char *query)
Tokenize text (C API)
Definition LLM.cpp:443
void LLM_Join_Server(LLMProvider *llm)
Wait for server to complete (C API)
Definition LLM.cpp:565
const bool LLM_Started(LLMProvider *llm)
Check if service is started (C API)
Definition LLM.cpp:575
const char * LLM_Get_Completion_Parameters(LLM *llm)
Get completion parameters (C API)
Definition LLM.cpp:471
const char * LLM_Embeddings(LLM *llm, const char *query)
Generate embeddings (C API)
Definition LLM.cpp:454
const int LLM_Status_Code()
Get last operation status code (C API)
Definition LLM.cpp:590
void LLM_Stop_Server(LLMProvider *llm)
Stop HTTP server (C API)
Definition LLM.cpp:555
const char * LLM_Detokenize(LLM *llm, const char *tokens_as_json)
Detokenize tokens (C API)
Definition LLM.cpp:449
void LLM_Start_Server(LLMProvider *llm, const char *host="0.0.0.0", int port=-1, const char *API_key="")
Start HTTP server (C API)
Definition LLM.cpp:550
Structure representing a LoRA adapter with ID, scale, and file path.
Definition LLM.h:36
bool operator==(const LoraIdScalePath &other) const
Equality comparison operator.
Definition LLM.h:44
std::string path
Filesystem path to the LoRA adapter file.
Definition LLM.h:39
int id
Unique identifier for the LoRA adapter.
Definition LLM.h:37
float scale
Scale factor for the LoRA adapter.
Definition LLM.h:38
Structure representing a LoRA adapter with ID and scale.
Definition LLM.h:20
bool operator==(const LoraIdScale &other) const
Equality comparison operator.
Definition LLM.h:27
float scale
Scale factor for the LoRA adapter (typically 0.0 to 1.0)
Definition LLM.h:22
int id
Unique identifier for the LoRA adapter.
Definition LLM.h:21