LlamaLib v2.0.2
Cross-platform library for local LLMs
LLM_service.h
#pragma once

#include <thread>
#include <condition_variable>

#include "LLM.h"                    // core LLM functionality interface and base classes
#include "completion_processor.h"

#define LLAMALIB_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, -1, __VA_ARGS__)

struct common_params;
struct server_context;
struct server_http_context;
struct server_routes;
struct server_http_req;
struct server_http_res;

using server_http_res_ptr = std::unique_ptr<server_http_res>;
using handler_t = std::function<server_http_res_ptr(const server_http_req &req)>;

/// LLM service implementation of the abstract LLMProvider interface (LLM.h).
class UNDREAMAI_API LLMService : public LLMProvider
{
public:
    /// Default constructor.
    LLMService();

    /// Parameterized constructor.
    LLMService(const std::string &model_path, int num_slots = 1, int num_threads = -1, int num_GPU_layers = 0, bool flash_attention = false, int context_size = 4096, int batch_size = 2048, bool embedding_only = false, const std::vector<std::string> &lora_paths = {});

    /// Destructor.
    ~LLMService();

    /// Create LLMService from JSON parameters.
    static LLMService *from_params(const json &params_json);

    /// Create LLMService from command line string.
    static LLMService *from_command(const std::string &command);

    /// Create LLMService from argc/argv.
    static LLMService *from_command(int argc, char **argv);

    /// Convert JSON parameters to argv-style arguments.
    static std::vector<char *> jsonToArguments(const json &params_json);

    void init(int argc, char **argv);

    void init(const std::string &params_string);

    void init(const char *params_string);

    /// Returns the construct command.
    std::string get_command() { return command; }

    std::string encapsulate_route(const json &body, handler_t route_handler);

    //=================================== LLM METHODS START ===================================//

    /// Enable reasoning.
    void enable_reasoning(bool reasoning) override;

    /// Tokenize input (override).
    std::string tokenize_json(const json &data) override;

    /// Convert tokens back to text.
    std::string detokenize_json(const json &data) override;

    /// Generate embeddings with HTTP response support.
    std::string embeddings_json(const json &data) override;

    /// Apply a chat template to message data.
    std::string apply_template_json(const json &data) override;

    /// Generate completion (override).
    std::string completion_json(const json &data, CharArrayFn callback = nullptr, bool callbackWithJSON = true) override;

    /// Manage slots with HTTP response support.
    std::string slot_json(const json &data) override;

    /// Configure LoRA weights with HTTP response support.
    std::string lora_weight_json(const json &data) override;

    /// List available LoRA adapters.
    std::string lora_list_json() override;

    /// Cancel running request (override).
    void cancel(int id_slot) override;

    /// Start the LLM service (override).
    void start() override;

    /// Check service status (override).
    bool started() override;

    /// Stop the LLM service (override).
    void stop() override;

    /// Start HTTP server (override).
    void start_server(const std::string &host = "0.0.0.0", int port = -1, const std::string &API_key = "") override;

    /// Stop HTTP server (override).
    void stop_server() override;

    /// Wait for service thread completion (override).
    void join_service() override;

    /// Wait for server thread completion (override).
    void join_server() override;

    /// Configure SSL certificates (override).
    void set_SSL(const std::string &SSL_cert, const std::string &SSL_key) override;

    /// Get embedding vector dimensions (override).
    int embedding_size() override;

    /// Get available processing slot (override).
    int get_next_available_slot() override;

    /// Set debug level (override).
    void debug(int debug_level) override;

    /// Set logging callback (override).
    void logging_callback(CharArrayFn callback) override;

    /// Implementation debugging.
    std::string debug_implementation() override { return "standalone"; }
    //=================================== LLM METHODS END ===================================//

private:
    std::string command = "";
    common_params *params;
    bool llama_backend_has_init;
    server_context *ctx_server = nullptr;
    server_http_context *ctx_http = nullptr;
    server_routes *routes = nullptr;

    std::mutex start_stop_mutex;
    std::thread service_thread;
    std::condition_variable service_stopped_cv;
    bool service_stopped = false;
    std::thread server_thread;
    std::condition_variable server_stopped_cv;
    bool server_stopped = false;

    int next_available_slot = 0;

    std::vector<std::string> splitArguments(const std::string &inputString);

    const std::string detect_chat_template();

    server_http_req escape_reasoning(server_http_req prompt);
};

extern "C"
{
    /// Set registry for LLMService (C API).
    UNDREAMAI_API void LLMService_Registry(LLMProviderRegistry *existing_instance);

    /// Construct LLMService instance (C API).
    UNDREAMAI_API LLMService *LLMService_Construct(const char *model_path, int num_slots = 1, int num_threads = -1, int num_GPU_layers = 0, bool flash_attention = false, int context_size = 4096, int batch_size = 2048, bool embedding_only = false, int lora_count = 0, const char **lora_paths = nullptr);

    /// Create LLMService from command string (C API).
    UNDREAMAI_API LLMService *LLMService_From_Command(const char *params_string);

    /// Returns the construct command (C API).
    UNDREAMAI_API const char *LLMService_Command(LLMService *llm_service);

    /// Inject an ErrorState (error state container for sharing between libraries).
    UNDREAMAI_API void LLMService_InjectErrorState(ErrorState *error_state);
}
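The class above can be driven directly from C++. The following is a minimal, illustrative sketch based only on the member signatures declared in this header; the model path and the JSON request fields ("prompt", "n_predict") are assumptions that follow llama.cpp server conventions and are not documented here.

// Minimal usage sketch (not part of the library sources).
// Assumes `json` is the nlohmann-style JSON type used throughout the API,
// and that the request schema matches the llama.cpp server ("prompt", "n_predict").
#include "LLM_service.h"
#include <cstdio>

int main()
{
    // placeholder model path; single slot, automatic thread count, CPU only
    LLMService llm("model.gguf", /*num_slots*/ 1, /*num_threads*/ -1, /*num_GPU_layers*/ 0);

    llm.start();                              // start the LLM service
    // llm.start_server("0.0.0.0", 8080);     // optionally expose it over HTTP as well

    json request = {{"prompt", "Hello"}, {"n_predict", 32}};
    std::string reply = llm.completion_json(request);
    std::printf("%s\n", reply.c_str());

    llm.stop();                               // stop the service
    return 0;
}

completion_json also accepts a CharArrayFn callback for streaming partial results; the callback type is declared outside this file (presumably in LLM.h), so it is omitted from the sketch.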
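The extern "C" entry points can be exercised the same way from a C++ translation unit. This is again an illustrative sketch: the placeholder model path and the llama.cpp-style flags passed to LLMService_From_Command are assumptions, since this header does not document the accepted options.

// Illustrative sketch of the C API declared above (not from the library sources).
#include "LLM_service.h"
#include <cstdio>

int main()
{
    // construct a service from an explicit (placeholder) model path
    LLMService *service = LLMService_Construct("model.gguf", /*num_slots*/ 1);
    std::printf("construct command: %s\n", LLMService_Command(service));

    // alternatively, build it from a single command string; the flag names
    // below are assumptions modelled on llama.cpp server options
    // LLMService *service2 = LLMService_From_Command("-m model.gguf -c 4096");

    // no dedicated destroy function is declared in this header, so cleanup
    // goes through the C++ destructor here
    delete service;
    return 0;
}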