#include <condition_variable>
#include "completion_processor.h"
#define LLAMALIB_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, -1, __VA_ARGS__)
struct server_http_context;
struct server_http_req;
struct server_http_res;

using server_http_res_ptr = std::unique_ptr<server_http_res>;
using handler_t = std::function<server_http_res_ptr(const server_http_req & req)>;
LLMService(
    const std::string &model_path,
    int num_slots = 1,
    int num_threads = -1,
    int num_GPU_layers = 0,
    bool flash_attention = false,
    int context_size = 4096,
    int batch_size = 2048,
    bool embedding_only = false,
    const std::vector<std::string> &lora_paths = {});
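A minimal construction sketch based on the signature above; the model path and parameter values are placeholders, and start()/join_service() are the overrides listed later in this file.

    LLMService service("model.gguf", /*num_slots=*/1, /*num_threads=*/-1, /*num_GPU_layers=*/0);
    service.start();          // start the LLM service
    service.join_service();   // block until the service thread finishes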
static LLMService *from_params(const json &params_json);

static std::vector<char *> jsonToArguments(const json &params_json);
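A hedged sketch of JSON-driven construction via from_params; the key names used here ("model", "n_gpu_layers") are illustrative assumptions, not defined by this header.

    json params = { {"model", "model.gguf"}, {"n_gpu_layers", 0} };
    LLMService *service = LLMService::from_params(params);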
void init(int argc, char **argv);

void init(const std::string &params_string);

void init(const char *params_string);
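An illustrative call to the string-based init overload on a default-constructed instance; the flag syntax is assumed to follow llama.cpp-style command-line arguments and is not confirmed by this header.

    LLMService service;                      // default constructor
    service.init("-m model.gguf -c 4096");   // assumed llama.cpp-style flags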
std::string encapsulate_route(const json &body, handler_t route_handler);
std::string completion_json(const json &data, CharArrayFn callback = nullptr, bool callbackWithJSON = true) override;
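A minimal completion sketch using the default (null) callback and the service instance constructed above; the request field names ("prompt", "n_predict") are assumptions modelled on the llama.cpp server API.

    json request = { {"prompt", "Hello"}, {"n_predict", 64} };
    std::string result = service.completion_json(request);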
void start_server(const std::string &host = "0.0.0.0", int port = -1, const std::string &API_key = "") override;
void set_SSL(const std::string &SSL_cert, const std::string &SSL_key) override;
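A sketch of serving over HTTPS with the two calls above, again assuming an existing instance named service; the certificate and key paths, port, and API key are placeholders.

    service.set_SSL("server.crt", "server.key");
    service.start_server("0.0.0.0", 8443, "my-api-key");
    service.join_server();   // wait for the server thread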
void debug(int debug_level) override;
std::string command = "";
common_params *params;
bool llama_backend_has_init;
server_context *ctx_server = nullptr;
server_http_context *ctx_http = nullptr;
server_routes *routes = nullptr;

std::mutex start_stop_mutex;
std::thread service_thread;
std::condition_variable service_stopped_cv;
bool service_stopped = false;
std::thread server_thread;
std::condition_variable server_stopped_cv;
bool server_stopped = false;

int next_available_slot = 0;
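The mutex, condition_variable, and boolean flag pairs above point to a standard stop-and-wait idiom; the fragment below is a generic illustration of that idiom with hypothetical helper functions, not the library's actual shutdown code.

    #include <condition_variable>
    #include <mutex>

    // Waiting side: block until the flag is set and the condition variable is notified.
    void wait_until_stopped(std::mutex &m, std::condition_variable &cv, bool &stopped) {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock, [&] { return stopped; });
    }

    // Signalling side: set the flag under the lock, then notify waiters.
    void signal_stopped(std::mutex &m, std::condition_variable &cv, bool &stopped) {
        { std::lock_guard<std::mutex> lock(m); stopped = true; }
        cv.notify_all();
    }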
std::vector<std::string> splitArguments(const std::string &inputString);

const std::string detect_chat_template();

server_http_req escape_reasoning(server_http_req prompt);
UNDREAMAI_API LLMService *LLMService_Construct(
    const char *model_path,
    int num_slots = 1,
    int num_threads = -1,
    int num_GPU_layers = 0,
    bool flash_attention = false,
    int context_size = 4096,
    int batch_size = 2048,
    bool embedding_only = false,
    int lora_count = 0,
    const char **lora_paths = nullptr);
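A C-API construction sketch built from the declaration above; the model path is a placeholder, and LLMService_Command is the accessor documented later in this file.

    LLMService *service = LLMService_Construct("model.gguf", 1, -1, 0, false, 4096, 2048, false, 0, nullptr);
    const char *command = LLMService_Command(service);   // retrieve the construct command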
UNDREAMAI_API void LLMService_InjectErrorState(ErrorState *error_state);
Core LLM functionality interface and base classes.
Registry for managing LLM provider instances.
Abstract class for LLM service providers.
virtual void enable_reasoning(bool reasoning)
Enable or disable reasoning.
Runtime loader for LLM libraries.
void stop_server() override
Stop HTTP server (override)
static LLMService * from_command(int argc, char **argv)
Create LLMService from argc/argv.
std::string debug_implementation() override
Implementation debugging.
std::string lora_weight_json(const json &data) override
Configure LoRA weights with HTTP response support.
void join_service() override
Wait for service thread completion (override)
void cancel(int id_slot) override
Cancel running request (override)
bool started() override
Check service status (override)
void start() override
Start the LLM service (override)
std::string lora_list_json() override
List available LoRA adapters.
void logging_callback(CharArrayFn callback) override
Set logging callback (override)
void set_SSL(const std::string &SSL_cert, const std::string &SSL_key) override
Configure SSL certificates (override)
std::string tokenize_json(const json &data) override
Tokenize input (override)
LLMService(const std::string &model_path, int num_slots=1, int num_threads=-1, int num_GPU_layers=0, bool flash_attention=false, int context_size=4096, int batch_size=2048, bool embedding_only=false, const std::vector< std::string > &lora_paths={})
Parameterized constructor.
LLMService()
Default constructor.
std::string slot_json(const json &data) override
Manage slots with HTTP response support.
std::string detokenize_json(const json &data) override
Convert tokens back to text.
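A tokenize/detokenize round-trip sketch using the two JSON helpers above, assuming an existing LLMService instance named service; the field names ("content", "tokens") are assumptions modelled on the llama.cpp server endpoints.

    json tok_req = { {"content", "Hello world"} };
    std::string tok_reply = service.tokenize_json(tok_req);      // reply assumed to carry a token array

    json detok_req = { {"tokens", json::array({1, 2, 3})} };     // illustrative token ids
    std::string detok_reply = service.detokenize_json(detok_req);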
std::string embeddings_json(const json &data) override
Generate embeddings with HTTP response support.
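A hedged embeddings sketch, again assuming an instance named service; the "content" field name is an assumption, while embedding_size() is the accessor listed below.

    json emb_req = { {"content", "vector me"} };
    std::string emb_reply = service.embeddings_json(emb_req);
    int dims = service.embedding_size();   // dimensionality of the returned vectors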
int get_next_available_slot() override
Get available processing slot (override)
void debug(int debug_level) override
Set debug level (override)
void join_server() override
Wait for server thread completion (override)
std::string apply_template_json(const json &data) override
Apply a chat template to message data.
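An illustrative chat-template call on an existing instance named service; the OpenAI-style "messages" layout is an assumption about the expected input, not documented here.

    json chat = { {"messages", json::array({
        { {"role", "user"}, {"content", "Hi there"} }
    })} };
    std::string prompt = service.apply_template_json(chat);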
void start_server(const std::string &host="0.0.0.0", int port=-1, const std::string &API_key="") override
Start HTTP server (override)
std::string completion_json(const json &data, CharArrayFn callback=nullptr, bool callbackWithJSON=true) override
Generate completion (override)
std::string get_command()
Returns the construct command.
static LLMService * from_command(const std::string &command)
Create LLMService from command line string.
void stop() override
Stop the LLM service (override)
int embedding_size() override
Get embedding vector dimensions (override)
LLMService * LLMService_Construct(const char *model_path, int num_slots=1, int num_threads=-1, int num_GPU_layers=0, bool flash_attention=false, int context_size=4096, int batch_size=2048, bool embedding_only=false, int lora_count=0, const char **lora_paths=nullptr)
Construct LLMService instance (C API)
void LLMService_Registry(LLMProviderRegistry *existing_instance)
Set registry for LLMService (C API)
const char * LLMService_Command(LLMService *llm_service)
Returns the construct command (C API)
LLMService * LLMService_From_Command(const char *params_string)
Create LLMService from command string (C API)
Error state container for sharing between libraries.