Documentation ¶
Overview ¶
Package llamacpp implements a client for the llama-server native API, not the OpenAI-compatible one.
It is described at https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#api-endpoints
The implementation is at https://github.com/ggml-org/llama.cpp/blob/master/tools/server/server.cpp
Index ¶
- func ProcessCompletionStream(chunks iter.Seq[CompletionStreamChunkResponse]) (iter.Seq[genai.Reply], func() (genai.Usage, [][]genai.Logprob, error))
- func ProcessStream(chunks iter.Seq[ChatStreamChunkResponse]) (iter.Seq[genai.Reply], func() (genai.Usage, [][]genai.Logprob, error))
- func Scoreboard() scoreboard.Score
- type ChatRequest
- type ChatResponse
- type ChatStreamChunkResponse
- type Client
- func (c *Client) Completion(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (genai.Result, error)
- func (c *Client) CompletionRaw(ctx context.Context, in *CompletionRequest, out *CompletionResponse) error
- func (c *Client) CompletionStream(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (iter.Seq[genai.Reply], func() (genai.Result, error))
- func (c *Client) CompletionStreamRaw(ctx context.Context, in *CompletionRequest) (iter.Seq[CompletionStreamChunkResponse], func() error)
- func (c *Client) GenStream(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (iter.Seq[genai.Reply], func() (genai.Result, error))
- func (c *Client) GenStreamRaw(ctx context.Context, in *ChatRequest) (iter.Seq[ChatStreamChunkResponse], func() error)
- func (c *Client) GenSync(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (genai.Result, error)
- func (c *Client) GenSyncRaw(ctx context.Context, in *ChatRequest, out *ChatResponse) error
- func (c *Client) GetHealth(ctx context.Context) (string, error)
- func (c *Client) GetHealthRaw(ctx context.Context) (HealthResponse, error)
- func (c *Client) GetMetrics(ctx context.Context, m *Metrics) error
- func (c *Client) HTTPClient() *http.Client
- func (c *Client) ListModels(ctx context.Context) ([]genai.Model, error)
- func (c *Client) ModelID() string
- func (c *Client) Name() string
- func (c *Client) OutputModalities() genai.Modalities
- func (c *Client) Ping(ctx context.Context) error
- func (c *Client) Scoreboard() scoreboard.Score
- type CompletionRequest
- type CompletionResponse
- type CompletionStreamChunkResponse
- type Content
- type Contents
- type ErrorResponse
- type FinishReason
- type HealthResponse
- type Logprobs
- type Lora
- type Message
- type Metrics
- type Model
- type ModelHF
- type ModelOpenAI
- type ModelsResponse
- type PromptEncoding
- type StopType
- type Timings
- type TokenPerformance
- type Tool
- type ToolCall
- type Usage
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ProcessCompletionStream ¶
func ProcessCompletionStream(chunks iter.Seq[CompletionStreamChunkResponse]) (iter.Seq[genai.Reply], func() (genai.Usage, [][]genai.Logprob, error))
ProcessCompletionStream converts the raw packets from the completion streaming API into Reply fragments.
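func ProcessStream ¶
func ProcessStream(chunks iter.Seq[ChatStreamChunkResponse]) (iter.Seq[genai.Reply], func() (genai.Usage, [][]genai.Logprob, error))
ProcessStream converts the raw packets from the chat completion streaming API into Reply fragments.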
Types ¶
type ChatRequest ¶
type ChatRequest struct {
Stream bool `json:"stream,omitzero"`
Model string `json:"model,omitzero"`
MaxTokens int64 `json:"max_tokens,omitzero"`
Messages []Message `json:"messages"`
ResponseFormat struct {
Type string `json:"type,omitzero"` // Default: "text"; "json_object", "json_schema"
JSONSchema struct {
Schema *jsonschema.Schema `json:"schema,omitzero"`
} `json:"json_schema,omitzero"`
} `json:"response_format,omitzero"`
Grammar string `json:"grammar,omitzero"`
TimingsPerToken bool `json:"timings_per_token,omitzero"`
Tools []Tool `json:"tools,omitzero"`
ToolChoice string `json:"tool_choice,omitzero"` // Default: "auto"; "none", "required"
Stop []string `json:"stop,omitzero"`
ParallelToolCalls bool `json:"parallel_tool_calls,omitzero"`
AddGenerationPrompt bool `json:"add_generation_prompt,omitzero"`
// ReasoningFormat struct{} `json:"reasoning_format,omitzero"`
// EnableThinking bool `json:"enable_thinking,omitzero"`
ChatTemplateKWArgs map[string]string `json:"chat_template_kwargs,omitzero"`
N int64 `json:"n,omitzero"` // Must be 1 anyway.
Logprobs bool `json:"logprobs,omitzero"`
TopLogprobs int64 `json:"top_logprobs,omitzero"` // Requires Logprobs:true
// Prompt string `json:"prompt"`
Temperature float64 `json:"temperature,omitzero"`
DynaTempRange float64 `json:"dynatemp_range,omitzero"`
DynaTempExponent float64 `json:"dynatemp_exponent,omitzero"`
TopK int64 `json:"top_k,omitzero"`
TopP float64 `json:"top_p,omitzero"`
MinP float64 `json:"min_p,omitzero"`
NPredict int64 `json:"n_predict,omitzero"` // Maximum number of tokens to predict
NIndent int64 `json:"n_indent,omitzero"`
NKeep int64 `json:"n_keep,omitzero"`
TypicalP float64 `json:"typical_p,omitzero"`
RepeatPenalty float64 `json:"repeat_penalty,omitzero"`
RepeatLastN int64 `json:"repeat_last_n,omitzero"`
PresencePenalty float64 `json:"presence_penalty,omitzero"`
FrequencyPenalty float64 `json:"frequency_penalty,omitzero"`
DryMultiplier float64 `json:"dry_multiplier,omitzero"`
DryBase float64 `json:"dry_base,omitzero"`
DryAllowedLength int64 `json:"dry_allowed_length,omitzero"`
DryPenaltyLastN int64 `json:"dry_penalty_last_n,omitzero"`
DrySequenceBreakers []string `json:"dry_sequence_breakers,omitzero"`
XTCProbability float64 `json:"xtc_probability,omitzero"`
XTCThreshold float64 `json:"xtc_threshold,omitzero"`
Mirostat int32 `json:"mirostat,omitzero"`
MirostatTau float64 `json:"mirostat_tau,omitzero"`
MirostatEta float64 `json:"mirostat_eta,omitzero"`
Seed int64 `json:"seed,omitzero"`
IgnoreEos bool `json:"ignore_eos,omitzero"`
LogitBias []any `json:"logit_bias,omitzero"`
Nprobs int64 `json:"n_probs,omitzero"`
MinKeep int64 `json:"min_keep,omitzero"`
TMaxPredictMS int64 `json:"t_max_predict_ms,omitzero"`
ImageData []any `json:"image_data,omitzero"`
IDSlot int64 `json:"id_slot,omitzero"`
CachePrompt bool `json:"cache_prompt,omitzero"`
ReturnTokens bool `json:"return_tokens,omitzero"`
Samplers []string `json:"samplers,omitzero"`
PostSamplingProbs bool `json:"post_sampling_probs,omitzero"`
ResponseFields []string `json:"response_fields,omitzero"`
Lora []Lora `json:"lora,omitzero"`
}
ChatRequest is not documented upstream. See how it is used in oaicompat_chat_params_parse() in https://github.com/ggml-org/llama.cpp/blob/master/tools/server/utils.hpp
func (*ChatRequest) Init ¶
Init initializes the provider-specific completion request with the generic completion request.
func (*ChatRequest) SetStream ¶
func (c *ChatRequest) SetStream(stream bool)
type ChatResponse ¶
type ChatResponse struct {
Created base.Time `json:"created"`
SystemFingerprint string `json:"system_fingerprint"`
Object string `json:"object"` // "chat.completion"
ID string `json:"id"`
Timings Timings `json:"timings"`
Usage Usage `json:"usage"`
Choices []struct {
FinishReason FinishReason `json:"finish_reason"`
Index int64 `json:"index"`
Message Message `json:"message"`
Logprobs Logprobs `json:"logprobs"`
} `json:"choices"`
Model string `json:"model"` // "gpt-3.5-turbo"
}
type ChatStreamChunkResponse ¶
type ChatStreamChunkResponse struct {
Created base.Time `json:"created"`
ID string `json:"id"`
Model string `json:"model"` // "gpt-3.5-turbo"
SystemFingerprint string `json:"system_fingerprint"`
Object string `json:"object"` // "chat.completion.chunk"
Choices []struct {
FinishReason FinishReason `json:"finish_reason"`
Index int64 `json:"index"`
Delta struct {
Role string `json:"role"`
Content string `json:"content"`
ToolCalls []ToolCall `json:"tool_calls"`
} `json:"delta"`
Logprobs Logprobs `json:"logprobs"`
} `json:"choices"`
Usage Usage `json:"usage"`
Timings Timings `json:"timings"`
}
type Client ¶
type Client struct {
base.NotImplemented
// contains filtered or unexported fields
}
Client implements genai.Provider.
func New ¶
func New(ctx context.Context, opts *genai.ProviderOptions, wrapper func(http.RoundTripper) http.RoundTripper) (*Client, error)
New creates a new client to talk to a llama-server instance.
opts.Remote defaults to "http://localhost:8080".
Automatic model selection via ModelCheap, ModelGood, or ModelSOTA is not supported; instead, the client asks llama-server which model is already loaded.
wrapper optionally wraps the HTTP transport. This is useful for HTTP recording and playback, for tweaking HTTP retries, or for throttling outgoing requests.
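A minimal sketch of connecting to an already running llama-server instance; the URL is illustrative and error handling is kept terse.

package main

import (
    "context"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    // Connect to a llama-server instance that is already listening locally.
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    // ModelID reports the model that llama-server discovered as loaded.
    log.Printf("Connected to %s serving %q", c.Name(), c.ModelID())
}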
func (*Client) Completion ¶
func (c *Client) Completion(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (genai.Result, error)
func (*Client) CompletionRaw ¶
func (c *Client) CompletionRaw(ctx context.Context, in *CompletionRequest, out *CompletionResponse) error
func (*Client) CompletionStream ¶
func (c *Client) CompletionStream(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (iter.Seq[genai.Reply], func() (genai.Result, error))
func (*Client) CompletionStreamRaw ¶
func (c *Client) CompletionStreamRaw(ctx context.Context, in *CompletionRequest) (iter.Seq[CompletionStreamChunkResponse], func() error)
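A sketch showing how CompletionStreamRaw pairs with ProcessCompletionStream to consume the native completion endpoint; the client construction, prompt, and n_predict value are illustrative.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    in := &llamacpp.CompletionRequest{
        Prompt:   "Once upon a time",
        NPredict: 32, // Maximum number of tokens to predict.
    }
    chunks, finishRaw := c.CompletionStreamRaw(ctx, in)
    // ProcessCompletionStream turns the raw chunks into genai.Reply fragments.
    replies, finish := llamacpp.ProcessCompletionStream(chunks)
    for r := range replies {
        fmt.Printf("%v", r)
    }
    // Check both the decoding side and the transport side once iteration ends.
    usage, _, err := finish()
    if err != nil {
        log.Fatal(err)
    }
    if err := finishRaw(); err != nil {
        log.Fatal(err)
    }
    fmt.Printf("\nusage: %+v\n", usage)
}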
func (*Client) GenStream ¶
func (c *Client) GenStream(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (iter.Seq[genai.Reply], func() (genai.Result, error))
GenStream implements genai.Provider.
func (*Client) GenStreamRaw ¶
func (c *Client) GenStreamRaw(ctx context.Context, in *ChatRequest) (iter.Seq[ChatStreamChunkResponse], func() error)
GenStreamRaw provides access to the raw API.
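A sketch of streaming with the high-level GenStream; the prompt and options are illustrative.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    msgs := genai.Messages{
        genai.NewTextMessage("Tell me a one sentence story."),
    }
    opts := genai.OptionsText{MaxTokens: 100}
    replies, finish := c.GenStream(ctx, msgs, &opts)
    for r := range replies {
        // Print each reply fragment as it arrives.
        fmt.Printf("%v", r)
    }
    res, err := finish()
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("\nFull response: %s\n", res.String())
}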
func (*Client) GenSync ¶
func (c *Client) GenSync(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (genai.Result, error)
GenSync implements genai.Provider.
Example ¶
package main

import (
    "context"
    "fmt"
    "log"
    "os"
    "path/filepath"
    "strings"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
    "github.com/maruel/genai/providers/llamacpp/llamacppsrv"
    "github.com/maruel/huggingface"
)

func main() {
    // Download and start the server.
    ctx := context.Background()
    // Start a server with a minimalist model: Qwen2 0.5B in Q2_K quantization.
    srv, err := startServer(ctx, "Qwen", "Qwen2-0.5B-Instruct-GGUF", "qwen2-0_5b-instruct-q2_k.gguf", "")
    if err != nil {
        log.Print(err)
        return
    }
    defer srv.Close()

    // Connect the provider.
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: srv.URL(), Model: genai.ModelNone}, nil)
    if err != nil {
        log.Print(err)
        return
    }
    msgs := genai.Messages{
        genai.NewTextMessage("Say hello. Reply with only one word."),
    }
    opts := genai.OptionsText{
        Seed:        1,
        Temperature: 0.01,
        MaxTokens:   50,
    }
    resp, err := c.GenSync(ctx, msgs, &opts)
    if err != nil {
        log.Print(err)
        return
    }
    log.Printf("Raw response: %#v", resp)
    // Normalize some of the variance. Obviously many models will still fail this test.
    fmt.Printf("Response: %s\n", strings.TrimRight(strings.TrimSpace(strings.ToLower(resp.String())), ".!"))
    // Disabled because it's slow in CI, especially on Windows.
    // // Output: Response: hello
}

// startServer starts a server.
func startServer(ctx context.Context, author, repo, modelfile, multimodal string) (*llamacppsrv.Server, error) {
    cache, err := filepath.Abs("testdata/tmp")
    if err != nil {
        return nil, err
    }
    if err = os.MkdirAll(cache, 0o755); err != nil {
        return nil, err
    }
    // It's a bit inefficient to download from github every single time.
    exe, err := llamacppsrv.DownloadRelease(ctx, cache, llamacppsrv.BuildNumber)
    if err != nil {
        return nil, err
    }
    // llama.cpp now knows how to pull from huggingface but this was not integrated yet, so pull a model
    // manually.
    hf, err := huggingface.New("")
    if err != nil {
        return nil, err
    }
    modelPath, err := hf.EnsureFile(ctx, huggingface.ModelRef{Author: author, Repo: repo}, "HEAD", modelfile)
    if err != nil {
        return nil, err
    }
    extraArgs := []string{"--no-warmup", "--jinja", "--flash-attn", "--cache-type-k", "q8_0", "--cache-type-v", "q8_0"}
    mmPath := ""
    if multimodal != "" {
        if mmPath, err = hf.EnsureFile(ctx, huggingface.ModelRef{Author: author, Repo: repo}, "HEAD", multimodal); err != nil {
            return nil, err
        }
        extraArgs = append(extraArgs, "--mmproj", mmPath)
    }
    l, err := os.Create(filepath.Join(cache, "llama-server.log"))
    if err != nil {
        return nil, err
    }
    defer l.Close()
    return llamacppsrv.New(ctx, exe, modelPath, l, "", 0, extraArgs)
}
func (*Client) GenSyncRaw ¶
func (c *Client) GenSyncRaw(ctx context.Context, in *ChatRequest, out *ChatResponse) error
GenSyncRaw provides access to the raw API.
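A sketch of calling the raw chat API with a hand-built ChatRequest; the message content and sampling values are illustrative.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    in := llamacpp.ChatRequest{
        MaxTokens:   50,
        Temperature: 0.8,
        Messages: []llamacpp.Message{
            {
                Role:    "user",
                Content: llamacpp.Contents{{Type: "text", Text: "Say hello."}},
            },
        },
    }
    out := llamacpp.ChatResponse{}
    if err := c.GenSyncRaw(ctx, &in, &out); err != nil {
        log.Fatal(err)
    }
    for _, choice := range out.Choices {
        fmt.Printf("%s: %v (finish: %s)\n", choice.Message.Role, choice.Message.Content, choice.FinishReason)
    }
}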
func (*Client) GetHealthRaw ¶
func (c *Client) GetHealthRaw(ctx context.Context) (HealthResponse, error)
func (*Client) GetMetrics ¶
func (c *Client) GetMetrics(ctx context.Context, m *Metrics) error
GetMetrics retrieves the performance statistics from the server.
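A sketch of polling GetMetrics and reading the per-phase token rates; this assumes the server exposes its metrics endpoint (e.g. llama-server started with --metrics).

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    m := llamacpp.Metrics{}
    if err := c.GetMetrics(ctx, &m); err != nil {
        log.Fatal(err)
    }
    // Rate() converts each TokenPerformance into tokens per second.
    fmt.Printf("prompt: %.1f tok/s, generation: %.1f tok/s\n", m.Prompt.Rate(), m.Generated.Rate())
    fmt.Printf("kv cache usage: %v (%d tokens), processing: %d, pending: %d\n",
        m.KVCacheUsage, m.KVCacheTokens, m.RequestsProcessing, m.RequestedPending)
}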
func (*Client) HTTPClient ¶
func (c *Client) HTTPClient() *http.Client
HTTPClient returns the HTTP client to fetch results (e.g. videos) generated by the provider.
func (*Client) ListModels ¶
func (c *Client) ListModels(ctx context.Context) ([]genai.Model, error)
ListModels implements genai.Provider.
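A sketch of listing what the server reports; with a single llama-server instance this is typically the loaded GGUF file.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    models, err := c.ListModels(ctx)
    if err != nil {
        log.Fatal(err)
    }
    for _, m := range models {
        // Print each model entry reported by the server.
        fmt.Println(m)
    }
}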
func (*Client) ModelID ¶
func (c *Client) ModelID() string
ModelID implements genai.Provider.
It returns the selected model ID, or the model ID discovered from the server.
func (*Client) OutputModalities ¶
func (c *Client) OutputModalities() genai.Modalities
OutputModalities implements genai.Provider.
It returns the output modalities, i.e. what kind of output the model will generate (text, audio, image, video, etc.).
func (*Client) Scoreboard ¶
func (c *Client) Scoreboard() scoreboard.Score
Scoreboard implements genai.Provider.
type CompletionRequest ¶
type CompletionRequest struct {
// TODO: Prompt can be a string, a list of tokens or a mix.
Prompt string `json:"prompt"`
Temperature float64 `json:"temperature,omitzero"`
DynaTempRange float64 `json:"dynatemp_range,omitzero"`
DynaTempExponent float64 `json:"dynatemp_exponent,omitzero"`
TopK int64 `json:"top_k,omitzero"`
TopP float64 `json:"top_p,omitzero"`
MinP float64 `json:"min_p,omitzero"`
NPredict int64 `json:"n_predict,omitzero"` // Maximum number of tokens to predict
NIndent int64 `json:"n_indent,omitzero"`
NKeep int64 `json:"n_keep,omitzero"`
Stream bool `json:"stream"`
Stop []string `json:"stop,omitzero"`
TypicalP float64 `json:"typical_p,omitzero"`
RepeatPenalty float64 `json:"repeat_penalty,omitzero"`
RepeatLastN int64 `json:"repeat_last_n,omitzero"`
PresencePenalty float64 `json:"presence_penalty,omitzero"`
FrequencyPenalty float64 `json:"frequency_penalty,omitzero"`
DryMultiplier float64 `json:"dry_multiplier,omitzero"`
DryBase float64 `json:"dry_base,omitzero"`
DryAllowedLength int64 `json:"dry_allowed_length,omitzero"`
DryPenaltyLastN int64 `json:"dry_penalty_last_n,omitzero"`
DrySequenceBreakers []string `json:"dry_sequence_breakers,omitzero"`
XTCProbability float64 `json:"xtc_probability,omitzero"`
XTCThreshold float64 `json:"xtc_threshold,omitzero"`
Mirostat int32 `json:"mirostat,omitzero"`
MirostatTau float64 `json:"mirostat_tau,omitzero"`
MirostatEta float64 `json:"mirostat_eta,omitzero"`
Grammar string `json:"grammar,omitzero"`
JSONSchema *jsonschema.Schema `json:"json_schema,omitzero"`
Seed int64 `json:"seed,omitzero"`
IgnoreEos bool `json:"ignore_eos,omitzero"`
LogitBias []any `json:"logit_bias,omitzero"`
Nprobs int64 `json:"n_probs,omitzero"`
MinKeep int64 `json:"min_keep,omitzero"`
TMaxPredictMS int64 `json:"t_max_predict_ms,omitzero"`
ImageData []any `json:"image_data,omitzero"`
IDSlot int64 `json:"id_slot,omitzero"`
CachePrompt bool `json:"cache_prompt,omitzero"`
ReturnTokens bool `json:"return_tokens,omitzero"`
Samplers []string `json:"samplers,omitzero"`
TimingsPerToken bool `json:"timings_per_token,omitzero"`
PostSamplingProbs bool `json:"post_sampling_probs,omitzero"`
ResponseFields []string `json:"response_fields,omitzero"`
Lora []Lora `json:"lora,omitzero"`
}
CompletionRequest is documented at https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#post-completion-given-a-prompt-it-returns-the-predicted-completion
type CompletionResponse ¶
type CompletionResponse struct {
Index int64 `json:"index"`
Content string `json:"content"`
Tokens []int64 `json:"tokens"`
IDSlot int64 `json:"id_slot"`
Stop bool `json:"stop"`
Model string `json:"model"`
TokensPredicted int64 `json:"tokens_predicted"`
TokensEvaluated int64 `json:"tokens_evaluated"`
GenerationSettings struct {
NPredict int64 `json:"n_predict"`
Seed int64 `json:"seed"`
Temperature float64 `json:"temperature"`
DynaTempRange float64 `json:"dynatemp_range"`
DynaTempExponent float64 `json:"dynatemp_exponent"`
TopK int64 `json:"top_k"`
TopP float64 `json:"top_p"`
MinP float64 `json:"min_p"`
XTCProbability float64 `json:"xtc_probability"`
XTCThreshold float64 `json:"xtc_threshold"`
TypicalP float64 `json:"typical_p"`
RepeatLastN int64 `json:"repeat_last_n"`
RepeatPenalty float64 `json:"repeat_penalty"`
PresencePenalty float64 `json:"presence_penalty"`
FrequencyPenalty float64 `json:"frequency_penalty"`
DryMultiplier float64 `json:"dry_multiplier"`
DryBase float64 `json:"dry_base"`
DryAllowedLength int64 `json:"dry_allowed_length"`
DryPenaltyLastN int64 `json:"dry_penalty_last_n"`
DrySequenceBreakers []string `json:"dry_sequence_breakers"`
Mirostat int32 `json:"mirostat"`
MirostatTau float64 `json:"mirostat_tau"`
MirostatEta float64 `json:"mirostat_eta"`
Stop []string `json:"stop"`
MaxTokens int64 `json:"max_tokens"`
NKeep int64 `json:"n_keep"`
NDiscard int64 `json:"n_discard"`
IgnoreEos bool `json:"ignore_eos"`
Stream bool `json:"stream"`
LogitBias []any `json:"logit_bias"`
NProbs int64 `json:"n_probs"`
MinKeep int64 `json:"min_keep"`
Grammar string `json:"grammar"`
GrammarLazy bool `json:"grammar_lazy"`
GrammarTriggers []string `json:"grammar_triggers"`
PreservedTokens []string `json:"preserved_tokens"`
ChatFormat string `json:"chat_format"`
ReasoningFormat string `json:"reasoning_format"`
ReasoningInContent bool `json:"reasoning_in_content"`
ThinkingForcedOpen bool `json:"thinking_forced_open"`
Samplers []string `json:"samplers"`
SpeculativeNMax int64 `json:"speculative.n_max"`
SpeculativeNMin int64 `json:"speculative.n_min"`
SpeculativePMin float64 `json:"speculative.p_min"`
TimingsPerToken bool `json:"timings_per_token"`
PostSamplingProbs bool `json:"post_sampling_probs"`
Lora []Lora `json:"lora"`
TopNSigma float64 `json:"top_n_sigma"`
} `json:"generation_settings"`
Prompt string `json:"prompt"`
HasNewLine bool `json:"has_new_line"`
Truncated bool `json:"truncated"`
StopType StopType `json:"stop_type"`
StoppingWord string `json:"stopping_word"`
TokensCached int64 `json:"tokens_cached"`
Timings Timings `json:"timings"`
}
type CompletionStreamChunkResponse ¶
type CompletionStreamChunkResponse struct {
// Always
Index int64 `json:"index"`
Content string `json:"content"`
Tokens []int64 `json:"tokens"`
Stop bool `json:"stop"`
IDSlot int64 `json:"id_slot"`
TokensPredicted int64 `json:"tokens_predicted"`
TokensEvaluated int64 `json:"tokens_evaluated"`
// Last message
Model string `json:"model"`
GenerationSettings struct{} `json:"generation_settings"`
Prompt string `json:"prompt"`
HasNewLine bool `json:"has_new_line"`
Truncated bool `json:"truncated"`
StopType StopType `json:"stop_type"`
StoppingWord string `json:"stopping_word"`
TokensCached int64 `json:"tokens_cached"`
Timings Timings `json:"timings"`
}
type Content ¶
type Content struct {
Type string `json:"type"` // "text", "image_url", "input_audio"
// Type == "text"
Text string `json:"text,omitzero"`
// Type == "image_url"
ImageURL struct {
URL string `json:"url,omitzero"`
} `json:"image_url,omitzero"`
InputAudio struct {
Data []byte `json:"data,omitzero"`
Format string `json:"format,omitzero"` // "mp3", "wav"
} `json:"input_audio,omitzero"`
}
Content is not documented upstream. See how it is used in oaicompat_chat_params_parse() in https://github.com/ggml-org/llama.cpp/blob/master/tools/server/utils.hpp
type Contents ¶
type Contents []Content
func (*Contents) UnmarshalJSON ¶
UnmarshalJSON implements custom unmarshalling for the Contents type to handle cases where the content is either a string or a []Content.
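A sketch of the two JSON shapes the custom unmarshaller accepts for a message's content field; the payloads are illustrative.

package main

import (
    "encoding/json"
    "fmt"
    "log"

    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    // Content expressed as a plain string.
    var m1 llamacpp.Message
    if err := json.Unmarshal([]byte(`{"role":"assistant","content":"Hello"}`), &m1); err != nil {
        log.Fatal(err)
    }
    // Content expressed as a list of typed parts.
    var m2 llamacpp.Message
    if err := json.Unmarshal([]byte(`{"role":"user","content":[{"type":"text","text":"Hello"}]}`), &m2); err != nil {
        log.Fatal(err)
    }
    fmt.Printf("%d part(s) and %d part(s)\n", len(m1.Content), len(m2.Content))
}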
type ErrorResponse ¶
type ErrorResponse struct {
ErrorVal struct {
Code int64 `json:"code"`
Message string `json:"message"`
Type string `json:"type"`
} `json:"error"`
}
func (*ErrorResponse) Error ¶
func (er *ErrorResponse) Error() string
func (*ErrorResponse) IsAPIError ¶
func (er *ErrorResponse) IsAPIError() bool
type FinishReason ¶
type FinishReason string
const (
    FinishedStop      FinishReason = "stop"
    FinishedLength    FinishReason = "length"
    FinishedToolCalls FinishReason = "tool_calls"
)
func (FinishReason) ToFinishReason ¶
func (f FinishReason) ToFinishReason() genai.FinishReason
type HealthResponse ¶
type HealthResponse struct {
Status string `json:"status"`
SlotsIdle int64 `json:"slots_idle"`
SlotsProcessing int64 `json:"slots_processing"`
}
HealthResponse is documented at https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#get-health-returns-heath-check-result
type Logprobs ¶
type Logprobs struct {
Content []struct {
ID int64 `json:"id"`
Token string `json:"token"`
Bytes []byte `json:"bytes"`
Logprob float64 `json:"logprob"`
TopLogprobs []struct {
ID int64 `json:"id"`
Token string `json:"token"`
Bytes []byte `json:"bytes"`
Logprob float64 `json:"logprob"`
} `json:"top_logprobs"`
} `json:"content"`
}
type Message ¶
type Message struct {
Role string `json:"role"` // "system", "assistant", "user", "tool"
Content Contents `json:"content,omitzero"`
ToolCalls []ToolCall `json:"tool_calls,omitzero"`
ReasoningContent string `json:"reasoning_content,omitzero"`
Name string `json:"name,omitzero"`
ToolCallID string `json:"tool_call_id,omitzero"`
}
Message is not documented upstream. See how it is used in oaicompat_chat_params_parse() in https://github.com/ggml-org/llama.cpp/blob/master/tools/server/utils.hpp and in common_chat_msgs_parse_oaicompat() in https://github.com/ggml-org/llama.cpp/blob/master/common/chat.cpp
type Metrics ¶
type Metrics struct {
Prompt TokenPerformance
Generated TokenPerformance
KVCacheUsage float64
KVCacheTokens int
RequestsProcessing int
RequestedPending int
}
Metrics represents the metrics for the LLM server.
type Model ¶
type Model struct {
HF ModelHF
OpenAI ModelOpenAI
}
Model is a synthetic struct combining the information from both ModelHF and ModelOpenAI.
type ModelHF ¶
type ModelHF struct {
Name string `json:"name"` // Path to the file
Model string `json:"model"` // Path to the file
ModifiedAt string `json:"modified_at"` // Dummy
Size string `json:"size"` // Dummy
Digest string `json:"digest"` // Dummy
Type string `json:"type"` // "model"
Description string `json:"description"` // Dummy
Tags []string `json:"tags"` // Dummy
Capabilities []string `json:"capabilities"` // "completion" (hardcoded)
Parameters string `json:"parameters"` // Dummy
Details struct {
ParentModel string `json:"parent_model"` // Dummy
Format string `json:"format"` // "gguf" (hardcoded)
Family string `json:"family"` // Dummy
Families []string `json:"families"` // Dummy
ParameterSize string `json:"parameter_size"` // Dummy
QuantizationLevel string `json:"quantization_level"` // Dummy
} `json:"details"`
}
type ModelOpenAI ¶
type ModelOpenAI struct {
ID string `json:"id"` // Path to the file
Object string `json:"object"` // "model"
Created base.Time `json:"created"` // Dummy
OwnedBy string `json:"owned_by"` // "llamacpp"
Meta struct {
VocabType int64 `json:"vocab_type"` // 1
NVocab int64 `json:"n_vocab"`
NCtxTrain int64 `json:"n_ctx_train"`
NEmbd int64 `json:"n_embd"`
NParams int64 `json:"n_params"`
Size int64 `json:"size"`
} `json:"meta"`
}
type ModelsResponse ¶
type ModelsResponse struct {
Models []ModelHF `json:"models"`
Object string `json:"object"` // "list"
Data []ModelOpenAI `json:"data"`
}
ModelsResponse is not documented upstream. See handle_models() in https://github.com/ggml-org/llama.cpp/blob/master/tools/server/server.cpp
func (*ModelsResponse) ToModels ¶
func (m *ModelsResponse) ToModels() []genai.Model
type PromptEncoding ¶
type PromptEncoding struct {
// Prompt encoding.
BeginOfText string `yaml:"begin_of_text"`
SystemTokenStart string `yaml:"system_token_start"`
SystemTokenEnd string `yaml:"system_token_end"`
UserTokenStart string `yaml:"user_token_start"`
UserTokenEnd string `yaml:"user_token_end"`
AssistantTokenStart string `yaml:"assistant_token_start"`
AssistantTokenEnd string `yaml:"assistant_token_end"`
ToolsAvailableTokenStart string `yaml:"tools_available_token_start"`
ToolsAvailableTokenEnd string `yaml:"tools_available_token_end"`
ToolCallTokenStart string `yaml:"tool_call_token_start"`
ToolCallTokenEnd string `yaml:"tool_call_token_end"`
ToolCallResultTokenStart string `yaml:"tool_call_result_token_start"`
ToolCallResultTokenEnd string `yaml:"tool_call_result_token_end"`
// contains filtered or unexported fields
}
PromptEncoding describes how to encode the prompt.
func (*PromptEncoding) Validate ¶
func (p *PromptEncoding) Validate() error
Validate checks for obvious errors in the fields.
type StopType ¶
type StopType string
func (StopType) ToFinishReason ¶
func (s StopType) ToFinishReason() genai.FinishReason
type Timings ¶
type Timings struct {
PromptN int64 `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
PromptPerTokenMS float64 `json:"prompt_per_token_ms"`
PromptPerSecond float64 `json:"prompt_per_second"`
PredictedN int64 `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PredictedPerTokenMS float64 `json:"predicted_per_token_ms"`
PredictedPerSecond float64 `json:"predicted_per_second"`
}
type TokenPerformance ¶
TokenPerformance is the token processing performance reported in Metrics.
func (*TokenPerformance) Rate ¶
func (t *TokenPerformance) Rate() float64
Rate returns the number of tokens per second.
func (*TokenPerformance) String ¶
func (t *TokenPerformance) String() string
type Tool ¶
type Tool struct {
Type string `json:"type"` // "function"
Function struct {
Name string `json:"name"`
Description string `json:"description"`
Parameters *jsonschema.Schema `json:"parameters"`
} `json:"function"`
}
Tool is not documented upstream. It is handled entirely by the chat templates, so its effective structure varies from model to model. See https://github.com/ggml-org/llama.cpp/blob/master/common/chat.cpp
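A sketch of declaring a function tool on a ChatRequest; the tool name and description are made up, and building the *jsonschema.Schema for Parameters is omitted since it depends on the jsonschema package in use.

package main

import (
    "fmt"

    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    var t llamacpp.Tool
    t.Type = "function"
    t.Function.Name = "get_weather"
    t.Function.Description = "Get the current weather for a city."
    // t.Function.Parameters would carry a *jsonschema.Schema describing the
    // expected arguments; it is left nil in this sketch.
    req := llamacpp.ChatRequest{
        Tools:      []llamacpp.Tool{t},
        ToolChoice: "auto",
    }
    fmt.Printf("%d tool(s) declared\n", len(req.Tools))
}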
type ToolCall ¶
type ToolCall struct {
Type string `json:"type"` // "function"
Index int64 `json:"index"`
ID string `json:"id,omitzero"`
Function struct {
Name string `json:"name,omitzero"`
Arguments string `json:"arguments,omitzero"`
} `json:"function"`
}
ToolCall is not documented upstream. See how it is used in common_chat_msgs_parse_oaicompat() in https://github.com/ggml-org/llama.cpp/blob/master/common/chat.cpp
Directories ¶
| Path | Synopsis |
|---|---|
| llamacppsrv | Package llamacppsrv downloads and starts llama-server from llama.cpp, directly from GitHub releases. |