Documentation ¶
Overview ¶
Package llamacpp implements a client for the llama-server native API, not the OpenAI-compatible one.
It is described at https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#api-endpoints
The implementation is at https://github.com/ggml-org/llama.cpp/blob/master/tools/server/server.cpp
Index ¶
- func ProcessCompletionStream(chunks iter.Seq[CompletionStreamChunkResponse]) (iter.Seq[genai.Reply], func() (genai.Usage, [][]genai.Logprob, error))
- func ProcessStream(chunks iter.Seq[ChatStreamChunkResponse]) (iter.Seq[genai.Reply], func() (genai.Usage, [][]genai.Logprob, error))
- func Scoreboard() scoreboard.Score
- type ChatRequest
- type ChatResponse
- type ChatStreamChunkResponse
- type Client
- func (c *Client) Completion(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (genai.Result, error)
- func (c *Client) CompletionRaw(ctx context.Context, in *CompletionRequest, out *CompletionResponse) error
- func (c *Client) CompletionStream(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (iter.Seq[genai.Reply], func() (genai.Result, error))
- func (c *Client) CompletionStreamRaw(ctx context.Context, in *CompletionRequest) (iter.Seq[CompletionStreamChunkResponse], func() error)
- func (c *Client) GenStream(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (iter.Seq[genai.Reply], func() (genai.Result, error))
- func (c *Client) GenStreamRaw(ctx context.Context, in *ChatRequest) (iter.Seq[ChatStreamChunkResponse], func() error)
- func (c *Client) GenSync(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (genai.Result, error)
- func (c *Client) GenSyncRaw(ctx context.Context, in *ChatRequest, out *ChatResponse) error
- func (c *Client) GetHealth(ctx context.Context) (string, error)
- func (c *Client) GetHealthRaw(ctx context.Context) (HealthResponse, error)
- func (c *Client) GetMetrics(ctx context.Context, m *Metrics) error
- func (c *Client) HTTPClient() *http.Client
- func (c *Client) ListModels(ctx context.Context) ([]genai.Model, error)
- func (c *Client) ModelID() string
- func (c *Client) Name() string
- func (c *Client) OutputModalities() genai.Modalities
- func (c *Client) Ping(ctx context.Context) error
- func (c *Client) Scoreboard() scoreboard.Score
- type CompletionRequest
- type CompletionResponse
- type CompletionStreamChunkResponse
- type Content
- type Contents
- type ErrorResponse
- type FinishReason
- type HealthResponse
- type Logprobs
- type Lora
- type Message
- type Metrics
- type Model
- type ModelHF
- type ModelOpenAI
- type ModelsResponse
- type PromptEncoding
- type StopType
- type Timings
- type TokenPerformance
- type Tool
- type ToolCall
- type Usage
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ProcessCompletionStream ¶
func ProcessCompletionStream(chunks iter.Seq[CompletionStreamChunkResponse]) (iter.Seq[genai.Reply], func() (genai.Usage, [][]genai.Logprob, error))
ProcessCompletionStream converts the raw packets from the completion streaming API into Reply fragments.
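func ProcessStream ¶
func ProcessStream(chunks iter.Seq[ChatStreamChunkResponse]) (iter.Seq[genai.Reply], func() (genai.Usage, [][]genai.Logprob, error))
ProcessStream converts the raw packets from the chat completion streaming API into Reply fragments.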
Types ¶
type ChatRequest ¶
type ChatRequest struct {
Stream bool `json:"stream,omitzero"`
Model string `json:"model,omitzero"`
MaxTokens int64 `json:"max_tokens,omitzero"`
Messages []Message `json:"messages"`
ResponseFormat struct {
Type string `json:"type,omitzero"` // Default: "text"; "json_object", "json_schema"
JSONSchema struct {
Schema *jsonschema.Schema `json:"schema,omitzero"`
} `json:"json_schema,omitzero"`
} `json:"response_format,omitzero"`
Grammar string `json:"grammar,omitzero"`
TimingsPerToken bool `json:"timings_per_token,omitzero"`
Tools []Tool `json:"tools,omitzero"`
ToolChoice string `json:"tool_choice,omitzero"` // Default: "auto"; "none", "required"
Stop []string `json:"stop,omitzero"`
ParallelToolCalls bool `json:"parallel_tool_calls,omitzero"`
AddGenerationPrompt bool `json:"add_generation_prompt,omitzero"`
// ReasoningFormat struct{} `json:"reasoning_format,omitzero"`
// EnableThinking bool `json:"enable_thinking,omitzero"`
ChatTemplateKWArgs map[string]string `json:"chat_template_kwargs,omitzero"`
N int64 `json:"n,omitzero"` // Must be 1 anyway.
Logprobs bool `json:"logprobs,omitzero"`
TopLogprobs int64 `json:"top_logprobs,omitzero"` // Requires Logprobs:true
// Prompt string `json:"prompt"`
Temperature float64 `json:"temperature,omitzero"`
DynaTempRange float64 `json:"dynatemp_range,omitzero"`
DynaTempExponent float64 `json:"dynatemp_exponent,omitzero"`
TopK int64 `json:"top_k,omitzero"`
TopP float64 `json:"top_p,omitzero"`
MinP float64 `json:"min_p,omitzero"`
NPredict int64 `json:"n_predict,omitzero"` // Maximum number of tokens to predict
NIndent int64 `json:"n_indent,omitzero"`
NKeep int64 `json:"n_keep,omitzero"`
TypicalP float64 `json:"typical_p,omitzero"`
RepeatPenalty float64 `json:"repeat_penalty,omitzero"`
RepeatLastN int64 `json:"repeat_last_n,omitzero"`
PresencePenalty float64 `json:"presence_penalty,omitzero"`
FrequencyPenalty float64 `json:"frequency_penalty,omitzero"`
DryMultiplier float64 `json:"dry_multiplier,omitzero"`
DryBase float64 `json:"dry_base,omitzero"`
DryAllowedLength int64 `json:"dry_allowed_length,omitzero"`
DryPenaltyLastN int64 `json:"dry_penalty_last_n,omitzero"`
DrySequenceBreakers []string `json:"dry_sequence_breakers,omitzero"`
XTCProbability float64 `json:"xtc_probability,omitzero"`
XTCThreshold float64 `json:"xtc_threshold,omitzero"`
Mirostat int32 `json:"mirostat,omitzero"`
MirostatTau float64 `json:"mirostat_tau,omitzero"`
MirostatEta float64 `json:"mirostat_eta,omitzero"`
Seed int64 `json:"seed,omitzero"`
IgnoreEos bool `json:"ignore_eos,omitzero"`
LogitBias []any `json:"logit_bias,omitzero"`
Nprobs int64 `json:"n_probs,omitzero"`
MinKeep int64 `json:"min_keep,omitzero"`
TMaxPredictMS int64 `json:"t_max_predict_ms,omitzero"`
ImageData []any `json:"image_data,omitzero"`
IDSlot int64 `json:"id_slot,omitzero"`
CachePrompt bool `json:"cache_prompt,omitzero"`
ReturnTokens bool `json:"return_tokens,omitzero"`
Samplers []string `json:"samplers,omitzero"`
PostSamplingProbs bool `json:"post_sampling_probs,omitzero"`
ResponseFields []string `json:"response_fields,omitzero"`
Lora []Lora `json:"lora,omitzero"`
}
ChatRequest is not documented upstream. See how it is used in oaicompat_chat_params_parse() in https://github.com/ggml-org/llama.cpp/blob/master/tools/server/utils.hpp
func (*ChatRequest) Init ¶
Init initializes the provider-specific completion request with the generic completion request.
func (*ChatRequest) SetStream ¶
func (c *ChatRequest) SetStream(stream bool)
type ChatResponse ¶
type ChatResponse struct {
Created base.Time `json:"created"`
SystemFingerprint string `json:"system_fingerprint"`
Object string `json:"object"` // "chat.completion"
ID string `json:"id"`
Timings Timings `json:"timings"`
Usage Usage `json:"usage"`
Choices []struct {
FinishReason FinishReason `json:"finish_reason"`
Index int64 `json:"index"`
Message Message `json:"message"`
Logprobs Logprobs `json:"logprobs"`
} `json:"choices"`
Model string `json:"model"` // "gpt-3.5-turbo"
}
type ChatStreamChunkResponse ¶
type ChatStreamChunkResponse struct {
Created base.Time `json:"created"`
ID string `json:"id"`
Model string `json:"model"` // "gpt-3.5-turbo"
SystemFingerprint string `json:"system_fingerprint"`
Object string `json:"object"` // "chat.completion.chunk"
Choices []struct {
FinishReason FinishReason `json:"finish_reason"`
Index int64 `json:"index"`
Delta struct {
Role string `json:"role"`
Content string `json:"content"`
ToolCalls []ToolCall `json:"tool_calls"`
} `json:"delta"`
Logprobs Logprobs `json:"logprobs"`
} `json:"choices"`
Usage Usage `json:"usage"`
Timings Timings `json:"timings"`
}
type Client ¶
type Client struct {
base.NotImplemented
// contains filtered or unexported fields
}
Client implements genai.Provider.
func New ¶
func New(ctx context.Context, opts *genai.ProviderOptions, wrapper func(http.RoundTripper) http.RoundTripper) (*Client, error)
New creates a new client to talk to a llama-server instance.
opts.Remote defaults to "http://localhost:8080".
Automatic model selection via ModelCheap, ModelGood, or ModelSOTA is not supported; instead, the client asks llama-server which model is already loaded.
wrapper optionally wraps the HTTP transport. This is useful for HTTP recording and playback, for tweaking HTTP retries, or for throttling outgoing requests.
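A minimal sketch of connecting to an already running llama-server instance; the URL is illustrative and error handling is kept terse.

package main

import (
    "context"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    // Connect to a llama-server instance that is already listening locally.
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    // ModelID reports the model that llama-server discovered as loaded.
    log.Printf("Connected to %s serving %q", c.Name(), c.ModelID())
}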
func (*Client) Completion ¶
func (c *Client) Completion(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (genai.Result, error)
func (*Client) CompletionRaw ¶
func (c *Client) CompletionRaw(ctx context.Context, in *CompletionRequest, out *CompletionResponse) error
func (*Client) CompletionStream ¶
func (c *Client) CompletionStream(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (iter.Seq[genai.Reply], func() (genai.Result, error))
func (*Client) CompletionStreamRaw ¶
func (c *Client) CompletionStreamRaw(ctx context.Context, in *CompletionRequest) (iter.Seq[CompletionStreamChunkResponse], func() error)
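A sketch showing how CompletionStreamRaw pairs with ProcessCompletionStream to consume the native completion endpoint; the client construction, prompt, and n_predict value are illustrative.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    in := &llamacpp.CompletionRequest{
        Prompt:   "Once upon a time",
        NPredict: 32, // Maximum number of tokens to predict.
    }
    chunks, finishRaw := c.CompletionStreamRaw(ctx, in)
    // ProcessCompletionStream turns the raw chunks into genai.Reply fragments.
    replies, finish := llamacpp.ProcessCompletionStream(chunks)
    for r := range replies {
        fmt.Printf("%v", r)
    }
    // Check both the decoding side and the transport side once iteration ends.
    usage, _, err := finish()
    if err != nil {
        log.Fatal(err)
    }
    if err := finishRaw(); err != nil {
        log.Fatal(err)
    }
    fmt.Printf("\nusage: %+v\n", usage)
}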
func (*Client) GenStream ¶
func (c *Client) GenStream(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (iter.Seq[genai.Reply], func() (genai.Result, error))
GenStream implements genai.Provider.
func (*Client) GenStreamRaw ¶
func (c *Client) GenStreamRaw(ctx context.Context, in *ChatRequest) (iter.Seq[ChatStreamChunkResponse], func() error)
GenStreamRaw provides access to the raw API.
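A sketch of streaming with the high-level GenStream; the prompt and options are illustrative.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    msgs := genai.Messages{
        genai.NewTextMessage("Tell me a one sentence story."),
    }
    opts := genai.OptionsText{MaxTokens: 100}
    replies, finish := c.GenStream(ctx, msgs, &opts)
    for r := range replies {
        // Print each reply fragment as it arrives.
        fmt.Printf("%v", r)
    }
    res, err := finish()
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("\nFull response: %s\n", res.String())
}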
func (*Client) GenSync ¶
func (c *Client) GenSync(ctx context.Context, msgs genai.Messages, opts ...genai.Options) (genai.Result, error)
GenSync implements genai.Provider.
Example ¶
package main

import (
    "context"
    "fmt"
    "log"
    "os"
    "path/filepath"
    "strings"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
    "github.com/maruel/genai/providers/llamacpp/llamacppsrv"
    "github.com/maruel/huggingface"
)

func main() {
    // Download and start the server.
    ctx := context.Background()
    // Start a server with a minimalist model: Qwen2 0.5B in Q2_K quantization.
    srv, err := startServer(ctx, "Qwen", "Qwen2-0.5B-Instruct-GGUF", "qwen2-0_5b-instruct-q2_k.gguf", "")
    if err != nil {
        log.Print(err)
        return
    }
    defer srv.Close()

    // Connect the provider.
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: srv.URL(), Model: genai.ModelNone}, nil)
    if err != nil {
        log.Print(err)
        return
    }
    msgs := genai.Messages{
        genai.NewTextMessage("Say hello. Reply with only one word."),
    }
    opts := genai.OptionsText{
        Seed:        1,
        Temperature: 0.01,
        MaxTokens:   50,
    }
    resp, err := c.GenSync(ctx, msgs, &opts)
    if err != nil {
        log.Print(err)
        return
    }
    log.Printf("Raw response: %#v", resp)
    // Normalize some of the variance. Obviously many models will still fail this test.
    fmt.Printf("Response: %s\n", strings.TrimRight(strings.TrimSpace(strings.ToLower(resp.String())), ".!"))
    // Disabled because it's slow in CI, especially on Windows.
    // // Output: Response: hello
}

// startServer starts a server.
func startServer(ctx context.Context, author, repo, modelfile, multimodal string) (*llamacppsrv.Server, error) {
    cache, err := filepath.Abs("testdata/tmp")
    if err != nil {
        return nil, err
    }
    if err = os.MkdirAll(cache, 0o755); err != nil {
        return nil, err
    }
    // It's a bit inefficient to download from github every single time.
    exe, err := llamacppsrv.DownloadRelease(ctx, cache, llamacppsrv.BuildNumber)
    if err != nil {
        return nil, err
    }
    // llama.cpp now knows how to pull from huggingface but this was not integrated yet, so pull a model
    // manually.
    hf, err := huggingface.New("")
    if err != nil {
        return nil, err
    }
    modelPath, err := hf.EnsureFile(ctx, huggingface.ModelRef{Author: author, Repo: repo}, "HEAD", modelfile)
    if err != nil {
        return nil, err
    }
    extraArgs := []string{"--no-warmup", "--jinja", "--flash-attn", "--cache-type-k", "q8_0", "--cache-type-v", "q8_0"}
    mmPath := ""
    if multimodal != "" {
        if mmPath, err = hf.EnsureFile(ctx, huggingface.ModelRef{Author: author, Repo: repo}, "HEAD", multimodal); err != nil {
            return nil, err
        }
        extraArgs = append(extraArgs, "--mmproj", mmPath)
    }
    l, err := os.Create(filepath.Join(cache, "llama-server.log"))
    if err != nil {
        return nil, err
    }
    defer l.Close()
    return llamacppsrv.New(ctx, exe, modelPath, l, "", 0, extraArgs)
}
func (*Client) GenSyncRaw ¶
func (c *Client) GenSyncRaw(ctx context.Context, in *ChatRequest, out *ChatResponse) error
GenSyncRaw provides access to the raw API.
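A sketch of calling the raw chat API with a hand-built ChatRequest; the message content and sampling values are illustrative.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    in := llamacpp.ChatRequest{
        MaxTokens:   50,
        Temperature: 0.8,
        Messages: []llamacpp.Message{
            {
                Role:    "user",
                Content: llamacpp.Contents{{Type: "text", Text: "Say hello."}},
            },
        },
    }
    out := llamacpp.ChatResponse{}
    if err := c.GenSyncRaw(ctx, &in, &out); err != nil {
        log.Fatal(err)
    }
    for _, choice := range out.Choices {
        fmt.Printf("%s: %v (finish: %s)\n", choice.Message.Role, choice.Message.Content, choice.FinishReason)
    }
}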
func (*Client) GetHealthRaw ¶
func (c *Client) GetHealthRaw(ctx context.Context) (HealthResponse, error)
func (*Client) GetMetrics ¶
func (c *Client) GetMetrics(ctx context.Context, m *Metrics) error
GetMetrics retrieves the performance statistics from the server.
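A sketch of polling GetMetrics and reading the per-phase token rates; this assumes the server exposes its metrics endpoint (e.g. llama-server started with --metrics).

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    m := llamacpp.Metrics{}
    if err := c.GetMetrics(ctx, &m); err != nil {
        log.Fatal(err)
    }
    // Rate() converts each TokenPerformance into tokens per second.
    fmt.Printf("prompt: %.1f tok/s, generation: %.1f tok/s\n", m.Prompt.Rate(), m.Generated.Rate())
    fmt.Printf("kv cache usage: %v (%d tokens), processing: %d, pending: %d\n",
        m.KVCacheUsage, m.KVCacheTokens, m.RequestsProcessing, m.RequestedPending)
}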
func (*Client) HTTPClient ¶
func (c *Client) HTTPClient() *http.Client
HTTPClient returns the HTTP client to fetch results (e.g. videos) generated by the provider.
func (*Client) ListModels ¶
func (c *Client) ListModels(ctx context.Context) ([]genai.Model, error)
ListModels implements genai.Provider.
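A sketch of listing what the server reports; with a single llama-server instance this is typically the loaded GGUF file.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/maruel/genai"
    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    ctx := context.Background()
    c, err := llamacpp.New(ctx, &genai.ProviderOptions{Remote: "http://localhost:8080", Model: genai.ModelNone}, nil)
    if err != nil {
        log.Fatal(err)
    }
    models, err := c.ListModels(ctx)
    if err != nil {
        log.Fatal(err)
    }
    for _, m := range models {
        // Print each model entry reported by the server.
        fmt.Println(m)
    }
}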
func (*Client) ModelID ¶
func (c *Client) ModelID() string
ModelID implements genai.Provider.
It returns the selected model ID, or the model ID discovered from the server.
func (*Client) OutputModalities ¶
func (c *Client) OutputModalities() genai.Modalities
OutputModalities implements genai.Provider.
It returns the output modalities, i.e. what kind of output the model will generate (text, audio, image, video, etc.).
func (*Client) Scoreboard ¶
func (c *Client) Scoreboard() scoreboard.Score
Scoreboard implements genai.Provider.
type CompletionRequest ¶
type CompletionRequest struct {
// TODO: Prompt can be a string, a list of tokens or a mix.
Prompt string `json:"prompt"`
Temperature float64 `json:"temperature,omitzero"`
DynaTempRange float64 `json:"dynatemp_range,omitzero"`
DynaTempExponent float64 `json:"dynatemp_exponent,omitzero"`
TopK int64 `json:"top_k,omitzero"`
TopP float64 `json:"top_p,omitzero"`
MinP float64 `json:"min_p,omitzero"`
NPredict int64 `json:"n_predict,omitzero"` // Maximum number of tokens to predict
NIndent int64 `json:"n_indent,omitzero"`
NKeep int64 `json:"n_keep,omitzero"`
Stream bool `json:"stream"`
Stop []string `json:"stop,omitzero"`
TypicalP float64 `json:"typical_p,omitzero"`
RepeatPenalty float64 `json:"repeat_penalty,omitzero"`
RepeatLastN int64 `json:"repeat_last_n,omitzero"`
PresencePenalty float64 `json:"presence_penalty,omitzero"`
FrequencyPenalty float64 `json:"frequency_penalty,omitzero"`
DryMultiplier float64 `json:"dry_multiplier,omitzero"`
DryBase float64 `json:"dry_base,omitzero"`
DryAllowedLength int64 `json:"dry_allowed_length,omitzero"`
DryPenaltyLastN int64 `json:"dry_penalty_last_n,omitzero"`
DrySequenceBreakers []string `json:"dry_sequence_breakers,omitzero"`
XTCProbability float64 `json:"xtc_probability,omitzero"`
XTCThreshold float64 `json:"xtc_threshold,omitzero"`
Mirostat int32 `json:"mirostat,omitzero"`
MirostatTau float64 `json:"mirostat_tau,omitzero"`
MirostatEta float64 `json:"mirostat_eta,omitzero"`
Grammar string `json:"grammar,omitzero"`
JSONSchema *jsonschema.Schema `json:"json_schema,omitzero"`
Seed int64 `json:"seed,omitzero"`
IgnoreEos bool `json:"ignore_eos,omitzero"`
LogitBias []any `json:"logit_bias,omitzero"`
Nprobs int64 `json:"n_probs,omitzero"`
MinKeep int64 `json:"min_keep,omitzero"`
TMaxPredictMS int64 `json:"t_max_predict_ms,omitzero"`
ImageData []any `json:"image_data,omitzero"`
IDSlot int64 `json:"id_slot,omitzero"`
CachePrompt bool `json:"cache_prompt,omitzero"`
ReturnTokens bool `json:"return_tokens,omitzero"`
Samplers []string `json:"samplers,omitzero"`
TimingsPerToken bool `json:"timings_per_token,omitzero"`
PostSamplingProbs bool `json:"post_sampling_probs,omitzero"`
ResponseFields []string `json:"response_fields,omitzero"`
Lora []Lora `json:"lora,omitzero"`
}
CompletionRequest is documented at https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#post-completion-given-a-prompt-it-returns-the-predicted-completion
type CompletionResponse ¶
type CompletionResponse struct {
Index int64 `json:"index"`
Content string `json:"content"`
Tokens []int64 `json:"tokens"`
IDSlot int64 `json:"id_slot"`
Stop bool `json:"stop"`
Model string `json:"model"`
TokensPredicted int64 `json:"tokens_predicted"`
TokensEvaluated int64 `json:"tokens_evaluated"`
GenerationSettings struct {
NPredict int64 `json:"n_predict"`
Seed int64 `json:"seed"`
Temperature float64 `json:"temperature"`
DynaTempRange float64 `json:"dynatemp_range"`
DynaTempExponent float64 `json:"dynatemp_exponent"`
TopK int64 `json:"top_k"`
TopP float64 `json:"top_p"`
MinP float64 `json:"min_p"`
XTCProbability float64 `json:"xtc_probability"`
XTCThreshold float64 `json:"xtc_threshold"`
TypicalP float64 `json:"typical_p"`
RepeatLastN int64 `json:"repeat_last_n"`
RepeatPenalty float64 `json:"repeat_penalty"`
PresencePenalty float64 `json:"presence_penalty"`
FrequencyPenalty float64 `json:"frequency_penalty"`
DryMultiplier float64 `json:"dry_multiplier"`
DryBase float64 `json:"dry_base"`
DryAllowedLength int64 `json:"dry_allowed_length"`
DryPenaltyLastN int64 `json:"dry_penalty_last_n"`
DrySequenceBreakers []string `json:"dry_sequence_breakers"`
Mirostat int32 `json:"mirostat"`
MirostatTau float64 `json:"mirostat_tau"`
MirostatEta float64 `json:"mirostat_eta"`
Stop []string `json:"stop"`
MaxTokens int64 `json:"max_tokens"`
NKeep int64 `json:"n_keep"`
NDiscard int64 `json:"n_discard"`
IgnoreEos bool `json:"ignore_eos"`
Stream bool `json:"stream"`
LogitBias []any `json:"logit_bias"`
NProbs int64 `json:"n_probs"`
MinKeep int64 `json:"min_keep"`
Grammar string `json:"grammar"`
GrammarLazy bool `json:"grammar_lazy"`
GrammarTriggers []string `json:"grammar_triggers"`
PreservedTokens []string `json:"preserved_tokens"`
ChatFormat string `json:"chat_format"`
ReasoningFormat string `json:"reasoning_format"`
ReasoningInContent bool `json:"reasoning_in_content"`
ThinkingForcedOpen bool `json:"thinking_forced_open"`
Samplers []string `json:"samplers"`
SpeculativeNMax int64 `json:"speculative.n_max"`
SpeculativeNMin int64 `json:"speculative.n_min"`
SpeculativePMin float64 `json:"speculative.p_min"`
TimingsPerToken bool `json:"timings_per_token"`
PostSamplingProbs bool `json:"post_sampling_probs"`
Lora []Lora `json:"lora"`
TopNSigma float64 `json:"top_n_sigma"`
} `json:"generation_settings"`
Prompt string `json:"prompt"`
HasNewLine bool `json:"has_new_line"`
Truncated bool `json:"truncated"`
StopType StopType `json:"stop_type"`
StoppingWord string `json:"stopping_word"`
TokensCached int64 `json:"tokens_cached"`
Timings Timings `json:"timings"`
}
type CompletionStreamChunkResponse ¶
type CompletionStreamChunkResponse struct {
// Always
Index int64 `json:"index"`
Content string `json:"content"`
Tokens []int64 `json:"tokens"`
Stop bool `json:"stop"`
IDSlot int64 `json:"id_slot"`
TokensPredicted int64 `json:"tokens_predicted"`
TokensEvaluated int64 `json:"tokens_evaluated"`
// Last message
Model string `json:"model"`
GenerationSettings struct{} `json:"generation_settings"`
Prompt string `json:"prompt"`
HasNewLine bool `json:"has_new_line"`
Truncated bool `json:"truncated"`
StopType StopType `json:"stop_type"`
StoppingWord string `json:"stopping_word"`
TokensCached int64 `json:"tokens_cached"`
Timings Timings `json:"timings"`
}
type Content ¶
type Content struct {
Type string `json:"type"` // "text", "image_url", "input_audio"
// Type == "text"
Text string `json:"text,omitzero"`
// Type == "image_url"
ImageURL struct {
URL string `json:"url,omitzero"`
} `json:"image_url,omitzero"`
InputAudio struct {
Data []byte `json:"data,omitzero"`
Format string `json:"format,omitzero"` // "mp3", "wav"
} `json:"input_audio,omitzero"`
}
Content is not documented upstream. See how it is used in oaicompat_chat_params_parse() in https://github.com/ggml-org/llama.cpp/blob/master/tools/server/utils.hpp
type Contents ¶
type Contents []Content
func (*Contents) UnmarshalJSON ¶
UnmarshalJSON implements custom unmarshalling for the Contents type to handle cases where the content is either a string or a []Content.
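A sketch of the two JSON shapes the custom unmarshaller accepts for a message's content field; the payloads are illustrative.

package main

import (
    "encoding/json"
    "fmt"
    "log"

    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    // Content expressed as a plain string.
    var m1 llamacpp.Message
    if err := json.Unmarshal([]byte(`{"role":"assistant","content":"Hello"}`), &m1); err != nil {
        log.Fatal(err)
    }
    // Content expressed as a list of typed parts.
    var m2 llamacpp.Message
    if err := json.Unmarshal([]byte(`{"role":"user","content":[{"type":"text","text":"Hello"}]}`), &m2); err != nil {
        log.Fatal(err)
    }
    fmt.Printf("%d part(s) and %d part(s)\n", len(m1.Content), len(m2.Content))
}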
type ErrorResponse ¶
type ErrorResponse struct {
ErrorVal struct {
Code int64 `json:"code"`
Message string `json:"message"`
Type string `json:"type"`
} `json:"error"`
}
func (*ErrorResponse) Error ¶
func (er *ErrorResponse) Error() string
func (*ErrorResponse) IsAPIError ¶
func (er *ErrorResponse) IsAPIError() bool
type FinishReason ¶
type FinishReason string
const (
    FinishedStop      FinishReason = "stop"
    FinishedLength    FinishReason = "length"
    FinishedToolCalls FinishReason = "tool_calls"
)
func (FinishReason) ToFinishReason ¶
func (f FinishReason) ToFinishReason() genai.FinishReason
type HealthResponse ¶
type HealthResponse struct {
Status string `json:"status"`
SlotsIdle int64 `json:"slots_idle"`
SlotsProcessing int64 `json:"slots_processing"`
}
HealthResponse is documented at https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#get-health-returns-heath-check-result
type Logprobs ¶
type Logprobs struct {
Content []struct {
ID int64 `json:"id"`
Token string `json:"token"`
Bytes []byte `json:"bytes"`
Logprob float64 `json:"logprob"`
TopLogprobs []struct {
ID int64 `json:"id"`
Token string `json:"token"`
Bytes []byte `json:"bytes"`
Logprob float64 `json:"logprob"`
} `json:"top_logprobs"`
} `json:"content"`
}
type Message ¶
type Message struct {
Role string `json:"role"` // "system", "assistant", "user", "tool"
Content Contents `json:"content,omitzero"`
ToolCalls []ToolCall `json:"tool_calls,omitzero"`
ReasoningContent string `json:"reasoning_content,omitzero"`
Name string `json:"name,omitzero"`
ToolCallID string `json:"tool_call_id,omitzero"`
}
Message is not documented upstream. See how it is used in oaicompat_chat_params_parse() in https://github.com/ggml-org/llama.cpp/blob/master/tools/server/utils.hpp and in common_chat_msgs_parse_oaicompat() in https://github.com/ggml-org/llama.cpp/blob/master/common/chat.cpp
type Metrics ¶
type Metrics struct {
Prompt TokenPerformance
Generated TokenPerformance
KVCacheUsage float64
KVCacheTokens int
RequestsProcessing int
RequestedPending int
}
Metrics represents the metrics for the LLM server.
type Model ¶
type Model struct {
HF ModelHF
OpenAI ModelOpenAI
}
Model is a synthetic struct combining the information from both ModelHF and ModelOpenAI.
type ModelHF ¶
type ModelHF struct {
Name string `json:"name"` // Path to the file
Model string `json:"model"` // Path to the file
ModifiedAt string `json:"modified_at"` // Dummy
Size string `json:"size"` // Dummy
Digest string `json:"digest"` // Dummy
Type string `json:"type"` // "model"
Description string `json:"description"` // Dummy
Tags []string `json:"tags"` // Dummy
Capabilities []string `json:"capabilities"` // "completion" (hardcoded)
Parameters string `json:"parameters"` // Dummy
Details struct {
ParentModel string `json:"parent_model"` // Dummy
Format string `json:"format"` // "gguf" (hardcoded)
Family string `json:"family"` // Dummy
Families []string `json:"families"` // Dummy
ParameterSize string `json:"parameter_size"` // Dummy
QuantizationLevel string `json:"quantization_level"` // Dummy
} `json:"details"`
}
type ModelOpenAI ¶
type ModelOpenAI struct {
ID string `json:"id"` // Path to the file
Object string `json:"object"` // "model"
Created base.Time `json:"created"` // Dummy
OwnedBy string `json:"owned_by"` // "llamacpp"
Meta struct {
VocabType int64 `json:"vocab_type"` // 1
NVocab int64 `json:"n_vocab"`
NCtxTrain int64 `json:"n_ctx_train"`
NEmbd int64 `json:"n_embd"`
NParams int64 `json:"n_params"`
Size int64 `json:"size"`
} `json:"meta"`
}
type ModelsResponse ¶
type ModelsResponse struct {
Models []ModelHF `json:"models"`
Object string `json:"object"` // "list"
Data []ModelOpenAI `json:"data"`
}
ModelsResponse is not documented upstream. See handle_models() in https://github.com/ggml-org/llama.cpp/blob/master/tools/server/server.cpp
func (*ModelsResponse) ToModels ¶
func (m *ModelsResponse) ToModels() []genai.Model
type PromptEncoding ¶
type PromptEncoding struct {
// Prompt encoding.
BeginOfText string `yaml:"begin_of_text"`
SystemTokenStart string `yaml:"system_token_start"`
SystemTokenEnd string `yaml:"system_token_end"`
UserTokenStart string `yaml:"user_token_start"`
UserTokenEnd string `yaml:"user_token_end"`
AssistantTokenStart string `yaml:"assistant_token_start"`
AssistantTokenEnd string `yaml:"assistant_token_end"`
ToolsAvailableTokenStart string `yaml:"tools_available_token_start"`
ToolsAvailableTokenEnd string `yaml:"tools_available_token_end"`
ToolCallTokenStart string `yaml:"tool_call_token_start"`
ToolCallTokenEnd string `yaml:"tool_call_token_end"`
ToolCallResultTokenStart string `yaml:"tool_call_result_token_start"`
ToolCallResultTokenEnd string `yaml:"tool_call_result_token_end"`
// contains filtered or unexported fields
}
PromptEncoding describes how to encode the prompt.
func (*PromptEncoding) Validate ¶
func (p *PromptEncoding) Validate() error
Validate checks for obvious errors in the fields.
type StopType ¶
type StopType string
func (StopType) ToFinishReason ¶
func (s StopType) ToFinishReason() genai.FinishReason
type Timings ¶
type Timings struct {
PromptN int64 `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
PromptPerTokenMS float64 `json:"prompt_per_token_ms"`
PromptPerSecond float64 `json:"prompt_per_second"`
PredictedN int64 `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PredictedPerTokenMS float64 `json:"predicted_per_token_ms"`
PredictedPerSecond float64 `json:"predicted_per_second"`
}
type TokenPerformance ¶
TokenPerformance is the token processing performance reported in Metrics.
func (*TokenPerformance) Rate ¶
func (t *TokenPerformance) Rate() float64
Rate returns the number of tokens per second.
func (*TokenPerformance) String ¶
func (t *TokenPerformance) String() string
type Tool ¶
type Tool struct {
Type string `json:"type"` // "function"
Function struct {
Name string `json:"name"`
Description string `json:"description"`
Parameters *jsonschema.Schema `json:"parameters"`
} `json:"function"`
}
Tool is not documented upstream. It is handled entirely by the chat templates, so its effective structure varies from model to model. See https://github.com/ggml-org/llama.cpp/blob/master/common/chat.cpp
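A sketch of declaring a function tool on a ChatRequest; the tool name and description are made up, and building the *jsonschema.Schema for Parameters is omitted since it depends on the jsonschema package in use.

package main

import (
    "fmt"

    "github.com/maruel/genai/providers/llamacpp"
)

func main() {
    var t llamacpp.Tool
    t.Type = "function"
    t.Function.Name = "get_weather"
    t.Function.Description = "Get the current weather for a city."
    // t.Function.Parameters would carry a *jsonschema.Schema describing the
    // expected arguments; it is left nil in this sketch.
    req := llamacpp.ChatRequest{
        Tools:      []llamacpp.Tool{t},
        ToolChoice: "auto",
    }
    fmt.Printf("%d tool(s) declared\n", len(req.Tools))
}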
type ToolCall ¶
type ToolCall struct {
Type string `json:"type"` // "function"
Index int64 `json:"index"`
ID string `json:"id,omitzero"`
Function struct {
Name string `json:"name,omitzero"`
Arguments string `json:"arguments,omitzero"`
} `json:"function"`
}
ToolCall is not documented upstream. See how it is used in common_chat_msgs_parse_oaicompat() in https://github.com/ggml-org/llama.cpp/blob/master/common/chat.cpp
Directories ¶
| Path | Synopsis |
|---|---|
| llamacppsrv | Package llamacppsrv downloads and starts llama-server from llama.cpp, directly from GitHub releases. |