largedataset

package
v0.0.0-...-f2a1cc4 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 27, 2025 License: MIT Imports: 9 Imported by: 0

Documentation

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func PrintSchemaAsCode

func PrintSchemaAsCode(schema *CSVSchema, schemaName string) string

PrintSchemaAsCode generates Go code for the inferred schema

Types

type Aggregator

type Aggregator interface {
	Consume(row *LogicalRow)
	Report(w io.Writer)
}

type AmountStats

type AmountStats struct {
	Count int64
	Sum   float64
	Min   float64
	Max   float64
}

AmountStats holds simple streaming statistics for a numeric field.

func NewAmountStats

func NewAmountStats() *AmountStats

NewAmountStats creates a new AmountStats with proper initial values.

func (*AmountStats) Add

func (s *AmountStats) Add(row *LogicalRow)

Add updates the stats with a new logical row.

func (*AmountStats) Average

func (s *AmountStats) Average() float64

Average returns the average amount or NaN if there is no data.

func (*AmountStats) HasData

func (s *AmountStats) HasData() bool

HasData returns true if at least one value has been added.

type CSVSchema

type CSVSchema struct {
	Columns       []ColumnDef
	MinColumns    int  // Minimum number of columns required
	StrictColumns bool // If true, reject rows with extra columns
}

CSVSchema represents the complete schema for a CSV file

Example (Custom)

Example: Creating a custom CSV schema for user data

minAge := 18.0
maxAge := 120.0

userSchema := &CSVSchema{
	MinColumns:    5,
	StrictColumns: true,
	Columns: []ColumnDef{
		{
			Index:    0,
			Name:     "UserID",
			Type:     TypeInt,
			Required: true,
			Min:      &minAge, // reuses minAge as the lower bound, so UserID must be >= 18
		},
		{
			Index:     1,
			Name:      "Email",
			Type:      TypeEmail,
			Required:  true,
			MaxLength: 100,
		},
		{
			Index:    2,
			Name:     "Age",
			Type:     TypeInt,
			Required: true,
			Min:      &minAge,
			Max:      &maxAge,
		},
		{
			Index:       3,
			Name:        "Status",
			Type:        TypeString,
			Required:    true,
			AllowedVals: []string{"active", "inactive", "pending"},
		},
		{
			Index:      4,
			Name:       "JoinDate",
			Type:       TypeDate,
			Required:   true,
			DateFormat: "2006-01-02",
		},
	},
}

// Validate a record
validRecord := []string{"1001", "user@example.com", "25", "active", "2023-01-15"}
if err := userSchema.ValidateRecord(validRecord); err != nil {
	panic(err)
}

// This will fail validation
invalidRecord := []string{"1002", "invalid-email", "15", "unknown", "2023-01-15"}
_ = userSchema.ValidateRecord(invalidRecord) // Returns error

func InferSchemaFromCSV

func InferSchemaFromCSV(reader io.Reader, sep rune, hasHeader bool, config *SchemaInferenceConfig) (*CSVSchema, error)

InferSchemaFromCSV analyzes a CSV file and infers its schema

func NewStockDataSchema

func NewStockDataSchema() *CSVSchema

NewStockDataSchema creates a schema for the generated stock market CSV

func (*CSVSchema) ValidateRecord

func (s *CSVSchema) ValidateRecord(record []string) error

ValidateRecord validates a CSV record against the schema

type ColumnDef

type ColumnDef struct {
	Index       int        // Column index (0-based)
	Name        string     // Column name (for documentation/errors)
	Type        ColumnType // Expected data type
	Required    bool       // Whether the column must be non-empty
	MinLength   int        // Minimum string length (for TypeString)
	MaxLength   int        // Maximum string length (for TypeString)
	Min         *float64   // Minimum value (for TypeInt/TypeFloat)
	Max         *float64   // Maximum value (for TypeInt/TypeFloat)
	Pattern     string     // Regex pattern (for TypeRegex)
	DateFormat  string     // Date format (for TypeDate/TypeDateTime)
	AllowedVals []string   // Whitelist of allowed values
}

ColumnDef defines the schema for a single CSV column

type ColumnStats

type ColumnStats struct {
	Index          int
	Name           string
	TotalValues    int
	EmptyValues    int
	UniqueValues   map[string]bool
	NumericValues  int
	IntegerValues  int
	FloatValues    int
	BoolValues     int
	DateValues     int
	DateTimeValues int
	EmailValues    int
	MinNumeric     float64
	MaxNumeric     float64
	MinLength      int
	MaxLength      int
	SampleValues   []string // Keep first few samples
}

ColumnStats holds statistics gathered during schema inference

type ColumnType

type ColumnType int

ColumnType represents the expected data type of a CSV column

const (
	TypeString ColumnType = iota
	TypeInt
	TypeFloat
	TypeBool
	TypeDate
	TypeDateTime
	TypeEmail
	TypeRegex
)

type CompositeAggregator

type CompositeAggregator struct {
	// contains filtered or unexported fields
}

func NewCompositeAggregator

func NewCompositeAggregator(aggs ...Aggregator) *CompositeAggregator

func (*CompositeAggregator) Consume

func (c *CompositeAggregator) Consume(row *LogicalRow)

func (*CompositeAggregator) Report

func (c *CompositeAggregator) Report(w io.Writer)

type DebugAggregator

type DebugAggregator struct {
	// contains filtered or unexported fields
}

func NewDebugAggregator

func NewDebugAggregator(maxRows int) *DebugAggregator

func (*DebugAggregator) Consume

func (g *DebugAggregator) Consume(row *LogicalRow)

func (*DebugAggregator) Report

func (g *DebugAggregator) Report(w io.Writer)

type Filter

type Filter struct {
	ColumnIndex int            // Column index to filter on
	ColumnName  string         // Column name (for display)
	Operator    FilterOperator // Comparison operator
	Value       string         // Value to compare against
	NumValue    float64        // Parsed numeric value (for numeric comparisons)
	IsNumeric   bool           // Whether this is a numeric comparison
	RegexPat    *regexp.Regexp // Compiled regex pattern (for regex operator)
}

Filter represents a single filter condition

func ParseFilter

func ParseFilter(expr string, header []string) (*Filter, error)

ParseFilter parses a filter expression like "amount > 100" or "symbol = 'AAPL'"

func (*Filter) Evaluate

func (f *Filter) Evaluate(record []string) (bool, error)

Evaluate checks if a record matches the filter

func (*Filter) String

func (f *Filter) String() string

String returns a human-readable representation of the filter

type FilterCondition

type FilterCondition struct {
	Filter *Filter
}

FilterCondition is a leaf node (single filter)

func (*FilterCondition) Evaluate

func (fc *FilterCondition) Evaluate(record []string) (bool, error)

Evaluate for FilterCondition

func (*FilterCondition) String

func (fc *FilterCondition) String() string

String for FilterCondition

type FilterExpression

type FilterExpression interface {
	Evaluate(record []string) (bool, error)
	String() string
}

FilterExpression represents a filter expression tree

type FilterGroup

type FilterGroup struct {
	Operator    LogicalOperator
	Expressions []FilterExpression
}

FilterGroup is a composite node (multiple expressions with AND/OR)

func (*FilterGroup) Evaluate

func (fg *FilterGroup) Evaluate(record []string) (bool, error)

Evaluate for FilterGroup

func (*FilterGroup) String

func (fg *FilterGroup) String() string

String for FilterGroup

type FilterOperator

type FilterOperator int

FilterOperator represents a comparison operator

const (
	OpEqual FilterOperator = iota
	OpNotEqual
	OpGreaterThan
	OpGreaterThanOrEqual
	OpLessThan
	OpLessThanOrEqual
	OpContains
	OpStartsWith
	OpEndsWith
	OpRegex
)

type FilterSet

type FilterSet struct {
	Filters []*Filter        // Deprecated: kept for backward compatibility
	Root    FilterExpression // Root of the expression tree
}

FilterSet represents a collection of filters with support for AND/OR logic

func NewFilterSet

func NewFilterSet(filterExprs []string, header []string) (*FilterSet, error)

NewFilterSet creates a new filter set from multiple filter expressions. It supports both simple (backward-compatible) and complex (with AND/OR) expressions.

func (*FilterSet) Evaluate

func (fs *FilterSet) Evaluate(record []string) (bool, error)

Evaluate checks if a record matches the filter expression tree

func (*FilterSet) String

func (fs *FilterSet) String() string

String returns a human-readable representation of the filter set

type GlobalAmountAggregator

type GlobalAmountAggregator struct {
	Stats *AmountStats
}

func NewGlobalAmountAggregator

func NewGlobalAmountAggregator() *GlobalAmountAggregator

func (*GlobalAmountAggregator) Consume

func (g *GlobalAmountAggregator) Consume(row *LogicalRow)

func (*GlobalAmountAggregator) Report

func (g *GlobalAmountAggregator) Report(w io.Writer)

type GroupByAggregator

type GroupByAggregator struct {
	// contains filtered or unexported fields
}

func NewGroupByAggregator

func NewGroupByAggregator() *GroupByAggregator

func (*GroupByAggregator) Consume

func (g *GroupByAggregator) Consume(row *LogicalRow)

func (*GroupByAggregator) Report

func (g *GroupByAggregator) Report(w io.Writer)

type LogicalOperator

type LogicalOperator int

LogicalOperator represents AND or OR

const (
	LogicalAND LogicalOperator = iota
	LogicalOR
)

type LogicalRow

type LogicalRow struct {
	RawRecord []string // keep the original record if needed
	Amount    float64  // parsed numeric column
	GroupKey  string   // optional group-by key
}

LogicalRow represents a cleaned / typed version of a CSV row. For now we only care about a single numeric column: amount.

func ParseLogicalRow

func ParseLogicalRow(record []string) (*LogicalRow, error)

func ParseLogicalRowWithGroupBy

func ParseLogicalRowWithGroupBy(record []string, groupByIndex int) (*LogicalRow, error)

type SchemaInferenceConfig

type SchemaInferenceConfig struct {
	SampleSize       int     // Number of rows to analyze (0 = all)
	MinConfidence    float64 // Minimum confidence to infer type (0.0-1.0)
	MaxUniqueForEnum int     // Max unique values to consider as enum
	SampleCount      int     // Number of sample values to keep
}

SchemaInferenceConfig configures the schema inference process

func DefaultInferenceConfig

func DefaultInferenceConfig() *SchemaInferenceConfig

DefaultInferenceConfig returns default configuration for schema inference

type ValidationError

type ValidationError struct {
	Column   int
	ColName  string
	Value    string
	Expected string
}

ValidationError represents a validation error for a specific column

func (*ValidationError) Error

func (e *ValidationError) Error() string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL