largedataset

package

v0.0.0-...-f2a1cc4 (Latest) | Published: Dec 27, 2025 | License: MIT | Imports: 9 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/forgeronvirtuel/lab-golang

Links

Open Source Insights

Documentation ¶

Index ¶

func PrintSchemaAsCode(schema *CSVSchema, schemaName string) string
type Aggregator
type AmountStats
- func NewAmountStats() *AmountStats
- func (s *AmountStats) Add(row *LogicalRow)
- func (s *AmountStats) Average() float64
- func (s *AmountStats) HasData() bool
type CSVSchema
- func InferSchemaFromCSV(reader io.Reader, sep rune, hasHeader bool, config *SchemaInferenceConfig) (*CSVSchema, error)
- func NewStockDataSchema() *CSVSchema
- func (s *CSVSchema) ValidateRecord(record []string) error
type ColumnDef
type ColumnStats
type ColumnType
type CompositeAggregator
- func NewCompositeAggregator(aggs ...Aggregator) *CompositeAggregator
- func (c *CompositeAggregator) Consume(row *LogicalRow)
- func (c *CompositeAggregator) Report(w io.Writer)
type DebugAggregator
- func NewDebugAggregator(maxRows int) *DebugAggregator
- func (g *DebugAggregator) Consume(row *LogicalRow)
- func (g *DebugAggregator) Report(w io.Writer)
type Filter
- func ParseFilter(expr string, header []string) (*Filter, error)
- func (f *Filter) Evaluate(record []string) (bool, error)
- func (f *Filter) String() string
type FilterCondition
- func (fc *FilterCondition) Evaluate(record []string) (bool, error)
- func (fc *FilterCondition) String() string
type FilterExpression
type FilterGroup
- func (fg *FilterGroup) Evaluate(record []string) (bool, error)
- func (fg *FilterGroup) String() string
type FilterOperator
type FilterSet
- func NewFilterSet(filterExprs []string, header []string) (*FilterSet, error)
- func (fs *FilterSet) Evaluate(record []string) (bool, error)
- func (fs *FilterSet) String() string
type GlobalAmountAggregator
- func NewGlobalAmountAggregator() *GlobalAmountAggregator
- func (g *GlobalAmountAggregator) Consume(row *LogicalRow)
- func (g *GlobalAmountAggregator) Report(w io.Writer)
type GroupByAggregator
- func NewGroupByAggregator() *GroupByAggregator
- func (g *GroupByAggregator) Consume(row *LogicalRow)
- func (g *GroupByAggregator) Report(w io.Writer)
type LogicalOperator
type LogicalRow
- func ParseLogicalRow(record []string) (*LogicalRow, error)
- func ParseLogicalRowWithGroupBy(record []string, groupByIndex int) (*LogicalRow, error)
type SchemaInferenceConfig
- func DefaultInferenceConfig() *SchemaInferenceConfig
type ValidationError
- func (e *ValidationError) Error() string

Examples ¶

CSVSchema (Custom)

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func PrintSchemaAsCode ¶

func PrintSchemaAsCode(schema *CSVSchema, schemaName string) string

PrintSchemaAsCode generates Go code for the inferred schema

Types ¶

type Aggregator ¶

type Aggregator interface {
	Consume(row *LogicalRow)
	Report(w io.Writer)
}

type AmountStats ¶

type AmountStats struct {
	Count int64
	Sum   float64
	Min   float64
	Max   float64
}

AmountStats holds simple streaming statistics for a numeric field.

func NewAmountStats ¶

func NewAmountStats() *AmountStats

NewAmountStats creates a new AmountStats with proper initial values.

func (*AmountStats) Add ¶

func (s *AmountStats) Add(row *LogicalRow)

Add updates the stats with a new logical row.

func (*AmountStats) Average ¶

func (s *AmountStats) Average() float64

Average returns the average amount or NaN if there is no data.

func (*AmountStats) HasData ¶

func (s *AmountStats) HasData() bool

HasData returns true if at least one value has been added.

type CSVSchema ¶

type CSVSchema struct {
	Columns       []ColumnDef
	MinColumns    int  // Minimum number of columns required
	StrictColumns bool // If true, reject rows with extra columns
}

CSVSchema represents the complete schema for a CSV file

Example (Custom) ¶

Example: Creating a custom CSV schema for user data

minAge := 18.0
maxAge := 120.0

userSchema := &CSVSchema{
	MinColumns:    5,
	StrictColumns: true,
	Columns: []ColumnDef{
		{
			Index:    0,
			Name:     "UserID",
			Type:     TypeInt,
			Required: true,
			Min:      &minAge, // reusing variable, means >= 18
		},
		{
			Index:     1,
			Name:      "Email",
			Type:      TypeEmail,
			Required:  true,
			MaxLength: 100,
		},
		{
			Index:    2,
			Name:     "Age",
			Type:     TypeInt,
			Required: true,
			Min:      &minAge,
			Max:      &maxAge,
		},
		{
			Index:       3,
			Name:        "Status",
			Type:        TypeString,
			Required:    true,
			AllowedVals: []string{"active", "inactive", "pending"},
		},
		{
			Index:      4,
			Name:       "JoinDate",
			Type:       TypeDate,
			Required:   true,
			DateFormat: "2006-01-02",
		},
	},
}

// Validate a record
validRecord := []string{"1001", "user@example.com", "25", "active", "2023-01-15"}
if err := userSchema.ValidateRecord(validRecord); err != nil {
	panic(err)
}

// This will fail validation
invalidRecord := []string{"1002", "invalid-email", "15", "unknown", "2023-01-15"}
_ = userSchema.ValidateRecord(invalidRecord) // Returns error

func InferSchemaFromCSV ¶

func InferSchemaFromCSV(reader io.Reader, sep rune, hasHeader bool, config *SchemaInferenceConfig) (*CSVSchema, error)

InferSchemaFromCSV analyzes a CSV file and infers its schema

func NewStockDataSchema ¶

func NewStockDataSchema() *CSVSchema

NewStockDataSchema creates a schema for the generated stock market CSV

func (*CSVSchema) ValidateRecord ¶

func (s *CSVSchema) ValidateRecord(record []string) error

ValidateRecord validates a CSV record against the schema

type ColumnDef ¶

type ColumnDef struct {
	Index       int        // Column index (0-based)
	Name        string     // Column name (for documentation/errors)
	Type        ColumnType // Expected data type
	Required    bool       // Whether the column must be non-empty
	MinLength   int        // Minimum string length (for TypeString)
	MaxLength   int        // Maximum string length (for TypeString)
	Min         *float64   // Minimum value (for TypeInt/TypeFloat)
	Max         *float64   // Maximum value (for TypeInt/TypeFloat)
	Pattern     string     // Regex pattern (for TypeRegex)
	DateFormat  string     // Date format (for TypeDate/TypeDateTime)
	AllowedVals []string   // Whitelist of allowed values
}

ColumnDef defines the schema for a single CSV column

type ColumnStats ¶

type ColumnStats struct {
	Index          int
	Name           string
	TotalValues    int
	EmptyValues    int
	UniqueValues   map[string]bool
	NumericValues  int
	IntegerValues  int
	FloatValues    int
	BoolValues     int
	DateValues     int
	DateTimeValues int
	EmailValues    int
	MinNumeric     float64
	MaxNumeric     float64
	MinLength      int
	MaxLength      int
	SampleValues   []string // Keep first few samples
}

ColumnStats holds statistics gathered during schema inference

type ColumnType ¶

type ColumnType int

ColumnType represents the expected data type of a CSV column

const (
	TypeString ColumnType = iota
	TypeInt
	TypeFloat
	TypeBool
	TypeDate
	TypeDateTime
	TypeEmail
	TypeRegex
)

type CompositeAggregator ¶

type CompositeAggregator struct {
	// contains filtered or unexported fields
}

func NewCompositeAggregator ¶

func NewCompositeAggregator(aggs ...Aggregator) *CompositeAggregator

func (*CompositeAggregator) Consume ¶

func (c *CompositeAggregator) Consume(row *LogicalRow)

func (*CompositeAggregator) Report ¶

func (c *CompositeAggregator) Report(w io.Writer)

type DebugAggregator ¶

type DebugAggregator struct {
	// contains filtered or unexported fields
}

func NewDebugAggregator ¶

func NewDebugAggregator(maxRows int) *DebugAggregator

func (*DebugAggregator) Consume ¶

func (g *DebugAggregator) Consume(row *LogicalRow)

func (*DebugAggregator) Report ¶

func (g *DebugAggregator) Report(w io.Writer)

type Filter ¶

type Filter struct {
	ColumnIndex int            // Column index to filter on
	ColumnName  string         // Column name (for display)
	Operator    FilterOperator // Comparison operator
	Value       string         // Value to compare against
	NumValue    float64        // Parsed numeric value (for numeric comparisons)
	IsNumeric   bool           // Whether this is a numeric comparison
	RegexPat    *regexp.Regexp // Compiled regex pattern (for regex operator)
}

Filter represents a single filter condition

func ParseFilter ¶

func ParseFilter(expr string, header []string) (*Filter, error)

ParseFilter parses a filter expression like "amount > 100" or "symbol = 'AAPL'"

func (*Filter) Evaluate ¶

func (f *Filter) Evaluate(record []string) (bool, error)

Evaluate checks if a record matches the filter

func (*Filter) String ¶

func (f *Filter) String() string

String returns a human-readable representation of the filter

type FilterCondition ¶

type FilterCondition struct {
	Filter *Filter
}

FilterCondition is a leaf node (single filter)

func (*FilterCondition) Evaluate ¶

func (fc *FilterCondition) Evaluate(record []string) (bool, error)

Evaluate for FilterCondition

func (*FilterCondition) String ¶

func (fc *FilterCondition) String() string

String for FilterCondition

type FilterExpression ¶

type FilterExpression interface {
	Evaluate(record []string) (bool, error)
	String() string
}

FilterExpression represents a filter expression tree

type FilterGroup ¶

type FilterGroup struct {
	Operator    LogicalOperator
	Expressions []FilterExpression
}

FilterGroup is a composite node (multiple expressions with AND/OR)

func (*FilterGroup) Evaluate ¶

func (fg *FilterGroup) Evaluate(record []string) (bool, error)

Evaluate for FilterGroup

func (*FilterGroup) String ¶

func (fg *FilterGroup) String() string

String for FilterGroup

type FilterOperator ¶

type FilterOperator int

FilterOperator represents a comparison operator

const (
	OpEqual FilterOperator = iota
	OpNotEqual
	OpGreaterThan
	OpGreaterThanOrEqual
	OpLessThan
	OpLessThanOrEqual
	OpContains
	OpStartsWith
	OpEndsWith
	OpRegex
)

type FilterSet ¶

type FilterSet struct {
	Filters []*Filter        // Deprecated: kept for backward compatibility
	Root    FilterExpression // Root of the expression tree
}

FilterSet represents a collection of filters with support for AND/OR logic

func NewFilterSet ¶

func NewFilterSet(filterExprs []string, header []string) (*FilterSet, error)

NewFilterSet creates a new filter set from multiple filter expressions. It supports both simple (backward-compatible) and complex (with AND/OR) expressions.

func (*FilterSet) Evaluate ¶

func (fs *FilterSet) Evaluate(record []string) (bool, error)

Evaluate checks if a record matches the filter expression tree

func (*FilterSet) String ¶

func (fs *FilterSet) String() string

String returns a human-readable representation of the filter set

type GlobalAmountAggregator ¶

type GlobalAmountAggregator struct {
	Stats *AmountStats
}

func NewGlobalAmountAggregator ¶

func NewGlobalAmountAggregator() *GlobalAmountAggregator

func (*GlobalAmountAggregator) Consume ¶

func (g *GlobalAmountAggregator) Consume(row *LogicalRow)

func (*GlobalAmountAggregator) Report ¶

func (g *GlobalAmountAggregator) Report(w io.Writer)

type GroupByAggregator ¶

type GroupByAggregator struct {
	// contains filtered or unexported fields
}

func NewGroupByAggregator ¶

func NewGroupByAggregator() *GroupByAggregator

func (*GroupByAggregator) Consume ¶

func (g *GroupByAggregator) Consume(row *LogicalRow)

func (*GroupByAggregator) Report ¶

func (g *GroupByAggregator) Report(w io.Writer)

type LogicalOperator ¶

type LogicalOperator int

LogicalOperator represents AND or OR

const (
	LogicalAND LogicalOperator = iota
	LogicalOR
)

type LogicalRow ¶

type LogicalRow struct {
	RawRecord []string // keep the original record if needed
	Amount    float64  // parsed numeric column
	GroupKey  string   // optional group-by key
}

LogicalRow represents a cleaned / typed version of a CSV row. For now we only care about a single numeric column: amount.

func ParseLogicalRow ¶

func ParseLogicalRow(record []string) (*LogicalRow, error)

func ParseLogicalRowWithGroupBy ¶

func ParseLogicalRowWithGroupBy(record []string, groupByIndex int) (*LogicalRow, error)

type SchemaInferenceConfig ¶

type SchemaInferenceConfig struct {
	SampleSize       int     // Number of rows to analyze (0 = all)
	MinConfidence    float64 // Minimum confidence to infer type (0.0-1.0)
	MaxUniqueForEnum int     // Max unique values to consider as enum
	SampleCount      int     // Number of sample values to keep
}

SchemaInferenceConfig configures the schema inference process

func DefaultInferenceConfig ¶

func DefaultInferenceConfig() *SchemaInferenceConfig

DefaultInferenceConfig returns default configuration for schema inference

type ValidationError ¶

type ValidationError struct {
	Column   int
	ColName  string
	Value    string
	Expected string
}

ValidationError represents a validation error for a specific column

func (*ValidationError) Error ¶

func (e *ValidationError) Error() string

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL