Documentation
¶
Index ¶
- func PrintSchemaAsCode(schema *CSVSchema, schemaName string) string
- type Aggregator
- type AmountStats
- type CSVSchema
- type ColumnDef
- type ColumnStats
- type ColumnType
- type CompositeAggregator
- type DebugAggregator
- type Filter
- type FilterCondition
- type FilterExpression
- type FilterGroup
- type FilterOperator
- type FilterSet
- type GlobalAmountAggregator
- type GroupByAggregator
- type LogicalOperator
- type LogicalRow
- type SchemaInferenceConfig
- type ValidationError
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func PrintSchemaAsCode ¶
PrintSchemaAsCode generates Go code for the inferred schema
Types ¶
type Aggregator ¶
type Aggregator interface {
Consume(row *LogicalRow)
Report(w io.Writer)
}
type AmountStats ¶
AmountStats holds simple streaming statistics for a numeric field.
func NewAmountStats ¶
func NewAmountStats() *AmountStats
NewAmountStats creates a new AmountStats with proper initial values.
func (*AmountStats) Add ¶
func (s *AmountStats) Add(row *LogicalRow)
Add updates the stats with a new logical row.
func (*AmountStats) Average ¶
func (s *AmountStats) Average() float64
Average returns the average amount or NaN if there is no data.
func (*AmountStats) HasData ¶
func (s *AmountStats) HasData() bool
HasData returns true if at least one value has been added.
type CSVSchema ¶
type CSVSchema struct {
Columns []ColumnDef
MinColumns int // Minimum number of columns required
StrictColumns bool // If true, reject rows with extra columns
}
CSVSchema represents the complete schema for a CSV file
Example (Custom) ¶
Example: Creating a custom CSV schema for user data
minAge := 18.0
maxAge := 120.0
userSchema := &CSVSchema{
MinColumns: 5,
StrictColumns: true,
Columns: []ColumnDef{
{
Index: 0,
Name: "UserID",
Type: TypeInt,
Required: true,
Min: &minAge, // reusing variable, means >= 18
},
{
Index: 1,
Name: "Email",
Type: TypeEmail,
Required: true,
MaxLength: 100,
},
{
Index: 2,
Name: "Age",
Type: TypeInt,
Required: true,
Min: &minAge,
Max: &maxAge,
},
{
Index: 3,
Name: "Status",
Type: TypeString,
Required: true,
AllowedVals: []string{"active", "inactive", "pending"},
},
{
Index: 4,
Name: "JoinDate",
Type: TypeDate,
Required: true,
DateFormat: "2006-01-02",
},
},
}
// Validate a record
validRecord := []string{"1001", "[email protected]", "25", "active", "2023-01-15"}
if err := userSchema.ValidateRecord(validRecord); err != nil {
panic(err)
}
// This will fail validation
invalidRecord := []string{"1002", "invalid-email", "15", "unknown", "2023-01-15"}
_ = userSchema.ValidateRecord(invalidRecord) // Returns error
func InferSchemaFromCSV ¶
func InferSchemaFromCSV(reader io.Reader, sep rune, hasHeader bool, config *SchemaInferenceConfig) (*CSVSchema, error)
InferSchemaFromCSV analyzes a CSV file and infers its schema
func NewStockDataSchema ¶
func NewStockDataSchema() *CSVSchema
NewStockDataSchema creates a schema for the generated stock market CSV
func (*CSVSchema) ValidateRecord ¶
func (s *CSVSchema) ValidateRecord(record []string) error
ValidateRecord validates a CSV record against the schema
type ColumnDef ¶
type ColumnDef struct {
Index int // Column index (0-based)
Name string // Column name (for documentation/errors)
Type ColumnType // Expected data type
Required bool // Whether the column must be non-empty
MinLength int // Minimum string length (for TypeString)
MaxLength int // Maximum string length (for TypeString)
Min *float64 // Minimum value (for TypeInt/TypeFloat)
Max *float64 // Maximum value (for TypeInt/TypeFloat)
Pattern string // Regex pattern (for TypeRegex)
DateFormat string // Date format (for TypeDate/TypeDateTime)
AllowedVals []string // Whitelist of allowed values
}
ColumnDef defines the schema for a single CSV column
type ColumnStats ¶
type ColumnStats struct {
Index int
Name string
TotalValues int
EmptyValues int
UniqueValues map[string]bool
NumericValues int
IntegerValues int
FloatValues int
BoolValues int
DateValues int
DateTimeValues int
EmailValues int
MinNumeric float64
MaxNumeric float64
MinLength int
MaxLength int
SampleValues []string // Keep first few samples
}
ColumnStats holds statistics gathered during schema inference
type ColumnType ¶
type ColumnType int
ColumnType represents the expected data type of a CSV column
const (
	TypeString ColumnType = iota
	TypeInt
	TypeFloat
	TypeBool
	TypeDate
	TypeDateTime
	TypeEmail
	TypeRegex
)
type CompositeAggregator ¶
type CompositeAggregator struct {
// contains filtered or unexported fields
}
func NewCompositeAggregator ¶
func NewCompositeAggregator(aggs ...Aggregator) *CompositeAggregator
func (*CompositeAggregator) Consume ¶
func (c *CompositeAggregator) Consume(row *LogicalRow)
func (*CompositeAggregator) Report ¶
func (c *CompositeAggregator) Report(w io.Writer)
type DebugAggregator ¶
type DebugAggregator struct {
// contains filtered or unexported fields
}
func NewDebugAggregator ¶
func NewDebugAggregator(maxRows int) *DebugAggregator
func (*DebugAggregator) Consume ¶
func (g *DebugAggregator) Consume(row *LogicalRow)
func (*DebugAggregator) Report ¶
func (g *DebugAggregator) Report(w io.Writer)
type Filter ¶
type Filter struct {
ColumnIndex int // Column index to filter on
ColumnName string // Column name (for display)
Operator FilterOperator // Comparison operator
Value string // Value to compare against
NumValue float64 // Parsed numeric value (for numeric comparisons)
IsNumeric bool // Whether this is a numeric comparison
RegexPat *regexp.Regexp // Compiled regex pattern (for regex operator)
}
Filter represents a single filter condition
func ParseFilter ¶
ParseFilter parses a filter expression like "amount > 100" or "symbol = 'AAPL'"
type FilterCondition ¶
type FilterCondition struct {
Filter *Filter
}
FilterCondition is a leaf node (single filter)
func (*FilterCondition) Evaluate ¶
func (fc *FilterCondition) Evaluate(record []string) (bool, error)
Evaluate for FilterCondition
func (*FilterCondition) String ¶
func (fc *FilterCondition) String() string
String for FilterCondition
type FilterExpression ¶
FilterExpression represents a filter expression tree
type FilterGroup ¶
type FilterGroup struct {
Operator LogicalOperator
Expressions []FilterExpression
}
FilterGroup is a composite node (multiple expressions with AND/OR)
type FilterOperator ¶
type FilterOperator int
FilterOperator represents a comparison operator
const (
	OpEqual FilterOperator = iota
	OpNotEqual
	OpGreaterThan
	OpGreaterThanOrEqual
	OpLessThan
	OpLessThanOrEqual
	OpContains
	OpStartsWith
	OpEndsWith
	OpRegex
)
type FilterSet ¶
type FilterSet struct {
Filters []*Filter // Deprecated: kept for backward compatibility
Root FilterExpression // Root of the expression tree
}
FilterSet represents a collection of filters with support for AND/OR logic
func NewFilterSet ¶
NewFilterSet creates a new filter set from multiple filter expressions Supports both simple (backward compatible) and complex (with AND/OR) expressions
type GlobalAmountAggregator ¶
type GlobalAmountAggregator struct {
Stats *AmountStats
}
func NewGlobalAmountAggregator ¶
func NewGlobalAmountAggregator() *GlobalAmountAggregator
func (*GlobalAmountAggregator) Consume ¶
func (g *GlobalAmountAggregator) Consume(row *LogicalRow)
func (*GlobalAmountAggregator) Report ¶
func (g *GlobalAmountAggregator) Report(w io.Writer)
type GroupByAggregator ¶
type GroupByAggregator struct {
// contains filtered or unexported fields
}
func NewGroupByAggregator ¶
func NewGroupByAggregator() *GroupByAggregator
func (*GroupByAggregator) Consume ¶
func (g *GroupByAggregator) Consume(row *LogicalRow)
func (*GroupByAggregator) Report ¶
func (g *GroupByAggregator) Report(w io.Writer)
type LogicalOperator ¶
type LogicalOperator int
LogicalOperator represents AND or OR
const (
	LogicalAND LogicalOperator = iota
	LogicalOR
)
type LogicalRow ¶
type LogicalRow struct {
RawRecord []string // keep the original record if needed
Amount float64 // parsed numeric column
GroupKey string // optional group-by key
}
LogicalRow represents a cleaned / typed version of a CSV row. For now we only care about a single numeric column: amount.
func ParseLogicalRow ¶
func ParseLogicalRow(record []string) (*LogicalRow, error)
func ParseLogicalRowWithGroupBy ¶
func ParseLogicalRowWithGroupBy(record []string, groupByIndex int) (*LogicalRow, error)
type SchemaInferenceConfig ¶
type SchemaInferenceConfig struct {
SampleSize int // Number of rows to analyze (0 = all)
MinConfidence float64 // Minimum confidence to infer type (0.0-1.0)
MaxUniqueForEnum int // Max unique values to consider as enum
SampleCount int // Number of sample values to keep
}
SchemaInferenceConfig configures the schema inference process
func DefaultInferenceConfig ¶
func DefaultInferenceConfig() *SchemaInferenceConfig
DefaultInferenceConfig returns default configuration for schema inference
type ValidationError ¶
ValidationError represents a validation error for a specific column
func (*ValidationError) Error ¶
func (e *ValidationError) Error() string