diff --git a/SHELL_FEATURES.md b/SHELL_FEATURES.md index 4e4b7a7ee..c710f630f 100644 --- a/SHELL_FEATURES.md +++ b/SHELL_FEATURES.md @@ -7,7 +7,7 @@ The in-shell `help` command mirrors these feature categories: run `help` for a c ## Builtins -- ✅ `awk [-F SEP] [-v NAME=VALUE] ['PROGRAM'|-f PROGRAM-FILE] [FILE]...` — pattern scanning and text processing; Phase 1 supports BEGIN/main/END rules, read-only fields (`$0`, `$1`, `$NF`), `NF`/`NR`/`FNR`/`FILENAME`, `FS`/`OFS`/`ORS`, `print`, scalar assignment, arithmetic/comparison/boolean expressions, regex patterns and `~`/`!~`, and string concatenation; `system()`, command pipes, output redirection, `getline`, arrays, control flow, `printf`, regex `FS`, and field mutation are rejected or deferred +- ✅ `awk [-F SEP] [-v NAME=VALUE] ['PROGRAM'|-f PROGRAM-FILE] [FILE]...` — pattern scanning and text processing; supports BEGIN/main/END rules, read-only fields (`$0`, `$1`, `$NF`), `NF`/`NR`/`FNR`/`FILENAME`, `FS`/`OFS`/`ORS`, `print`, `printf`, scalar assignment, arithmetic/comparison/boolean expressions, regex patterns and `~`/`!~`, string concatenation, `if`/`else`, `next`, and scalar builtins (`length`, `substr`, `index`, `tolower`, `toupper`, `int`); `system()`, command pipes, output redirection, `getline`, arrays, loops, regex `FS`, and field mutation are rejected or deferred - ✅ `break` — exit the innermost `for` loop - ✅ `cat [-AbeEnstTuv] [FILE]...` — concatenate files to stdout; supports line numbering, blank squeezing, and non-printing character display - ✅ `continue` — skip to the next iteration of the innermost `for` loop diff --git a/analysis/symbols_builtins.go b/analysis/symbols_builtins.go index b39309523..f5d4f2a6d 100644 --- a/analysis/symbols_builtins.go +++ b/analysis/symbols_builtins.go @@ -30,21 +30,34 @@ var builtinPerCommandSymbols = map[string][]string{ "awk": { "bufio.NewScanner", // 🟢 line-by-line record reading; no write or exec capability. "context.Context", // 🟢 deadline/cancellation plumbing; pure interface, no side effects. + "errors.Is", // 🟢 error comparison; pure function, no I/O. + "errors.New", // 🟢 creates a simple error value; pure function, no I/O. "fmt.Errorf", // 🟢 error formatting; pure function, no I/O. + "fmt.Sprintf", // 🟢 string formatting for awk printf; pure function, no I/O. "io.EOF", // 🟢 sentinel error value; pure constant. "io.NopCloser", // 🟢 wraps a Reader with a no-op Close; no side effects. "io.ReadCloser", // 🟢 interface type; no side effects. + "math/big.Float", // 🟢 arbitrary-precision float type used to convert large awk printf integers; pure in-memory arithmetic. + "math/big.Int", // 🟢 arbitrary-precision integer type used for large awk printf integers; pure in-memory arithmetic. + "math/big.NewInt", // 🟢 constructs an in-memory integer value; pure function, no I/O. + "math.IsInf", // 🟢 IEEE 754 infinity check; pure function, no I/O. + "math.IsNaN", // 🟢 IEEE 754 NaN check; pure function, no I/O. "math.Mod", // 🟢 pure arithmetic modulo for awk % operator; no side effects. + "math.Trunc", // 🟢 pure arithmetic truncation for awk int(); no side effects. "os.O_RDONLY", // 🟢 read-only file flag constant; cannot open files by itself. "regexp.Compile", // 🟢 compiles a regular expression; pure function, no I/O. Uses RE2 engine (linear-time, no backtracking). "regexp.Regexp", // 🟢 compiled regular expression type; no I/O side effects. All matching methods are linear-time (RE2). "strconv.FormatFloat", // 🟢 float-to-string conversion for awk numeric output; pure function. "strconv.ParseFloat", // 🟢 string-to-float conversion; pure function, no I/O. "strings.Builder", // 🟢 efficient string concatenation; pure in-memory buffer, no I/O. + "strings.ContainsRune", // 🟢 checks if a rune is in a string; pure function, no I/O. "strings.Cut", // 🟢 splits a string around the first separator; pure function, no I/O. + "strings.Index", // 🟢 substring search for awk index(); pure function, no I/O. "strings.Join", // 🟢 concatenates a slice of strings with a separator; pure function, no I/O. "strings.NewReader", // 🟢 wraps a string as an io.Reader; pure in-memory, no I/O. "strings.Split", // 🟢 splits a string by separator into a slice; pure function, no I/O. + "strings.ToLower", // 🟢 converts string to lowercase for awk tolower(); pure function, no I/O. + "strings.ToUpper", // 🟢 converts string to uppercase for awk toupper(); pure function, no I/O. "strings.TrimSpace", // 🟢 removes leading/trailing whitespace; pure function. "unicode/utf8.DecodeRuneInString", // 🟢 decodes first UTF-8 rune from a string; pure function, no I/O. "unicode/utf8.RuneError", // 🟢 replacement character returned for invalid UTF-8; constant, no I/O. @@ -574,6 +587,9 @@ var builtinAllowedSymbols = []string{ "io/fs.ModeSticky", // 🟢 file mode bit constant for sticky bit; pure constant. "io/fs.ModeSymlink", // 🟢 file mode bit constant for symlinks; pure constant. "io/fs.ReadDirFile", // 🟢 read-only directory handle interface; no write capability. + "math/big.Float", // 🟢 arbitrary-precision float type; pure in-memory arithmetic. + "math/big.Int", // 🟢 arbitrary-precision integer type; pure in-memory arithmetic. + "math/big.NewInt", // 🟢 constructs an in-memory integer value; pure function, no I/O. "math.Ceil", // 🟢 pure arithmetic; no side effects. "math.Floor", // 🟢 pure arithmetic; no side effects. "math.Inf", // 🟢 returns positive or negative infinity; pure function, no I/O. @@ -585,6 +601,7 @@ var builtinAllowedSymbols = []string{ "math.MinInt64", // 🟢 integer constant; no side effects. "math.Mod", // 🟢 pure arithmetic modulo; no side effects. "math.NaN", // 🟢 returns IEEE 754 NaN value; pure function, no I/O. + "math.Trunc", // 🟢 pure arithmetic truncation toward zero; no side effects. "net.DefaultResolver", // 🔴 default system DNS resolver; used for context-aware address lookup; network I/O is the explicit purpose of the ping builtin. "net.FlagBroadcast", // 🟢 interface flag constant: broadcast capability; pure constant, no network connections. "net.IPAddr", // 🟢 resolved IP address struct (IP + Zone); pure data type, no I/O. @@ -639,12 +656,14 @@ var builtinAllowedSymbols = []string{ "strings.Cut", // 🟢 splits a string around the first separator; pure function, no I/O. "strings.Fields", // 🟢 splits a string on whitespace into a slice; pure function, no I/O. "strings.HasPrefix", // 🟢 pure function for prefix matching; no I/O. + "strings.Index", // 🟢 substring search; pure function, no I/O. "strings.IndexByte", // 🟢 finds byte in string; pure function, no I/O. "strings.Join", // 🟢 concatenates a slice of strings with a separator; pure function, no I/O. "strings.NewReader", // 🟢 wraps a string as an io.Reader; pure in-memory, no I/O. "strings.ReplaceAll", // 🟢 replaces all occurrences of a substring; pure function, no I/O. "strings.Split", // 🟢 splits a string by separator into a slice; pure function, no I/O. "strings.ToLower", // 🟢 converts string to lowercase; pure function, no I/O. + "strings.ToUpper", // 🟢 converts string to uppercase; pure function, no I/O. "strings.TrimPrefix", // 🟢 removes a leading prefix from a string; pure function, no I/O. "strings.TrimSpace", // 🟢 removes leading/trailing whitespace; pure function. "syscall.ByHandleFileInformation", // 🟢 Windows file info struct for extracting nlink; read-only type, no I/O. diff --git a/builtins/awk/ast.go b/builtins/awk/ast.go index 4afe00b56..d68bde506 100644 --- a/builtins/awk/ast.go +++ b/builtins/awk/ast.go @@ -33,6 +33,24 @@ type printStmt struct { func (*printStmt) stmtNode() {} +type printfStmt struct { + args []expr +} + +func (*printfStmt) stmtNode() {} + +type ifStmt struct { + cond expr + thenStmts []stmt + elseStmts []stmt +} + +func (*ifStmt) stmtNode() {} + +type nextStmt struct{} + +func (*nextStmt) stmtNode() {} + type exprStmt struct { x expr } @@ -110,3 +128,10 @@ type incDecExpr struct { } func (*incDecExpr) exprNode() {} + +type callExpr struct { + name string + args []expr +} + +func (*callExpr) exprNode() {} diff --git a/builtins/awk/awk.go b/builtins/awk/awk.go index 3a41fdc0a..c70d97efc 100644 --- a/builtins/awk/awk.go +++ b/builtins/awk/awk.go @@ -11,16 +11,17 @@ // // awk [OPTION]... -f program-file [FILE]... // -// Phase 1 implements a practical, intentionally restricted awk profile: -// program loading from an inline argument or -f files, -F one-character field -// separators, -v scalar variables, BEGIN/main/END rules, print, scalar -// assignment, arithmetic/comparison/boolean expressions, regex patterns and -// match operators, string concatenation, and read-only fields/built-in -// variables such as $0, $1, NF, NR, FNR, FILENAME, FS, OFS, and ORS. +// This implements a practical, intentionally restricted awk profile: program +// loading from an inline argument or -f files, -F one-character field +// separators, -v scalar variables, BEGIN/main/END rules, print and printf, +// scalar assignment, if/else, next, arithmetic/comparison/boolean expressions, +// regex patterns and match operators, string concatenation, scalar built-in +// functions, and read-only fields/built-in variables such as $0, $1, NF, NR, +// FNR, FILENAME, FS, OFS, and ORS. // // Blocked or deferred features include system(), command pipes, output -// redirection, getline, arrays, control flow statements, printf, user-defined -// functions, regex FS, and field mutation/$0 rebuilding. +// redirection, getline, arrays, loops, user-defined functions, regex FS, and +// field mutation/$0 rebuilding. package awk import ( diff --git a/builtins/awk/eval.go b/builtins/awk/eval.go index 41f069fd9..475f814dd 100644 --- a/builtins/awk/eval.go +++ b/builtins/awk/eval.go @@ -6,11 +6,14 @@ package awk import ( + "errors" "fmt" "math" "strings" ) +var errNextRecord = errors.New("next record") + func (rt *runtime) execStatements(stmts []stmt) error { for _, st := range stmts { switch s := st.(type) { @@ -30,6 +33,39 @@ func (rt *runtime) execStatements(stmts []stmt) error { if err := rt.printValues(vals); err != nil { return err } + case *printfStmt: + if len(s.args) == 0 { + return fmt.Errorf("printf requires a format expression") + } + vals := make([]value, 0, len(s.args)) + for _, arg := range s.args { + v, err := rt.eval(arg) + if err != nil { + return err + } + vals = append(vals, v) + } + out, err := formatPrintf(vals[0].String(), vals[1:]) + if err != nil { + return err + } + rt.callCtx.Out(out) + case *ifStmt: + cond, err := rt.eval(s.cond) + if err != nil { + return err + } + if cond.Bool() { + if err := rt.execStatements(s.thenStmts); err != nil { + return err + } + } else if len(s.elseStmts) > 0 { + if err := rt.execStatements(s.elseStmts); err != nil { + return err + } + } + case *nextStmt: + return errNextRecord case *exprStmt: if _, err := rt.eval(s.x); err != nil { return err @@ -41,6 +77,26 @@ func (rt *runtime) execStatements(stmts []stmt) error { return nil } +func substrStart(n float64, length int) int { + if n <= 1 || math.IsNaN(n) { + return 0 + } + if n > float64(length) { + return length + } + return int(n) - 1 +} + +func substrEnd(start, length int, count float64) int { + if count <= 0 || math.IsNaN(count) { + return start + } + if count >= float64(length-start) { + return length + } + return start + int(count) +} + func (rt *runtime) printValues(vals []value) error { parts := make([]string, len(vals)) for i, v := range vals { @@ -101,11 +157,68 @@ func (rt *runtime) eval(x expr) (value, error) { return rt.evalAssign(e) case *incDecExpr: return rt.evalIncDec(e) + case *callExpr: + return rt.evalCall(e) default: return value{}, fmt.Errorf("unknown expression") } } +func (rt *runtime) evalCall(e *callExpr) (value, error) { + args := make([]value, 0, len(e.args)) + for _, arg := range e.args { + v, err := rt.eval(arg) + if err != nil { + return value{}, err + } + args = append(args, v) + } + if err := validateBuiltinCallArity(e.name, len(args)); err != nil { + return value{}, err + } + switch e.name { + case "length": + s := rt.field(0).String() + if len(args) == 1 { + s = args[0].String() + } + return numberValue(float64(len([]rune(s)))), nil + case "substr": + s := []rune(args[0].String()) + start := substrStart(args[1].Number(), len(s)) + if start >= len(s) { + return stringValue(""), nil + } + end := len(s) + if len(args) == 3 { + end = substrEnd(start, len(s), args[2].Number()) + } + return stringValue(string(s[start:end])), nil + case "index": + haystack := args[0].String() + needle := args[1].String() + if needle == "" { + return numberValue(1), nil + } + pos := strings.Index(haystack, needle) + if pos < 0 { + return numberValue(0), nil + } + return numberValue(float64(len([]rune(haystack[:pos])) + 1)), nil + case "tolower": + s := args[0].String() + return stringValue(strings.ToLower(s)), nil + case "toupper": + s := args[0].String() + return stringValue(strings.ToUpper(s)), nil + case "int": + v := args[0] + return numberValue(math.Trunc(v.Number())), nil + default: + return value{}, fmt.Errorf("function calls are not supported") + } +} + func (rt *runtime) evalBinary(e *binaryExpr) (value, error) { if e.op == "&&" { left, err := rt.eval(e.left) diff --git a/builtins/awk/lexer.go b/builtins/awk/lexer.go index b9d9c1ef6..614d65488 100644 --- a/builtins/awk/lexer.go +++ b/builtins/awk/lexer.go @@ -322,7 +322,7 @@ func (l *lexer) scanRegex(start int) (token, error) { } func canStartRegex(prev tokenKind, prevLit string) bool { - if prev == tokIdent && prevLit == "print" { + if prev == tokIdent && (prevLit == "print" || prevLit == "printf") { return true } switch prev { diff --git a/builtins/awk/parser.go b/builtins/awk/parser.go index 16aeae1bd..a787c85b7 100644 --- a/builtins/awk/parser.go +++ b/builtins/awk/parser.go @@ -37,10 +37,7 @@ var unsupportedBuiltinFunctions = map[string]struct{}{ "fflush": {}, "gensub": {}, "gsub": {}, - "index": {}, - "int": {}, "isarray": {}, - "length": {}, "log": {}, "lshift": {}, "match": {}, @@ -57,15 +54,21 @@ var unsupportedBuiltinFunctions = map[string]struct{}{ "strftime": {}, "strtonum": {}, "sub": {}, - "substr": {}, "system": {}, "systime": {}, - "tolower": {}, - "toupper": {}, "typeof": {}, "xor": {}, } +var supportedBuiltinFunctions = map[string]struct{}{ + "index": {}, + "int": {}, + "length": {}, + "substr": {}, + "tolower": {}, + "toupper": {}, +} + type parser struct { toks []token pos int @@ -124,6 +127,10 @@ func (p *parser) parseAction() ([]stmt, error) { if !p.match(tokLBrace) { return nil, fmt.Errorf("expected action") } + return p.parseStatementList() +} + +func (p *parser) parseStatementList() ([]stmt, error) { stmts := []stmt{} p.skipSeparators() for !p.at(tokRBrace) { @@ -145,14 +152,21 @@ func (p *parser) parseAction() ([]stmt, error) { } func (p *parser) parseStatement() (stmt, error) { + if p.atIdent("if") { + return p.parseIf() + } + if p.atIdent("next") { + p.advance() + return &nextStmt{}, nil + } if p.atIdent("print") { return p.parsePrint() } if p.atIdent("printf") { - return nil, fmt.Errorf("printf is not supported") + return p.parsePrintf() } if p.atIdent("if") || p.atIdent("while") || p.atIdent("for") || - p.atIdent("next") || p.atIdent("nextfile") || p.atIdent("exit") || + p.atIdent("nextfile") || p.atIdent("exit") || p.atIdent("break") || p.atIdent("continue") { return nil, fmt.Errorf("control flow statements are not supported") } @@ -169,6 +183,52 @@ func (p *parser) parseStatement() (stmt, error) { return &exprStmt{x: x}, nil } +func (p *parser) parseIf() (stmt, error) { + p.advance() + if !p.match(tokLParen) { + return nil, fmt.Errorf("expected ( after if") + } + cond, err := p.parseExpression(0) + if err != nil { + return nil, err + } + if !p.match(tokRParen) { + return nil, fmt.Errorf("expected ) after if condition") + } + thenStmts, err := p.parseStatementGroup() + if err != nil { + return nil, err + } + save := p.pos + p.skipSeparators() + var elseStmts []stmt + if p.atIdent("else") { + p.advance() + elseStmts, err = p.parseStatementGroup() + if err != nil { + return nil, err + } + } else { + p.pos = save + } + return &ifStmt{cond: cond, thenStmts: thenStmts, elseStmts: elseStmts}, nil +} + +func (p *parser) parseStatementGroup() ([]stmt, error) { + p.skipNewlines() + if p.match(tokSemicolon) { + return nil, nil + } + if p.match(tokLBrace) { + return p.parseStatementList() + } + st, err := p.parseStatement() + if err != nil { + return nil, err + } + return []stmt{st}, nil +} + func (p *parser) parsePrint() (stmt, error) { p.advance() ps := &printStmt{} @@ -195,6 +255,53 @@ func (p *parser) parsePrint() (stmt, error) { return ps, nil } +func (p *parser) parsePrintf() (stmt, error) { + p.advance() + ps := &printfStmt{} + parenthesized := p.match(tokLParen) + if parenthesized { + p.skipSeparators() + } + if p.at(tokRBrace) || p.at(tokEOF) || isSeparator(p.cur().kind) || p.at(tokRParen) { + return nil, fmt.Errorf("printf requires a format expression") + } + old := p.stopPrintRedirect + p.stopPrintRedirect = true + defer func() { p.stopPrintRedirect = old }() + for { + x, err := p.parseExpression(0) + if err != nil { + return nil, err + } + ps.args = append(ps.args, x) + if p.at(tokGT) || p.at(tokAppend) || p.at(tokPipe) { + return nil, fmt.Errorf("print redirection and command pipes are not supported") + } + if parenthesized { + p.skipSeparators() + if p.match(tokRParen) { + break + } + if !p.match(tokComma) { + return nil, fmt.Errorf("expected , or ) in printf") + } + p.skipSeparators() + continue + } + if !p.match(tokComma) { + break + } + p.skipSeparators() + } + return ps, nil +} + +func (p *parser) skipNewlines() { + for p.at(tokNewline) { + p.advance() + } +} + func (p *parser) parseExpression(minPrec int) (expr, error) { left, err := p.parsePrefix() if err != nil { @@ -278,12 +385,15 @@ func (p *parser) parsePrefix() (expr, error) { return ®exExpr{pattern: tok.lit}, nil case tokIdent: p.advance() + if p.at(tokLParen) { + return p.parseFunctionCall(tok.lit) + } + if tok.lit == "length" { + return &callExpr{name: tok.lit}, nil + } if err := validateIdentifierReference(tok.lit); err != nil { return nil, err } - if p.at(tokLParen) { - return nil, fmt.Errorf("function calls are not supported") - } if p.at(tokLBracket) { return nil, fmt.Errorf("arrays are not supported") } @@ -325,6 +435,65 @@ func (p *parser) parsePrefix() (expr, error) { } } +func (p *parser) parseFunctionCall(name string) (expr, error) { + if _, ok := supportedBuiltinFunctions[name]; !ok { + if name == "system" { + return nil, fmt.Errorf("system() is not supported") + } + return nil, fmt.Errorf("function calls are not supported") + } + p.advance() + args := []expr{} + p.skipSeparators() + if p.match(tokRParen) { + if err := validateBuiltinCallArity(name, len(args)); err != nil { + return nil, err + } + return &callExpr{name: name}, nil + } + for { + p.skipSeparators() + arg, err := p.parseExpression(0) + if err != nil { + return nil, err + } + args = append(args, arg) + p.skipSeparators() + if p.match(tokRParen) { + break + } + if !p.match(tokComma) { + return nil, fmt.Errorf("expected , or ) in function call") + } + } + if err := validateBuiltinCallArity(name, len(args)); err != nil { + return nil, err + } + return &callExpr{name: name, args: args}, nil +} + +func validateBuiltinCallArity(name string, argc int) error { + switch name { + case "length": + if argc > 1 { + return fmt.Errorf("length expects at most 1 argument") + } + case "substr": + if argc != 2 && argc != 3 { + return fmt.Errorf("substr expects 2 or 3 arguments") + } + case "index": + if argc != 2 { + return fmt.Errorf("index expects 2 arguments") + } + case "tolower", "toupper", "int": + if argc != 1 { + return fmt.Errorf("%s expects 1 argument", name) + } + } + return nil +} + func validateIdentifierReference(name string) error { if msg, ok := unsupportedExpressionKeyword(name); ok { return fmt.Errorf("%s", msg) @@ -332,6 +501,9 @@ func validateIdentifierReference(name string) error { if name == "system" { return fmt.Errorf("system() is not supported") } + if _, ok := supportedBuiltinFunctions[name]; ok { + return fmt.Errorf("function calls are not supported") + } if _, ok := unsupportedBuiltinFunctions[name]; ok { return fmt.Errorf("function calls are not supported") } diff --git a/builtins/awk/parser_test.go b/builtins/awk/parser_test.go index 85cdbe4c9..32a725135 100644 --- a/builtins/awk/parser_test.go +++ b/builtins/awk/parser_test.go @@ -13,7 +13,7 @@ import ( ) func TestParsePracticalAwkProgram(t *testing.T) { - prog, err := parseProgram(`BEGIN { label = "sum=" } $2 > 1 { total += $2; print label total } END { print total }`) + prog, err := parseProgram(`BEGIN { label = "sum=" } $2 > 1 { if ($1 == "skip") next; total += $2; printf "%s%d\n", label, total } END { print length($0), total }`) require.NoError(t, err) require.Len(t, prog.rules, 3) assert.Equal(t, ruleBegin, prog.rules[0].kind) @@ -27,7 +27,6 @@ func TestParseRejectsUnsafeFeatures(t *testing.T) { `{ print $1 > "out" }`, `{ "cmd" | getline }`, `{ $1 = "x" }`, - `{ next; print $1 }`, `{ exit 1 }`, } { _, err := parseProgram(src) diff --git a/builtins/awk/printf.go b/builtins/awk/printf.go new file mode 100644 index 000000000..86fb0b320 --- /dev/null +++ b/builtins/awk/printf.go @@ -0,0 +1,175 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package awk + +import ( + "fmt" + "math" + "math/big" + "strings" + "unicode/utf8" +) + +const ( + MaxPrintfWidth = 1 << 20 + MaxPrintfPrecision = 1 << 20 + MaxPrintfOutput = 1 << 20 + + minInt64Float = -9223372036854775808.0 + maxInt64ExclusiveFloat = 9223372036854775808.0 + maxUint64Exclusive = 18446744073709551616.0 +) + +func formatPrintf(format string, args []value) (string, error) { + var b strings.Builder + arg := 0 + for i := 0; i < len(format); i++ { + if format[i] != '%' { + if err := appendPrintfByte(&b, format[i]); err != nil { + return "", err + } + continue + } + start := i + i++ + if i >= len(format) { + return "", fmt.Errorf("unterminated printf format") + } + if format[i] == '%' { + if err := appendPrintfByte(&b, '%'); err != nil { + return "", err + } + continue + } + for i < len(format) && strings.ContainsRune("-+ #0", rune(format[i])) { + i++ + } + if err := consumePrintfBound(format, &i, MaxPrintfWidth, "width"); err != nil { + return "", err + } + if i < len(format) && format[i] == '.' { + i++ + if err := consumePrintfBound(format, &i, MaxPrintfPrecision, "precision"); err != nil { + return "", err + } + } + if i >= len(format) { + return "", fmt.Errorf("unterminated printf format") + } + verb := format[i] + if verb == '*' { + return "", fmt.Errorf("dynamic printf width is not supported") + } + spec := format[start : i+1] + if arg >= len(args) { + return "", fmt.Errorf("not enough arguments for printf") + } + v := args[arg] + arg++ + var out string + switch verb { + case 's': + out = fmt.Sprintf(spec, v.String()) + case 'd', 'i': + if verb == 'i' { + spec = spec[:len(spec)-1] + "d" + } + out = fmt.Sprintf(spec, printfSigned(v)) + case 'u': + spec = spec[:len(spec)-1] + "d" + out = fmt.Sprintf(spec, printfUnsigned(v)) + case 'o', 'x', 'X': + out = fmt.Sprintf(spec, printfUnsigned(v)) + case 'e', 'E', 'f', 'F', 'g', 'G': + out = fmt.Sprintf(spec, v.Number()) + case 'c': + out = fmt.Sprintf(spec, printfRune(v)) + default: + return "", fmt.Errorf("unsupported printf format %%%c", verb) + } + if err := appendPrintfString(&b, out); err != nil { + return "", err + } + } + return b.String(), nil +} + +func appendPrintfByte(b *strings.Builder, c byte) error { + if b.Len() >= MaxPrintfOutput { + return fmt.Errorf("printf output exceeds %d bytes", MaxPrintfOutput) + } + b.WriteByte(c) + return nil +} + +func appendPrintfString(b *strings.Builder, s string) error { + if len(s) > MaxPrintfOutput-b.Len() { + return fmt.Errorf("printf output exceeds %d bytes", MaxPrintfOutput) + } + b.WriteString(s) + return nil +} + +func consumePrintfBound(format string, idx *int, max int, name string) error { + n := 0 + for *idx < len(format) && format[*idx] >= '0' && format[*idx] <= '9' { + digit := int(format[*idx] - '0') + if n > (max-digit)/10 { + return fmt.Errorf("printf %s exceeds %d", name, max) + } + n = n*10 + digit + (*idx)++ + } + if n > max { + return fmt.Errorf("printf %s exceeds %d", name, max) + } + return nil +} + +func printfSigned(v value) any { + n := v.Number() + if n >= minInt64Float && n < maxInt64ExclusiveFloat { + return int64(n) + } + return printfBigInt(n) +} + +func printfUnsigned(v value) any { + n := v.Number() + if n >= 0 && n < maxUint64Exclusive { + return uint64(n) + } + if n >= minInt64Float && n < 0 { + return uint64(int64(n)) + } + return printfBigInt(n) +} + +func printfBigInt(n float64) *big.Int { + if math.IsNaN(n) { + return big.NewInt(0) + } + if math.IsInf(n, 1) { + return new(big.Int).SetUint64(^uint64(0)) + } + if math.IsInf(n, -1) { + return big.NewInt(-9223372036854775807 - 1) + } + f := new(big.Float).SetPrec(64).SetFloat64(n) + i, _ := f.Int(nil) + if i == nil { + return big.NewInt(0) + } + return i +} + +func printfRune(v value) rune { + if v.kind == valueString && v.s != "" { + r, _ := utf8.DecodeRuneInString(v.s) + return r + } + return rune(int64(v.Number())) +} diff --git a/builtins/awk/runtime.go b/builtins/awk/runtime.go index 250c48906..9b399772c 100644 --- a/builtins/awk/runtime.go +++ b/builtins/awk/runtime.go @@ -8,6 +8,7 @@ package awk import ( "bufio" "context" + "errors" "fmt" "io" "os" @@ -303,6 +304,9 @@ func (rt *runtime) runFile(ctx context.Context, file string) error { rt.nr++ rt.fnr++ if err := rt.runRules(ctx, ruleNormal); err != nil { + if errors.Is(err, errNextRecord) { + continue + } return err } } @@ -365,6 +369,12 @@ func (rt *runtime) runRules(ctx context.Context, kind ruleKind) error { continue } if err := rt.execStatements(r.action); err != nil { + if errors.Is(err, errNextRecord) { + if kind == ruleNormal { + return err + } + return fmt.Errorf("next is not allowed in BEGIN or END") + } return err } } diff --git a/builtins/tests/awk/awk_test.go b/builtins/tests/awk/awk_test.go index 68b5cd88a..5dc156bc7 100644 --- a/builtins/tests/awk/awk_test.go +++ b/builtins/tests/awk/awk_test.go @@ -228,6 +228,25 @@ func TestAwkIntegerNumberFormatting(t *testing.T) { assert.Equal(t, "999999 1000000 123456789 1e+06\n0 0\n", stdout) } +func TestAwkIfNextPrintfAndScalarBuiltins(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "a 1\nb 22\nskip 5\n") + stdout, stderr, code := cmdRun(t, `awk '{ if ($1 == "skip") next; if ($2 > 9) { printf "%s:%03d:%u\n", toupper($1), $2, 42 } else printf "small:%s:%d:%d:%d:%d:%s\n", tolower($1), int($2 + .9), length, index($0, $2), index($0, ""), substr($0, 2, 2) }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "small:a:1:3:3:1: 1\nB:022:42\n", stdout) + + stdout, stderr, code = cmdRun(t, "awk '{ if ($1 == \"skip\")\nnext\nelse\nprintf \"%s:%x\\n\", $1, -1 }' input.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "a:ffffffffffffffff\nb:ffffffffffffffff\n", stdout) + + stdout, stderr, code = cmdRun(t, `awk 'BEGIN { printf "%d|%u|%x|%o\n", 18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615 }'`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "18446744073709551616|18446744073709551616|10000000000000000|2000000000000000000000\n", stdout) +} + func TestAwkBeginOnlySkipsInputFiles(t *testing.T) { dir := t.TempDir() stdout, stderr, code := cmdRun(t, `awk 'BEGIN { print "x" }' missing.txt`, dir) @@ -305,14 +324,22 @@ func TestAwkRejectsNaNAndInfNumericStrings(t *testing.T) { func TestAwkRejectsUnsafeFeatures(t *testing.T) { dir := t.TempDir() writeFile(t, dir, "input.txt", "a b\n") + writeFile(t, dir, "empty.txt", "") for _, script := range []string{ `awk '{ system("sh") }' input.txt`, `awk '{ print $1 > "out" }' input.txt`, + `awk '{ printf "%s", $1 > "out" }' input.txt`, `awk '{ $1 = "x" }' input.txt`, - `awk '{ next; print $1 }' input.txt`, `awk '{ print getline }' input.txt`, `awk '{ x = next }' input.txt`, `awk '{ exit 0 }' input.txt`, + `awk 'BEGIN { next }' input.txt`, + `awk 'BEGIN { print tolower(), toupper(), int() }' input.txt`, + `awk '{ print int() }' empty.txt`, + `awk '$1 == "missing" { print length(1, 2) }' input.txt`, + `awk 'BEGIN { printf "%1000000000s", "x" }' input.txt`, + `awk 'BEGIN { printf "%.1000000000s", "x" }' input.txt`, + `awk 'BEGIN { printf "%1048576s%1048576s", "x", "y" }' input.txt`, `awk 'BEGIN { BEGIN=1; print BEGIN }' input.txt`, `awk 'BEGIN { END=1; print END }' input.txt`, `awk '{ print $BEGIN }' input.txt`, @@ -333,7 +360,7 @@ func TestAwkRejectsUnsafeFeatures(t *testing.T) { func TestAwkRejectsUnsupportedBuiltinWithoutParens(t *testing.T) { dir := t.TempDir() writeFile(t, dir, "input.txt", "abc\n") - _, stderr, code := cmdRun(t, `awk '{ print length }' input.txt`, dir) + _, stderr, code := cmdRun(t, `awk '{ print split }' input.txt`, dir) assert.Equal(t, 1, code) assert.Contains(t, stderr, "awk: function calls are not supported") } diff --git a/docs/AWK_IMPLEMENTATION_PLAN.md b/docs/AWK_IMPLEMENTATION_PLAN.md index 2a13a9591..349daeef1 100644 --- a/docs/AWK_IMPLEMENTATION_PLAN.md +++ b/docs/AWK_IMPLEMENTATION_PLAN.md @@ -350,16 +350,21 @@ RSHELL_BASH_TEST=1 go test ./tests/ -run TestShellScenariosAgainstBash -timeout ## Later Phases -Phase 2 candidates: +Phase 2 started scope: - `printf` - `if` - `next` +- common scalar builtins: `length`, `substr`, `index`, `tolower`, `toupper`, + `int` + +Remaining Phase 2 candidates: + - range patterns - regex `FS` - field assignment and `$0` rebuilding -- common string builtins: `length`, `substr`, `index`, `split`, `tolower`, - `toupper`, `int` +- `split`, once array support is available or a narrow safe representation is + chosen Phase 3 candidates: diff --git a/tests/awk_scenarios/enabled.txt b/tests/awk_scenarios/enabled.txt index d237b1777..32f784a2e 100644 --- a/tests/awk_scenarios/enabled.txt +++ b/tests/awk_scenarios/enabled.txt @@ -1,22 +1,34 @@ gawk/basic/begin_end_records.yaml gawk/basic/field_separator.yaml +gawk/control/if_else.yaml gawk/expressions/appended_numeric_string_reconverts.yaml +gawk/expressions/arithmetic_comparison.yaml gawk/expressions/concat_literal_punctuation.yaml gawk/expressions/leading_digit_exponent_fragment.yaml +gawk/expressions/negative_fraction_integer_format.yaml gawk/expressions/nondecimal_string_parameter.yaml gawk/expressions/numeric_string_division.yaml +gawk/expressions/numeric_substr_padding.yaml +gawk/expressions/string_concatenation.yaml gawk/expressions/string_constant_numeric_comparison.yaml gawk/expressions/string_field_number_reference.yaml gawk/expressions/unary_minus_string_operand.yaml gawk/expressions/unary_plus_preserves_decimal_string_value.yaml gawk/fields/numeric_field_terminator.yaml +gawk/functions/printf_width_precision_mix.yaml +gawk/functions/string_core.yaml gawk/input/no_trailing_newline_regex.yaml gawk/input/nr_concat_builtin_records.yaml gawk/misc/begin_print_hello.yaml gawk/misc/last_field_concat_once.yaml gawk/misc/nested_self_compound_assignment.yaml +gawk/misc/printf_plus_flag_decimal.yaml gawk/output/hex_input_numeric_conversion.yaml +gawk/output/integer_precision_padding.yaml gawk/output/print_separators.yaml +gawk/output/printf_format.yaml +gawk/output/printf_zero_precision_hex_resets_alternate.yaml +gawk/output/zero_flag_ignored_with_integer_precision.yaml gawk/records/fs_single_backslash.yaml gawk/regex/dfa_nested_closure_alternation.yaml gawk/regex/escaped_left_brace_literal.yaml @@ -29,7 +41,9 @@ onetrueawk/basic/record_counter_nr.yaml onetrueawk/core/custom_ors_without_final_newline.yaml onetrueawk/core/end_record_count.yaml onetrueawk/core/field_reference_order.yaml +onetrueawk/core/if_truthy_fields.yaml onetrueawk/core/inline_comments_inside_action.yaml +onetrueawk/core/next_skips_later_action.yaml onetrueawk/core/not_operator_patterns.yaml onetrueawk/core/numeric_field_comparison_pattern.yaml onetrueawk/core/numeric_literal_regex_pattern.yaml @@ -41,19 +55,24 @@ onetrueawk/core/regex_match_operator.yaml onetrueawk/core/running_sum_and_final_total.yaml onetrueawk/core/tt01_print_records.yaml onetrueawk/core/tt02_nr_nf_record.yaml +onetrueawk/core/tt03_sum_second_field_lengths.yaml onetrueawk/core/tt07_even_field_count_pattern.yaml +onetrueawk/core/tt08_even_record_length_pattern.yaml onetrueawk/core/tt09_empty_record_pattern.yaml onetrueawk/core/tt10_nonempty_end_pattern.yaml +onetrueawk/core/tt11_fixed_substr.yaml onetrueawk/core/uninitialized_concat_prefix.yaml onetrueawk/expressions/number_string_conversion.yaml onetrueawk/expressions/numeric_string_exclusions.yaml onetrueawk/expressions/string_range_comparisons.yaml +onetrueawk/expressions/uninitialized_numeric_coercion.yaml onetrueawk/fields/colon_field_separator.yaml onetrueawk/fields/field_regex_condition.yaml onetrueawk/fixtures/t_1_x_concatenated_assignment.yaml onetrueawk/fixtures/t_4_x_parenthesized_field_reference.yaml onetrueawk/fixtures/t_6_x_nf_and_record_printing.yaml onetrueawk/fixtures/t_d_x_colon_separator_nf.yaml +onetrueawk/fixtures/t_longstr_literal_preserved.yaml onetrueawk/fixtures/t_monotone_optional_regex_chain.yaml onetrueawk/fixtures/t_quote_field_with_literal_quotes.yaml onetrueawk/fixtures/t_sep_digit_field_separator.yaml @@ -63,13 +82,18 @@ onetrueawk/fixtures/t_vf_dynamic_field_read.yaml onetrueawk/fixtures/t_x_regex_default_print.yaml onetrueawk/fixtures/tt_03a_third_field_sum.yaml onetrueawk/fixtures/tt_10a_dynamic_dot_end_regex.yaml +onetrueawk/functions/index_substring_positions.yaml +onetrueawk/functions/substr_pattern_filters.yaml onetrueawk/output/custom_ofs.yaml onetrueawk/output/ofs_ors_print.yaml +onetrueawk/output/printf_numeric_formats.yaml onetrueawk/programs/constant_string_concatenation.yaml onetrueawk/programs/expression_result_numeric_conversion.yaml onetrueawk/programs/p01_print_records.yaml onetrueawk/programs/p02_print_selected_fields.yaml +onetrueawk/programs/p03_printf_columns.yaml onetrueawk/programs/p04_record_numbers.yaml +onetrueawk/programs/p05_formatted_table.yaml onetrueawk/programs/p06_end_record_count.yaml onetrueawk/programs/p07_numeric_pattern_default_print.yaml onetrueawk/programs/p08_field_equality_action.yaml @@ -88,14 +112,21 @@ onetrueawk/programs/p20_compound_condition.yaml onetrueawk/programs/p21_field_or_continent.yaml onetrueawk/programs/p21a_record_regex_or.yaml onetrueawk/programs/p22_anchored_alternation_field_regex.yaml +onetrueawk/programs/p25_ratio_printf.yaml onetrueawk/programs/p26_accumulate_asia_long_assignment.yaml onetrueawk/programs/p26a_accumulate_asia_compound_assignment.yaml onetrueawk/programs/p27_maximum_numeric_field.yaml onetrueawk/programs/p28_nr_colon_record_concat.yaml +onetrueawk/programs/p30_length_builtin_current_record.yaml +onetrueawk/programs/p31_longest_first_field.yaml +onetrueawk/programs/p33_concatenate_substrings_end.yaml onetrueawk/programs/p37_concatenated_field_equality.yaml +onetrueawk/programs/p38_block_if_maximum.yaml onetrueawk/programs/p45_ofs_ors_print.yaml onetrueawk/programs/p46_adjacent_field_concatenation.yaml +onetrueawk/programs/p5a_tabular_header_printf.yaml onetrueawk/programs/regular_expression_operator_matrix.yaml +onetrueawk/records/longest_record.yaml onetrueawk/records/modulo_pattern_default_print.yaml onetrueawk/records/sum_count_average.yaml onetrueawk/regex/compound_pattern_conditions.yaml diff --git a/tests/scenarios/cmd/awk/basic/if_multiline_body.yaml b/tests/scenarios/cmd/awk/basic/if_multiline_body.yaml new file mode 100644 index 000000000..90b39e67d --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/if_multiline_body.yaml @@ -0,0 +1,24 @@ +description: awk accepts newlines before if and else branch bodies. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + a + skip + b +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ + if ($1 == "skip") + next + else + print $1 + }' input.txt +expect: + stdout: |+ + a + b + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/if_next_printf_builtins.yaml b/tests/scenarios/cmd/awk/basic/if_next_printf_builtins.yaml new file mode 100644 index 000000000..91e2f8b26 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/if_next_printf_builtins.yaml @@ -0,0 +1,19 @@ +description: awk supports if, next, printf, and scalar string builtins. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + a 1 + b 22 + skip 5 +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ if ($1 == "skip") next; if ($2 > 9) { printf "%s:%03d:%u\n", toupper($1), $2, 42 } else printf "small:%s:%d:%d:%d:%d:%s\n", tolower($1), int($2 + .9), length, index($0, $2), index($0, ""), substr($0, 2, 2) }' input.txt +expect: + stdout: |+ + small:a:1:3:3:1: 1 + B:022:42 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/printf_parentheses_empty_if.yaml b/tests/scenarios/cmd/awk/basic/printf_parentheses_empty_if.yaml new file mode 100644 index 000000000..39b67240e --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/printf_parentheses_empty_if.yaml @@ -0,0 +1,17 @@ +description: awk supports parenthesized printf args, empty if bodies, and oversized substr lengths. +oracle: gawk +input: + script: |+ + awk 'BEGIN { + if (0); + print "x" + printf("%s:%d\n", "y", 2) + print substr("abc", 1, 1e20) + }' +expect: + stdout: |+ + x + y:2 + abc + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/printf_regex_literal.yaml b/tests/scenarios/cmd/awk/basic/printf_regex_literal.yaml new file mode 100644 index 000000000..4716e7d5a --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/printf_regex_literal.yaml @@ -0,0 +1,11 @@ +description: awk printf accepts a regex literal as the format expression. +oracle: gawk +input: + script: |+ + printf 'foo\nbar\n' | awk '{ printf /foo/; printf "\n" }' +expect: + stdout: |+ + 1 + 0 + stderr: |+ + exit_code: 0