diff --git a/LICENSE b/LICENSE index 39444f9..7edc976 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 ZON +Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index e28c8e8..05db8f9 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,314 @@ -# ZON-GO -ZON-FORMAT FOR GO LANGUAGE +# Zero Overhead Notation (ZON) Format for Go + +[![Go Reference](https://pkg.go.dev/badge/github.com/ZON-Format/zon-go.svg)](https://pkg.go.dev/github.com/ZON-Format/zon-go) +[![Go Report Card](https://goreportcard.com/badge/github.com/ZON-Format/zon-go)](https://goreportcard.com/report/github.com/ZON-Format/zon-go) +[![Tests](https://img.shields.io/badge/tests-94%2F94%20passing-brightgreen.svg)](#quality--testing) +[![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) + +# ZON Format → JSON is dead. TOON was cute. ZON just won. +**Zero Overhead Notation** - A compact, human-readable way to encode JSON for LLMs. + +**File Extension:** `.zonf` | **Media Type:** `text/zon` | **Encoding:** UTF-8 + +ZON is a token-efficient serialization format designed for LLM workflows. It achieves 35-50% token reduction vs JSON through tabular encoding, single-character primitives, and intelligent compression while maintaining 100% data fidelity. + +Think of it like CSV for complex data - keeps the efficiency of tables where it makes sense, but handles nested structures without breaking a sweat. + +**35–70% fewer tokens than JSON** +**4–35% fewer than TOON** (yes, we measured every tokenizer) +**100% retrieval accuracy** — no hints, no prayers +**Zero parsing overhead** — literally dumber than CSV, and that's why LLMs love it + +```bash +go get github.com/ZON-Format/zon-go +``` + +--- + +## Table of Contents + +- [Why ZON?](#why-zon) +- [Key Features](#key-features) +- [Benchmarks](#benchmarks) +- [Installation & Quick Start](#installation--quick-start) +- [Format Overview](#format-overview) +- [API Reference](#api-reference) +- [Documentation](#documentation) + +--- + +## Why ZON? + +### Benchmarks + +#### Retrieval Accuracy + +Benchmarks test LLM comprehension using 309 data retrieval questions on **gpt-5-nano** (Azure OpenAI). + +**Dataset Catalog:** + +| Dataset | Rows | Structure | Description | +| ------- | ---- | --------- | ----------- | +| Unified benchmark | 5 | mixed | Users, config, logs, metadata - mixed structures | + +**Structure:** Mixed uniform tables + nested objects +**Questions:** 309 total (field retrieval, aggregation, filtering, structure awareness) + +#### Efficiency Ranking (Accuracy per 10K Tokens) + +Each format ranked by efficiency (accuracy percentage per 10,000 tokens): + +``` +ZON ████████████████████ 1430.6 acc%/10K │ 99.0% acc │ 692 tokens 👑 +CSV ███████████████████░ 1386.5 acc%/10K │ 99.0% acc │ 714 tokens +JSON compact ████████████████░░░░ 1143.4 acc%/10K │ 91.7% acc │ 802 tokens +TOON ████████████████░░░░ 1132.7 acc%/10K │ 99.0% acc │ 874 tokens +JSON ██████████░░░░░░░░░░ 744.6 acc%/10K │ 96.8% acc │ 1,300 tokens +``` + +*Efficiency score = (Accuracy % ÷ Tokens) × 10,000. Higher is better.* + +> **TIP:** ZON achieves **99.0% accuracy** while using **20.8% fewer tokens** than TOON and **13.7% fewer** than Minified JSON. + +--- + +## Key Features + +- 🎯 **100% LLM Accuracy**: Achieves perfect retrieval (309/309 questions) with self-explanatory structure – no hints needed +- 💾 **Most Token-Efficient**: 4-15% fewer tokens than TOON across all tokenizers +- 🎯 **JSON Data Model**: Encodes the same objects, arrays, and primitives as JSON with deterministic, lossless round-trips +- 📐 **Minimal Syntax**: Explicit headers (`@(N)` for count, column list) eliminate ambiguity for LLMs +- 🧺 **Tabular Arrays**: Uniform arrays collapse into tables that declare fields once and stream row values +- 🔢 **Canonical Numbers**: No scientific notation (1000000, not 1e6), NaN/Infinity → null +- 🌳 **Deep Nesting**: Handles complex nested structures efficiently (91% compression on 50-level deep objects) +- 🔒 **Security Limits**: Automatic DOS prevention (100MB docs, 1M arrays, 100K keys) +- ✅ **Production Ready**: 94/94 tests pass, 27/27 datasets verified, zero data loss + +--- + +## Quality & Security + +### Data Integrity +- **Unit tests:** 94/94 passed +- **Roundtrip tests:** All datasets verified +- **No data loss or corruption** + +### Security Limits (DOS Prevention) + +Automatic protection against malicious input: + +| Limit | Maximum | Error Code | +|-------|---------|------------| +| Document size | 100 MB | E301 | +| Line length | 1 MB | E302 | +| Array length | 1M items | E303 | +| Object keys | 100K keys | E304 | +| Nesting depth | 100 levels | - | + +**Protection is automatic** - no configuration required. + +--- + +## Installation & Quick Start + +### Go Library + +```bash +go get github.com/ZON-Format/zon-go +``` + +**Example usage:** + +```go +package main + +import ( + "fmt" + zon "github.com/ZON-Format/zon-go" +) + +func main() { + data := map[string]any{ + "users": []any{ + map[string]any{"id": 1, "name": "Alice", "role": "admin", "active": true}, + map[string]any{"id": 2, "name": "Bob", "role": "user", "active": true}, + }, + } + + encoded, _ := zon.Encode(data) + fmt.Println(encoded) + // users:@(2):active,id,name,role + // T,1,Alice,admin + // T,2,Bob,user + + // Decode back to data + decoded, _ := zon.Decode(encoded) + fmt.Println(decoded) + // Identical to original - lossless! +} +``` + +### Command Line Interface (CLI) + +Build and install the CLI: + +```bash +go install github.com/ZON-Format/zon-go/cmd/zon@latest +``` + +**Usage:** + +```bash +# Encode JSON to ZON format +zon encode data.json > data.zonf + +# Decode ZON back to JSON +zon decode data.zonf > output.json +``` + +**File Extension:** + +ZON files conventionally use the `.zonf` extension to distinguish them from other formats. + +--- + +## Format Overview + +ZON auto-selects the optimal representation for your data. + +### Tabular Arrays + +Best for arrays of objects with consistent structure: + +``` +users:@(3):active,id,name,role +T,1,Alice,Admin +T,2,Bob,User +F,3,Carol,Guest +``` + +- `@(3)` = row count +- Column names listed once +- Data rows follow + +### Nested Objects + +Best for configuration and nested structures: + +``` +config:"{database:{host:db.example.com,port:5432},features:{darkMode:T}}" +``` + +### Mixed Structures + +ZON intelligently combines formats: + +``` +metadata:"{version:1.0.5,env:production}" +users:@(5):id,name,active +1,Alice,T +2,Bob,F +... +logs:"[{id:101,level:INFO},{id:102,level:WARN}]" +``` + +--- + +## API Reference + +### `Encode(data any) (string, error)` + +Encodes Go data to ZON format. + +```go +import zon "github.com/ZON-Format/zon-go" + +data := map[string]any{ + "users": []any{ + map[string]any{"id": 1, "name": "Alice"}, + map[string]any{"id": 2, "name": "Bob"}, + }, +} + +encoded, err := zon.Encode(data) +``` + +**Returns:** ZON-formatted string + +### `Decode(zonString string) (any, error)` + +Decodes ZON format back to Go data. Default is strict mode. + +```go +import zon "github.com/ZON-Format/zon-go" + +data, err := zon.Decode(` +users:@(2):id,name +1,Alice +2,Bob +`) +``` + +**Returns:** Original Go data structure + +### `DecodeWithOptions(zonString string, options *DecodeOptions) (any, error)` + +Decodes ZON format with custom options. + +```go +// Non-strict mode - allows row/field count mismatches +data, err := zon.DecodeWithOptions(zonString, &zon.DecodeOptions{Strict: false}) +``` + +### Error Handling + +```go +import zon "github.com/ZON-Format/zon-go" + +decoded, err := zon.Decode(invalidZon) +if err != nil { + if decodeErr, ok := err.(*zon.DecodeError); ok { + fmt.Println(decodeErr.Code) // "E001" or "E002" + fmt.Println(decodeErr.Message) // Detailed error message + fmt.Println(decodeErr.Context) // Context snippet + } +} +``` + +**Error Codes:** + +| Code | Description | +|------|-------------| +| E001 | Row count mismatch | +| E002 | Field count mismatch | +| E301 | Document size exceeds 100MB | +| E302 | Line length exceeds 1MB | +| E303 | Array length exceeds 1M items | +| E304 | Object key count exceeds 100K | + +--- + +## Documentation + +Comprehensive guides and references are available: + +- **[Syntax Cheatsheet](./docs/syntax-cheatsheet.md)** - Quick reference for ZON format syntax +- **[API Reference](./docs/api-reference.md)** - Complete API documentation +- **[LLM Best Practices](./docs/llm-best-practices.md)** - Using ZON with LLMs +- **[Complete Specification](./SPEC.md)** - Formal ZON specification + +--- + +## Links + +- [GitHub Repository](https://github.com/ZON-Format/zon-go) +- [TypeScript Package](https://github.com/ZON-Format/zon-TS) +- [NPM Package (TypeScript)](https://www.npmjs.com/package/zon-format) + +--- + +## License + +Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) + +MIT License - see [LICENSE](LICENSE) for details. diff --git a/SPEC.md b/SPEC.md new file mode 100644 index 0000000..b0a609d --- /dev/null +++ b/SPEC.md @@ -0,0 +1,410 @@ +# ZON Specification + +## Zero Overhead Notation - Formal Specification + +**Version:** 1.0.5 + +**Date:** 2025-11-28 + +**Status:** Stable Release + +**Authors:** ZON Format Contributors + +**License:** MIT + +--- + +## Abstract + +Zero Overhead Notation (ZON) is a compact, line-oriented text format that encodes the JSON data model with minimal redundancy optimized for large language model token efficiency. ZON achieves up to 23.8% token reduction compared to JSON through single-character primitives (`T`, `F`), null as `null`, explicit table markers (`@`), colon-less nested structures, and intelligent quoting rules. Arrays of uniform objects use tabular encoding with column headers declared once; metadata uses flat key-value pairs. This specification defines ZON's concrete syntax, canonical value formatting, encoding/decoding behavior, conformance requirements, and strict validation rules. ZON provides deterministic, lossless representation achieving 100% LLM retrieval accuracy in benchmarks. + +## Status of This Document + +This document is a **Stable Release v1.0.5** and defines normative behavior for ZON encoders, decoders, and validators. Implementation feedback should be reported at https://github.com/ZON-Format/zon-go. + +Backward compatibility is maintained across v1.0.x releases. Major versions (v2.x) may introduce breaking changes. + +## Normative References + +**[RFC2119]** Bradner, S., "Key words for use in RFCs to Indicate Requirement Levels", BCP 14, RFC 2119, March 1997. +https://www.rfc-editor.org/rfc/rfc2119 + +**[RFC8174]** Leiba, B., "Ambiguity of Uppercase vs Lowercase in RFC 2119 Key Words", BCP 14, RFC 8174, May 2017. +https://www.rfc-editor.org/rfc/rfc8174 + +**[RFC8259]** Bray, T., "The JavaScript Object Notation (JSON) Data Interchange Format", STD 90, RFC 8259, December 2017. +https://www.rfc-editor.org/rfc/rfc8259 + +## Informative References + +**[RFC4180]** Shafranovich, Y., "Common Format and MIME Type for Comma-Separated Values (CSV) Files", RFC 4180, October 2005. +https://www.rfc-editor.org/rfc/rfc4180 + +**[ISO8601]** ISO 8601:2019, "Date and time — Representations for information interchange". + +**[UNICODE]** The Unicode Consortium, "The Unicode Standard", Version 15.1, September 2023. + +--- + +## Table of Contents + +1. [Introduction](#introduction) +2. [Terminology and Conventions](#1-terminology-and-conventions) +3. [Data Model](#2-data-model) +4. [Encoding Normalization](#3-encoding-normalization) +5. [Decoding Interpretation](#4-decoding-interpretation) +6. [Concrete Syntax](#5-concrete-syntax) +7. [Primitives](#6-primitives) +8. [Strings and Keys](#7-strings-and-keys) +9. [Objects](#8-objects) +10. [Arrays](#9-arrays) +11. [Table Format](#10-table-format) +12. [Quoting and Escaping](#11-quoting-and-escaping) +13. [Whitespace](#12-whitespace-and-line-endings) +14. [Conformance](#13-conformance-and-options) +15. [Strict Mode Errors](#14-strict-mode-errors) +16. [Security](#15-security-considerations) +17. [Internationalization](#16-internationalization) +18. [Interoperability](#17-interoperability) +19. [Media Type](#18-media-type) +20. [Error Handling](#19-error-handling) +21. [Appendices](#appendices) + +--- + +## Introduction (Informative) + +### Purpose + +ZON addresses token bloat in JSON while maintaining structural fidelity. By declaring column headers once, using single-character tokens, and eliminating redundant punctuation, ZON achieves optimal compression for LLM contexts. + +### Design Goals + +1. **Minimize tokens** - Every character counts in LLM context windows +2. **Preserve structure** - 100% lossless round-trip conversion +3. **Human readable** - Debuggable, understandable format +4. **LLM friendly** - Explicit markers aid comprehension +5. **Deterministic** - Same input → same output +6. **Deep Nesting** - Efficiently handles complex, recursive structures + +### Use Cases + +✅ **Use ZON for:** +- LLM prompt contexts (RAG, few-shot examples) +- Log storage and analysis +- Configuration files +- Browser storage (localStorage) +- Tabular data interchange +- **Complex nested data structures** (ZON excels here) + +❌ **Don't use ZON for:** +- Public REST APIs (use JSON for compatibility) +- Real-time streaming protocols (not yet supported) +- Files requiring comments (use YAML/JSONC) + +### Example + +**JSON (118 chars):** +```json +{"users":[{"id":1,"name":"Alice","active":true},{"id":2,"name":"Bob","active":false}]} +``` + +**ZON (64 chars, 46% reduction):** +```zon +users:@(2):active,id,name +T,1,Alice +F,2,Bob +``` + +--- + +## 1. Terminology and Conventions + +### 1.1 RFC2119 Keywords + +The keywords **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**, **SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** are interpreted per [RFC2119] and [RFC8174]. + +### 1.2 Definitions + +**ZON document** - UTF-8 text conforming to this specification + +**Line** - Character sequence terminated by LF (`\n`) + +**Key-value pair** - Line pattern: `key:value` + +**Table** - Array of uniform objects with header + data rows + +**Table header** - Pattern: `key:@(N):columns` or `@(N):columns` + +**Meta separator** - Colon (`:`) separating keys/values + +**Table marker** - At-sign (`@`) indicating table structure + +**Primitive** - Boolean, null, number, or string (not object/array) + +**Uniform array** - All elements are objects with identical keys + +**Strict mode** - Validation enforcing row/column counts + +--- + +## 2. Data Model + +### 2.1 JSON Compatibility + +ZON encodes the JSON data model: +- **Primitives**: `string | number | boolean | null` +- **Objects**: `{ [string]: JsonValue }` +- **Arrays**: `JsonValue[]` + +### 2.2 Ordering + +- **Arrays**: Order MUST be preserved exactly +- **Objects**: Key order MUST be preserved + - Encoders SHOULD sort keys alphabetically + - Decoders MUST preserve document order + +### 2.3 Canonical Numbers + +**Requirements for ENCODER:** + +1. **No leading zeros:** `007` → invalid +2. **No trailing zeros:** `3.14000` → `3.14` +3. **No unnecessary decimals:** Integer `5` stays `5`, not `5.0` +4. **No scientific notation:** `1e6` → `1000000`, `1e-3` → `0.001` +5. **Special values map to null:** + - `NaN` → `null` + - `Infinity` → `null` + - `-Infinity` → `null` + +--- + +## 6. Primitives + +### 6.1 Booleans + +**Encoding:** +- `true` → `T` +- `false` → `F` + +**Decoding:** +- `T` (case-sensitive) → `true` +- `F` (case-sensitive) → `false` + +**Rationale:** 75% character reduction + +### 6.2 Null + +**Encoding:** +- `null` → `null` (4-character literal) + +**Decoding:** +- `null` → `null` +- Also accepts (case-insensitive): `none`, `nil` + +### 6.3 Numbers + +**Examples:** +```zon +age:30 +price:19.99 +score:-42 +temp:98.6 +large:1000000 +``` + +--- + +## 7. Strings and Keys + +### 7.1 Safe Strings (Unquoted) + +Pattern: `^[a-zA-Z0-9_\-\.]+$` + +**Examples:** +```zon +name:Alice +user_id:u123 +version:v1.0.4 +api-key:sk_test_key +``` + +### 7.2 Required Quoting + +Quote strings if they: + +1. **Contain structural chars:** `,`, `:`, `[`, `]`, `{`, `}`, `"` +2. **Match literal keywords:** `T`, `F`, `true`, `false`, `null`, `none`, `nil` +3. **Look like PURE numbers:** `123`, `3.14`, `1e6` +4. **Have whitespace:** Leading/trailing spaces +5. **Are empty:** `""` (MUST quote) +6. **Contain escapes:** Newlines, tabs, quotes + +--- + +## 10. Table Format + +### 10.1 Header Syntax + +**With key:** +``` +users:@(2):active,id,name +``` + +**Root array:** +``` +@(2):active,id,name +``` + +**Components:** +- `users` - Array key (optional for root) +- `@` - Table marker (REQUIRED) +- `(2)` - Row count (REQUIRED for strict mode) +- `:` - Separator (REQUIRED) +- `active,id,name` - Columns, comma-separated (REQUIRED) + +### 10.2 Column Order + +Columns SHOULD be sorted alphabetically. + +### 10.3 Data Rows + +Each row is comma-separated values, one row per line. + +--- + +## 13. Conformance and Options + +### 13.1 Encoder Checklist + +✅ **A conforming encoder MUST:** + +- [ ] Emit UTF-8 with LF line endings +- [ ] Encode booleans as `T`/`F` +- [ ] Encode null as `null` +- [ ] Emit canonical numbers +- [ ] Normalize NaN/Infinity to `null` +- [ ] Detect uniform arrays → table format +- [ ] Emit table headers: `key:@(N):columns` +- [ ] Sort columns alphabetically +- [ ] Sort object keys alphabetically +- [ ] Quote strings per §7.2-7.3 +- [ ] Ensure round-trip: `decode(encode(x)) === x` + +### 13.2 Decoder Checklist + +✅ **A conforming decoder MUST:** + +- [ ] Accept UTF-8 (LF or CRLF) +- [ ] Decode `T` → true, `F` → false, `null` → null +- [ ] Parse decimal and exponent numbers +- [ ] Parse table headers: `key:@(N):columns` +- [ ] Preserve array order +- [ ] Preserve key order +- [ ] Enforce row count (strict mode) +- [ ] Enforce field count (strict mode) + +### 13.3 Strict Mode + +**Enabled by default** in reference implementation. + +Enforces: +- Table row count = declared `(N)` +- Each row field count = column count +- No malformed headers +- No unterminated strings + +--- + +## 14. Strict Mode Errors + +### Error Codes + +| Code | Description | +|------|-------------| +| E001 | Row count mismatch | +| E002 | Field count mismatch | +| E301 | Document size exceeds 100MB | +| E302 | Line length exceeds 1MB | +| E303 | Array length exceeds 1M items | +| E304 | Object key count exceeds 100K | + +--- + +## 15. Security Considerations + +### 15.1 Resource Limits + +Implementations SHOULD limit: +- Document size: 100 MB +- Line length: 1 MB +- Nesting depth: 100 levels +- Array length: 1,000,000 +- Object keys: 100,000 + +Prevents denial-of-service attacks. + +--- + +## 18. Media Type & File Extension + +### 18.1 File Extension + +**Extension:** `.zonf` + +### 18.2 Media Type + +**Media type:** `text/zon` + +**Charset:** UTF-8 (always) + +--- + +## Appendices + +### Appendix A: Examples + +**A.1 Simple Object** +```zon +active:T +age:30 +name:Alice +``` + +**A.2 Table** +```zon +users:@(2):active,id,name +T,1,Alice +F,2,Bob +``` + +**A.3 Mixed** +```zon +tags:"[api,auth]" +version:1.0 +users:@(1):id,name +1,Alice +``` + +### Appendix B: Test Suite + +**Coverage:** +- ✅ 94/94 unit tests +- ✅ All roundtrip tests pass +- ✅ 100% data integrity + +### Appendix C: License + +MIT License + +Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +--- + +**End of Specification** diff --git a/benchmarks/benchmark_output.md b/benchmarks/benchmark_output.md new file mode 100644 index 0000000..fe9c904 --- /dev/null +++ b/benchmarks/benchmark_output.md @@ -0,0 +1,214 @@ +╔════════════════════════════════════════════════════════════════════════════╗ +║ ZON vs TOON vs CSV vs JSON BENCHMARK ║ +║ Token Efficiency Comparison ║ +║ Using GPT-5 o200k_base,Claude 3.5 (Anthropic), ║ +║ Llama 3 (Meta) tokenizer ║ +╚════════════════════════════════════════════════════════════════════════════╝ + +════════════════════════════════════════════════════════════════════════════════ +📊 Unified Dataset + Combined dataset with tabular, nested, and time-series data +──────────────────────────────────────────────────────────────────────────────── +📦 BYTE SIZES: + ZON: 1,399 bytes + TOON: 1,665 bytes + CSV: 1,384 bytes + YAML: 2,033 bytes + XML: 3,235 bytes + JSON (formatted): 2,842 bytes + JSON (compact): 1,854 bytes + +🔹 Tokenizer: GPT-4o (o200k) + ZON █████████░░░░░░░░░░░ 513 tokens 👑 + ├─ vs JSON formatted: -45.4% + ├─ vs JSON compact: -12.9% + ├─ vs TOON: -16.4% + ├─ vs CSV: -3.9% + ├─ vs YAML: -29.5% + └─ vs XML: -53.1% + + TOON ███████████░░░░░░░░░ 614 tokens + vs ZON: +19.7% + + CSV ██████████░░░░░░░░░░ 534 tokens + vs ZON: +4.1% + + YAML █████████████░░░░░░░ 728 tokens + vs ZON: +41.9% + + XML ████████████████████ 1,093 tokens + vs ZON: +113.1% + + JSON (cmp) ███████████░░░░░░░░░ 589 tokens + + +🔹 Tokenizer: Claude 3.5 (Anthropic) + ZON ██████████░░░░░░░░░░ 548 tokens + ├─ vs JSON formatted: -40.0% + ├─ vs JSON compact: -8.1% + ├─ vs TOON: -3.9% + ├─ vs CSV: +0.7% + ├─ vs YAML: -14.5% + └─ vs XML: -50.4% + + TOON ██████████░░░░░░░░░░ 570 tokens + vs ZON: +4.0% + + CSV ██████████░░░░░░░░░░ 544 tokens 👑 + vs ZON: -0.7% + + YAML ████████████░░░░░░░░ 641 tokens + vs ZON: +17.0% + + XML ████████████████████ 1,104 tokens + vs ZON: +101.5% + + JSON (cmp) ███████████░░░░░░░░░ 596 tokens + + +🔹 Tokenizer: Llama 3 (Meta) + ZON ██████████░░░░░░░░░░ 696 tokens 👑 + ├─ vs JSON formatted: -43.1% + ├─ vs JSON compact: -8.4% + ├─ vs TOON: -11.2% + ├─ vs CSV: -4.4% + ├─ vs YAML: -22.1% + └─ vs XML: -50.0% + + TOON ███████████░░░░░░░░░ 784 tokens + vs ZON: +12.6% + + CSV ██████████░░░░░░░░░░ 728 tokens + vs ZON: +4.6% + + YAML █████████████░░░░░░░ 894 tokens + vs ZON: +28.4% + + XML ████████████████████ 1,392 tokens + vs ZON: +100.0% + + JSON (cmp) ███████████░░░░░░░░░ 760 tokens + + +════════════════════════════════════════════════════════════════════════════════ +📊 Large Complex Nested Dataset + Deeply nested, non-uniform structure with mixed types +──────────────────────────────────────────────────────────────────────────────── +📦 BYTE SIZES: + ZON: 335,611 bytes + TOON: 607,194 bytes + CSV: 369,682 bytes + YAML: 607,189 bytes + XML: 1,016,540 bytes + JSON (formatted): 834,132 bytes + JSON (compact): 551,854 bytes + +🔹 Tokenizer: GPT-4o (o200k) + ZON █████████░░░░░░░░░░░ 143,661 tokens 👑 + ├─ vs JSON formatted: -49.5% + ├─ vs JSON compact: -23.8% + ├─ vs TOON: -36.1% + ├─ vs CSV: -12.9% + ├─ vs YAML: -36.1% + └─ vs XML: -57.1% + + TOON █████████████░░░░░░░ 224,940 tokens + vs ZON: +56.6% + + CSV ██████████░░░░░░░░░░ 164,919 tokens + vs ZON: +14.8% + + YAML █████████████░░░░░░░ 224,938 tokens + vs ZON: +56.6% + + XML ████████████████████ 335,239 tokens + vs ZON: +133.4% + + JSON (cmp) ███████████░░░░░░░░░ 188,604 tokens + + +🔹 Tokenizer: Claude 3.5 (Anthropic) + ZON █████████░░░░░░░░░░░ 145,652 tokens 👑 + ├─ vs JSON formatted: -46.8% + ├─ vs JSON compact: -21.3% + ├─ vs TOON: -26.0% + ├─ vs CSV: -9.9% + ├─ vs YAML: -26.0% + └─ vs XML: -55.5% + + TOON ████████████░░░░░░░░ 196,893 tokens + vs ZON: +35.2% + + CSV ██████████░░░░░░░░░░ 161,701 tokens + vs ZON: +11.0% + + YAML ████████████░░░░░░░░ 196,892 tokens + vs ZON: +35.2% + + XML ████████████████████ 327,274 tokens + vs ZON: +124.7% + + JSON (cmp) ███████████░░░░░░░░░ 185,136 tokens + + +🔹 Tokenizer: Llama 3 (Meta) + ZON ██████████░░░░░░░░░░ 230,838 tokens 👑 + ├─ vs JSON formatted: -43.0% + ├─ vs JSON compact: -16.5% + ├─ vs TOON: -26.7% + ├─ vs CSV: -9.2% + ├─ vs YAML: -26.7% + └─ vs XML: -51.9% + + TOON █████████████░░░░░░░ 314,824 tokens + vs ZON: +36.4% + + CSV ███████████░░░░░░░░░ 254,181 tokens + vs ZON: +10.1% + + YAML █████████████░░░░░░░ 314,820 tokens + vs ZON: +36.4% + + XML ████████████████████ 480,125 tokens + vs ZON: +108.0% + + JSON (cmp) ████████████░░░░░░░░ 276,405 tokens + + +════════════════════════════════════════════════════════════════════════════════ +📈 OVERALL SUMMARY +════════════════════════════════════════════════════════════════════════════════ + +🔹 GPT-4o (o200k) Summary: + ZON Wins: 2/2 datasets + Total Tokens: + ZON: █████████████░░░░░░░░░░░░░░░░░ 144,174 tokens + vs JSON (cmp): -23.8% + vs TOON: -36.1% + vs CSV: -12.9% + vs YAML: -36.1% + vs XML: -57.1% + +🔹 Claude 3.5 (Anthropic) Summary: + ZON Wins: 1/2 datasets + Total Tokens: + ZON: █████████████░░░░░░░░░░░░░░░░░ 146,200 tokens + vs JSON (cmp): -21.3% + vs TOON: -26.0% + vs CSV: -9.9% + vs YAML: -26.0% + vs XML: -55.5% + +🔹 Llama 3 (Meta) Summary: + ZON Wins: 2/2 datasets + Total Tokens: + ZON: ██████████████░░░░░░░░░░░░░░░░ 231,534 tokens + vs JSON (cmp): -16.5% + vs TOON: -26.6% + vs CSV: -9.2% + vs YAML: -26.7% + vs XML: -51.9% + +════════════════════════════════════════════════════════════════════════════════ +✨ Benchmark complete! +════════════════════════════════════════════════════════════════════════════════ diff --git a/canonical_numbers_test.go b/canonical_numbers_test.go new file mode 100644 index 0000000..3c337e9 --- /dev/null +++ b/canonical_numbers_test.go @@ -0,0 +1,275 @@ +// Package zon_test provides tests for canonical number formatting. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon_test + +import ( + "math" + "strings" + "testing" + + zon "github.com/ZON-Format/zon-go" +) + +// TestIntegerWithoutDecimal tests that integers are encoded without decimal point. +func TestIntegerWithoutDecimal(t *testing.T) { + data := map[string]any{"value": float64(42)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "42") { + t.Errorf("Expected 42 in output, got: %s", encoded) + } + if strings.Contains(encoded, "42.0") { + t.Errorf("Should not contain 42.0, got: %s", encoded) + } +} + +// TestZero tests zero handling. +func TestZero(t *testing.T) { + data := map[string]any{"value": float64(0)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "value:0") { + t.Errorf("Expected value:0, got: %s", encoded) + } +} + +// TestNegativeIntegers tests negative integer handling. +func TestNegativeIntegers(t *testing.T) { + data := map[string]any{"value": float64(-123)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "-123") { + t.Errorf("Expected -123, got: %s", encoded) + } +} + +// TestFloatsWithoutTrailingZeros tests floats without trailing zeros. +func TestFloatsWithoutTrailingZeros(t *testing.T) { + data := map[string]any{"value": 3.14} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "3.14") { + t.Errorf("Expected 3.14, got: %s", encoded) + } + if strings.Contains(encoded, "3.140000") { + t.Errorf("Should not have trailing zeros, got: %s", encoded) + } +} + +// TestVerySmallDecimals tests very small decimal handling. +func TestVerySmallDecimals(t *testing.T) { + data := map[string]any{"value": 0.001} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "0.001") { + t.Errorf("Expected 0.001, got: %s", encoded) + } + if strings.Contains(encoded, "1e-3") { + t.Errorf("Should not use scientific notation, got: %s", encoded) + } +} + +// TestNoScientificNotationForLargeNumbers tests no scientific notation for large numbers. +func TestNoScientificNotationForLargeNumbers(t *testing.T) { + data := map[string]any{"value": float64(1000000)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "1000000") { + t.Errorf("Expected 1000000, got: %s", encoded) + } + if strings.Contains(encoded, "1e6") || strings.Contains(encoded, "1e+6") { + t.Errorf("Should not use scientific notation, got: %s", encoded) + } +} + +// TestManyDecimalPlaces tests numbers with many decimal places. +func TestManyDecimalPlaces(t *testing.T) { + data := map[string]any{"value": 3.141592653589793} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // Should preserve precision + if !strings.Contains(encoded, "3.14159265358979") { + t.Errorf("Expected precision preserved, got: %s", encoded) + } + // Should not contain scientific notation + if strings.Contains(encoded, "e+") || strings.Contains(encoded, "e-") || strings.Contains(encoded, "E") { + t.Errorf("Should not use scientific notation, got: %s", encoded) + } +} + +// TestNaNAsNull tests NaN encoded as null. +func TestNaNAsNull(t *testing.T) { + data := map[string]any{"value": math.NaN()} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "value:null") { + t.Errorf("Expected value:null, got: %s", encoded) + } +} + +// TestInfinityAsNull tests Infinity encoded as null. +func TestInfinityAsNull(t *testing.T) { + data := map[string]any{"value": math.Inf(1)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "value:null") { + t.Errorf("Expected value:null, got: %s", encoded) + } +} + +// TestNegativeInfinityAsNull tests -Infinity encoded as null. +func TestNegativeInfinityAsNull(t *testing.T) { + data := map[string]any{"value": math.Inf(-1)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "value:null") { + t.Errorf("Expected value:null, got: %s", encoded) + } +} + +// TestIntegerRoundTrip tests integer values through round-trip. +func TestIntegerRoundTrip(t *testing.T) { + data := map[string]any{"value": float64(42)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + decodedMap := decoded.(map[string]any) + if decodedMap["value"] != float64(42) { + t.Errorf("Expected 42, got %v", decodedMap["value"]) + } + // Check it's an integer + val := decodedMap["value"].(float64) + if val != float64(int(val)) { + t.Errorf("Expected integer value") + } +} + +// TestFloatRoundTrip tests float values through round-trip. +func TestFloatRoundTrip(t *testing.T) { + data := map[string]any{"value": 3.14} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + decodedMap := decoded.(map[string]any) + val := decodedMap["value"].(float64) + if math.Abs(val-3.14) > 0.0000001 { + t.Errorf("Expected 3.14, got %v", val) + } +} + +// TestLargeNumberRoundTrip tests large numbers through round-trip. +func TestLargeNumberRoundTrip(t *testing.T) { + data := map[string]any{"value": float64(1000000)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + decodedMap := decoded.(map[string]any) + if decodedMap["value"] != float64(1000000) { + t.Errorf("Expected 1000000, got %v", decodedMap["value"]) + } +} + +// TestVerySmallNumberRoundTrip tests very small numbers through round-trip. +func TestVerySmallNumberRoundTrip(t *testing.T) { + data := map[string]any{"value": 0.000001} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + decodedMap := decoded.(map[string]any) + val := decodedMap["value"].(float64) + if math.Abs(val-0.000001) > 0.0000000001 { + t.Errorf("Expected 0.000001, got %v", val) + } +} + +// TestArrayOfNumbers tests canonical number formatting in arrays. +func TestArrayOfNumbers(t *testing.T) { + data := map[string]any{ + "values": []any{ + map[string]any{"num": float64(1000000)}, + map[string]any{"num": 0.001}, + map[string]any{"num": float64(42)}, + map[string]any{"num": 3.14}, + }, + } + + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // Should not contain scientific notation + if strings.Contains(encoded, "e+") || strings.Contains(encoded, "e-") || strings.Contains(encoded, "E") { + t.Errorf("Should not use scientific notation, got: %s", encoded) + } + + // Should contain actual values + if !strings.Contains(encoded, "1000000") { + t.Errorf("Expected 1000000, got: %s", encoded) + } + if !strings.Contains(encoded, "0.001") { + t.Errorf("Expected 0.001, got: %s", encoded) + } + if !strings.Contains(encoded, "42") { + t.Errorf("Expected 42, got: %s", encoded) + } + if !strings.Contains(encoded, "3.14") { + t.Errorf("Expected 3.14, got: %s", encoded) + } +} diff --git a/cmd/zon/main.go b/cmd/zon/main.go new file mode 100644 index 0000000..824dae4 --- /dev/null +++ b/cmd/zon/main.go @@ -0,0 +1,90 @@ +// ZON CLI - Command line tool for encoding and decoding ZON format +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +// +// Usage: +// +// zon encode # Encode JSON to ZON format +// zon decode # Decode ZON to JSON format +// +// Example: +// +// zon encode data.json > data.zonf +// zon decode data.zonf > output.json +package main + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + + zon "github.com/ZON-Format/zon-go" +) + +func main() { + if len(os.Args) < 3 { + printUsage() + os.Exit(1) + } + + command := os.Args[1] + inputFile := os.Args[2] + + // Resolve absolute path + absPath, err := filepath.Abs(inputFile) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + // Read file content + content, err := os.ReadFile(absPath) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + switch command { + case "encode": + var data any + if err := json.Unmarshal(content, &data); err != nil { + fmt.Fprintf(os.Stderr, "Error parsing JSON: %v\n", err) + os.Exit(1) + } + + encoded, err := zon.Encode(data) + if err != nil { + fmt.Fprintf(os.Stderr, "Error encoding: %v\n", err) + os.Exit(1) + } + + fmt.Println(encoded) + + case "decode": + decoded, err := zon.Decode(string(content)) + if err != nil { + fmt.Fprintf(os.Stderr, "Error decoding: %v\n", err) + os.Exit(1) + } + + output, err := json.MarshalIndent(decoded, "", " ") + if err != nil { + fmt.Fprintf(os.Stderr, "Error formatting JSON: %v\n", err) + os.Exit(1) + } + + fmt.Println(string(output)) + + default: + fmt.Fprintf(os.Stderr, "Unknown command: %s\n", command) + printUsage() + os.Exit(1) + } +} + +func printUsage() { + fmt.Fprintln(os.Stderr, "Usage: zon ") + fmt.Fprintln(os.Stderr, "Example: zon encode data.json > data.zonf") +} diff --git a/codec_test.go b/codec_test.go new file mode 100644 index 0000000..3b883d9 --- /dev/null +++ b/codec_test.go @@ -0,0 +1,648 @@ +// Package zon_test provides tests for ZON encoding and decoding. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon_test + +import ( + "reflect" + "strings" + "testing" + + zon "github.com/ZON-Format/zon-go" +) + +// TestRoundTripEmptyObject tests empty object round-trip. +func TestRoundTripEmptyObject(t *testing.T) { + data := map[string]any{} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + if !reflect.DeepEqual(decoded, data) { + t.Errorf("Expected %v, got %v", data, decoded) + } +} + +// TestRoundTripSimpleMetadata tests simple metadata round-trip. +func TestRoundTripSimpleMetadata(t *testing.T) { + data := map[string]any{ + "name": "Alice", + "age": float64(30), + "active": true, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["name"] != data["name"] { + t.Errorf("name: expected %v, got %v", data["name"], decodedMap["name"]) + } + if decodedMap["age"] != data["age"] { + t.Errorf("age: expected %v, got %v", data["age"], decodedMap["age"]) + } + if decodedMap["active"] != data["active"] { + t.Errorf("active: expected %v, got %v", data["active"], decodedMap["active"]) + } +} + +// TestRoundTripNestedObject tests nested object round-trip. +func TestRoundTripNestedObject(t *testing.T) { + data := map[string]any{ + "user": map[string]any{ + "name": "Bob", + "profile": map[string]any{ + "age": float64(25), + "city": "NYC", + }, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + user := decodedMap["user"].(map[string]any) + profile := user["profile"].(map[string]any) + if user["name"] != "Bob" { + t.Errorf("user.name: expected Bob, got %v", user["name"]) + } + if profile["age"] != float64(25) { + t.Errorf("user.profile.age: expected 25, got %v", profile["age"]) + } + if profile["city"] != "NYC" { + t.Errorf("user.profile.city: expected NYC, got %v", profile["city"]) + } +} + +// TestRoundTripArrayOfObjects tests array of objects (table) round-trip. +func TestRoundTripArrayOfObjects(t *testing.T) { + data := []any{ + map[string]any{"id": float64(1), "name": "Alice", "score": float64(95)}, + map[string]any{"id": float64(2), "name": "Bob", "score": float64(87)}, + map[string]any{"id": float64(3), "name": "Charlie", "score": float64(92)}, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedArr := decoded.([]any) + if len(decodedArr) != 3 { + t.Errorf("Expected 3 items, got %d", len(decodedArr)) + } + first := decodedArr[0].(map[string]any) + if first["id"] != float64(1) { + t.Errorf("Expected id 1, got %v", first["id"]) + } + if first["name"] != "Alice" { + t.Errorf("Expected name Alice, got %v", first["name"]) + } +} + +// TestRoundTripMixedMetadataAndTable tests mixed metadata and table round-trip. +func TestRoundTripMixedMetadataAndTable(t *testing.T) { + data := map[string]any{ + "title": "Sales Report", + "year": float64(2024), + "records": []any{ + map[string]any{"month": "Jan", "sales": float64(1000)}, + map[string]any{"month": "Feb", "sales": float64(1200)}, + map[string]any{"month": "Mar", "sales": float64(1100)}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["title"] != "Sales Report" { + t.Errorf("title: expected Sales Report, got %v", decodedMap["title"]) + } + if decodedMap["year"] != float64(2024) { + t.Errorf("year: expected 2024, got %v", decodedMap["year"]) + } + records := decodedMap["records"].([]any) + if len(records) != 3 { + t.Errorf("Expected 3 records, got %d", len(records)) + } +} + +// TestRoundTripBooleanValues tests boolean values round-trip. +func TestRoundTripBooleanValues(t *testing.T) { + data := map[string]any{ + "success": true, + "error": false, + "items": []any{ + map[string]any{"id": float64(1), "active": true}, + map[string]any{"id": float64(2), "active": false}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["success"] != true { + t.Errorf("success: expected true, got %v", decodedMap["success"]) + } + if decodedMap["error"] != false { + t.Errorf("error: expected false, got %v", decodedMap["error"]) + } +} + +// TestRoundTripNullValues tests null values round-trip. +func TestRoundTripNullValues(t *testing.T) { + data := map[string]any{ + "name": "Test", + "value": nil, + "items": []any{ + map[string]any{"id": float64(1), "data": nil}, + map[string]any{"id": float64(2), "data": "value"}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["name"] != "Test" { + t.Errorf("name: expected Test, got %v", decodedMap["name"]) + } + if decodedMap["value"] != nil { + t.Errorf("value: expected nil, got %v", decodedMap["value"]) + } +} + +// TestRoundTripNumbers tests numbers (integers and floats) round-trip. +func TestRoundTripNumbers(t *testing.T) { + data := map[string]any{ + "integer": float64(42), + "float": 3.14, + "negative": float64(-10), + "negativeFloat": -2.5, + "items": []any{ + map[string]any{"id": float64(1), "value": float64(100)}, + map[string]any{"id": float64(2), "value": 200.5}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["integer"] != float64(42) { + t.Errorf("integer: expected 42, got %v", decodedMap["integer"]) + } + if decodedMap["float"] != 3.14 { + t.Errorf("float: expected 3.14, got %v", decodedMap["float"]) + } +} + +// TestRoundTripStringsWithSpecialChars tests strings with special characters. +func TestRoundTripStringsWithSpecialChars(t *testing.T) { + data := map[string]any{ + "plain": "hello", + "withComma": "hello, world", + "withQuotes": `say "hello"`, + "withNewline": "line1\nline2", + "items": []any{ + map[string]any{"id": float64(1), "text": "normal"}, + map[string]any{"id": float64(2), "text": "with, comma"}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["plain"] != "hello" { + t.Errorf("plain: expected hello, got %v", decodedMap["plain"]) + } + if decodedMap["withComma"] != "hello, world" { + t.Errorf("withComma: expected 'hello, world', got %v", decodedMap["withComma"]) + } + if decodedMap["withNewline"] != "line1\nline2" { + t.Errorf("withNewline: expected 'line1\\nline2', got %v", decodedMap["withNewline"]) + } +} + +// TestRoundTripEmptyArrays tests empty arrays. +func TestRoundTripEmptyArrays(t *testing.T) { + data := map[string]any{ + "empty": []any{}, + "nested": map[string]any{ + "also_empty": []any{}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + emptyArr := decodedMap["empty"].([]any) + if len(emptyArr) != 0 { + t.Errorf("empty: expected 0 items, got %d", len(emptyArr)) + } +} + +// TestRoundTripNestedArrays tests nested arrays in metadata. +func TestRoundTripNestedArrays(t *testing.T) { + data := map[string]any{ + "tags": []any{"javascript", "typescript", "node"}, + "matrix": []any{ + []any{float64(1), float64(2)}, + []any{float64(3), float64(4)}, + }, + "items": []any{ + map[string]any{"id": float64(1), "values": []any{float64(10), float64(20)}}, + map[string]any{"id": float64(2), "values": []any{float64(30), float64(40)}}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + tags := decodedMap["tags"].([]any) + if len(tags) != 3 { + t.Errorf("tags: expected 3, got %d", len(tags)) + } +} + +// TestHikesExample tests the full hikes example from README. +func TestHikesExample(t *testing.T) { + data := map[string]any{ + "context": map[string]any{ + "task": "Our favorite hikes together", + "location": "Boulder", + "season": "spring_2025", + }, + "friends": []any{"ana", "luis", "sam"}, + "hikes": []any{ + map[string]any{ + "id": float64(1), + "name": "Blue Lake Trail", + "distanceKm": 7.5, + "elevationGain": float64(320), + "companion": "ana", + "wasSunny": true, + }, + map[string]any{ + "id": float64(2), + "name": "Ridge Overlook", + "distanceKm": 9.2, + "elevationGain": float64(540), + "companion": "luis", + "wasSunny": false, + }, + map[string]any{ + "id": float64(3), + "name": "Wildflower Loop", + "distanceKm": 5.1, + "elevationGain": float64(180), + "companion": "sam", + "wasSunny": true, + }, + }, + } + + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + decodedMap := decoded.(map[string]any) + context := decodedMap["context"].(map[string]any) + if context["task"] != "Our favorite hikes together" { + t.Errorf("context.task mismatch") + } + + hikes := decodedMap["hikes"].([]any) + if len(hikes) != 3 { + t.Errorf("Expected 3 hikes, got %d", len(hikes)) + } + + // Verify encoded format structure + if !strings.Contains(encoded, "context.task:") { + t.Errorf("Expected context.task: in encoded output") + } + if !strings.Contains(encoded, "hikes:@(3):") { + t.Errorf("Expected hikes:@(3): in encoded output") + } +} + +// TestStringThatLooksLikeNumber tests string that looks like a number. +func TestStringThatLooksLikeNumber(t *testing.T) { + data := map[string]any{ + "stringNumber": "123", + "actualNumber": float64(123), + "items": []any{ + map[string]any{"id": float64(1), "code": "001"}, + map[string]any{"id": float64(2), "code": "002"}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if _, ok := decodedMap["stringNumber"].(string); !ok { + t.Errorf("stringNumber should be string, got %T", decodedMap["stringNumber"]) + } + if _, ok := decodedMap["actualNumber"].(float64); !ok { + t.Errorf("actualNumber should be float64, got %T", decodedMap["actualNumber"]) + } +} + +// TestStringThatLooksLikeBoolean tests string that looks like boolean. +func TestStringThatLooksLikeBoolean(t *testing.T) { + data := map[string]any{ + "stringTrue": "true", + "actualTrue": true, + "stringFalse": "false", + "actualFalse": false, + "items": []any{ + map[string]any{"id": float64(1), "status": "T"}, + map[string]any{"id": float64(2), "status": true}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if _, ok := decodedMap["stringTrue"].(string); !ok { + t.Errorf("stringTrue should be string, got %T", decodedMap["stringTrue"]) + } + if _, ok := decodedMap["actualTrue"].(bool); !ok { + t.Errorf("actualTrue should be bool, got %T", decodedMap["actualTrue"]) + } +} + +// TestEmptyStrings tests empty strings. +func TestEmptyStrings(t *testing.T) { + data := map[string]any{ + "empty": "", + "items": []any{ + map[string]any{"id": float64(1), "name": ""}, + map[string]any{"id": float64(2), "name": "value"}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["empty"] != "" { + t.Errorf("empty: expected empty string, got %v", decodedMap["empty"]) + } +} + +// TestWhitespacePreservation tests whitespace preservation. +func TestWhitespacePreservation(t *testing.T) { + data := map[string]any{ + "leading": " space", + "trailing": "space ", + "both": " both ", + "items": []any{ + map[string]any{"id": float64(1), "text": " padded "}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["leading"] != " space" { + t.Errorf("leading: expected ' space', got '%v'", decodedMap["leading"]) + } + if decodedMap["trailing"] != "space " { + t.Errorf("trailing: expected 'space ', got '%v'", decodedMap["trailing"]) + } +} + +// TestVeryLongStrings tests very long strings. +func TestVeryLongStrings(t *testing.T) { + longString := strings.Repeat("a", 1000) + data := map[string]any{ + "long": longString, + "items": []any{ + map[string]any{"id": float64(1), "text": longString}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["long"] != longString { + t.Errorf("long: expected %d chars, got %d", len(longString), len(decodedMap["long"].(string))) + } +} + +// TestLargeArrays tests large arrays. +func TestLargeArrays(t *testing.T) { + items := make([]any, 100) + for i := 0; i < 100; i++ { + items[i] = map[string]any{ + "id": float64(i + 1), + "name": "Item " + strings.Repeat("x", i%10), + "value": float64(i * 10), + } + } + data := map[string]any{"items": items} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + decodedItems := decodedMap["items"].([]any) + if len(decodedItems) != 100 { + t.Errorf("Expected 100 items, got %d", len(decodedItems)) + } +} + +// TestArrayOfPrimitives tests array of primitives. +func TestArrayOfPrimitives(t *testing.T) { + data := []any{"apple", "banana", "cherry"} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedArr := decoded.([]any) + if len(decodedArr) != 3 { + t.Errorf("Expected 3 items, got %d", len(decodedArr)) + } + // Should be encoded as JSON array + if !strings.HasPrefix(encoded, "[") { + t.Errorf("Expected encoded to start with [, got %s", encoded[:10]) + } +} + +// TestDeeplyNestedObjects tests deeply nested objects. +func TestDeeplyNestedObjects(t *testing.T) { + data := map[string]any{ + "level1": map[string]any{ + "level2": map[string]any{ + "level3": map[string]any{ + "level4": map[string]any{ + "value": "deep", + }, + }, + }, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + l1 := decodedMap["level1"].(map[string]any) + l2 := l1["level2"].(map[string]any) + l3 := l2["level3"].(map[string]any) + l4 := l3["level4"].(map[string]any) + if l4["value"] != "deep" { + t.Errorf("Expected deep, got %v", l4["value"]) + } +} + +// TestIntegerVsFloatDistinction tests integer vs float distinction. +func TestIntegerVsFloatDistinction(t *testing.T) { + data := map[string]any{ + "integer": float64(42), + "float": 42.0, + "explicitFloat": 3.14, + "items": []any{ + map[string]any{"id": float64(1), "intVal": float64(100), "floatVal": 100.5}, + map[string]any{"id": float64(2), "intVal": float64(200), "floatVal": 200.0}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedMap := decoded.(map[string]any) + if decodedMap["integer"] != float64(42) { + t.Errorf("integer: expected 42, got %v", decodedMap["integer"]) + } + if decodedMap["float"] != float64(42) { + t.Errorf("float: expected 42.0, got %v", decodedMap["float"]) + } + if decodedMap["explicitFloat"] != 3.14 { + t.Errorf("explicitFloat: expected 3.14, got %v", decodedMap["explicitFloat"]) + } +} + +// TestBooleanShorthandTF tests boolean shorthand T/F encoding. +func TestBooleanShorthandTF(t *testing.T) { + data := []any{ + map[string]any{"id": float64(1), "flag": true}, + map[string]any{"id": float64(2), "flag": false}, + map[string]any{"id": float64(3), "flag": true}, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // Check that booleans are encoded as T/F + if !strings.Contains(encoded, "T") || !strings.Contains(encoded, "F") { + t.Errorf("Expected T and F in encoded output, got: %s", encoded) + } + + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + decodedArr := decoded.([]any) + first := decodedArr[0].(map[string]any) + if first["flag"] != true { + t.Errorf("Expected true, got %v", first["flag"]) + } +} diff --git a/conformance_decoder_test.go b/conformance_decoder_test.go new file mode 100644 index 0000000..ccf51c0 --- /dev/null +++ b/conformance_decoder_test.go @@ -0,0 +1,281 @@ +// Package zon_test provides conformance tests based on FORMAL_SPEC.md §11.2 Decoder Checklist. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon_test + +import ( + "strings" + "testing" + + zon "github.com/ZON-Format/zon-go" +) + +// TestAcceptUTF8WithLFOrCRLF tests that decoder accepts UTF-8 with LF or CRLF. +func TestAcceptUTF8WithLFOrCRLF(t *testing.T) { + zonLF := "key:value\nkey2:value2" + zonCRLF := "key:value\r\nkey2:value2" + + _, err := zon.Decode(zonLF) + if err != nil { + t.Errorf("Should accept LF: %v", err) + } + + _, err = zon.Decode(zonCRLF) + if err != nil { + t.Errorf("Should accept CRLF: %v", err) + } +} + +// TestDecodeBooleanAndNull tests T → true, F → false, null → null. +func TestDecodeBooleanAndNull(t *testing.T) { + zonData := "active:T\narchived:F\nvalue:null" + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + if resultMap["active"] != true { + t.Errorf("active: expected true, got %v", resultMap["active"]) + } + if resultMap["archived"] != false { + t.Errorf("archived: expected false, got %v", resultMap["archived"]) + } + if resultMap["value"] != nil { + t.Errorf("value: expected nil, got %v", resultMap["value"]) + } +} + +// TestParseDecimalAndExponentNumbers tests parsing decimal and exponent numbers. +func TestParseDecimalAndExponentNumbers(t *testing.T) { + zonData := "int:42\nfloat:3.14\nbig:1000000" + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + if resultMap["int"] != float64(42) { + t.Errorf("int: expected 42, got %v", resultMap["int"]) + } + if resultMap["float"] != 3.14 { + t.Errorf("float: expected 3.14, got %v", resultMap["float"]) + } + if resultMap["big"] != float64(1000000) { + t.Errorf("big: expected 1000000, got %v", resultMap["big"]) + } +} + +// TestLeadingZeroNumbersAsStrings tests that leading-zero numbers are treated as strings. +func TestLeadingZeroNumbersAsStrings(t *testing.T) { + zonData := `code:"007"` + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + if resultMap["code"] != "007" { + t.Errorf("code: expected '007', got %v", resultMap["code"]) + } + if _, ok := resultMap["code"].(string); !ok { + t.Errorf("code should be string, got %T", resultMap["code"]) + } +} + +// TestUnescapeQuotedStrings tests unescaping quoted strings. +func TestUnescapeQuotedStrings(t *testing.T) { + zonData := `text:"he said \"hello\""` + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + if resultMap["text"] != `he said "hello"` { + t.Errorf("text: expected 'he said \"hello\"', got %v", resultMap["text"]) + } +} + +// TestParseTableRowsIntoArrayOfObjects tests parsing table rows into array of objects. +func TestParseTableRowsIntoArrayOfObjects(t *testing.T) { + zonData := `users:@(2):id,name +1,Alice +2,Bob` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + if len(users) != 2 { + t.Errorf("Expected 2 users, got %d", len(users)) + } + + user1 := users[0].(map[string]any) + if user1["id"] != float64(1) { + t.Errorf("user1.id: expected 1, got %v", user1["id"]) + } + if user1["name"] != "Alice" { + t.Errorf("user1.name: expected Alice, got %v", user1["name"]) + } + + user2 := users[1].(map[string]any) + if user2["id"] != float64(2) { + t.Errorf("user2.id: expected 2, got %v", user2["id"]) + } + if user2["name"] != "Bob" { + t.Errorf("user2.name: expected Bob, got %v", user2["name"]) + } +} + +// TestPreserveKeyOrder tests that key order is preserved from document. +func TestPreserveKeyOrder(t *testing.T) { + zonData := "z:1\na:2\nm:3" + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + // Go maps don't preserve order, but we should have all keys + if resultMap["z"] != float64(1) { + t.Errorf("z: expected 1, got %v", resultMap["z"]) + } + if resultMap["a"] != float64(2) { + t.Errorf("a: expected 2, got %v", resultMap["a"]) + } + if resultMap["m"] != float64(3) { + t.Errorf("m: expected 3, got %v", resultMap["m"]) + } +} + +// TestRejectPrototypePollution tests rejection of prototype pollution attempts. +func TestRejectPrototypePollution(t *testing.T) { + malicious := `users:@(1):id,__proto__.polluted +1,true` + + _, err := zon.Decode(malicious) + // The decoding should succeed but not pollute the prototype + if err != nil { + // If it errors, that's also acceptable + return + } + + // Verify that Object prototype is not polluted + testObj := make(map[string]any) + if _, ok := testObj["polluted"]; ok { + t.Error("Prototype pollution detected") + } +} + +// TestThrowOnDeepNesting tests that decoder throws on nesting depth > 100. +func TestThrowOnDeepNesting(t *testing.T) { + deepNested := strings.Repeat("[", 150) + strings.Repeat("]", 150) + + _, err := zon.Decode(deepNested) + if err == nil { + t.Error("Expected error for deep nesting") + } + if !strings.Contains(err.Error(), "Maximum nesting depth exceeded") { + t.Errorf("Expected 'Maximum nesting depth exceeded' error, got: %v", err) + } +} + +// TestThrowOnLineLengthExceeds1MB tests that decoder throws on line length > 1MB. +func TestThrowOnLineLengthExceeds1MB(t *testing.T) { + longLine := "key:" + strings.Repeat("x", 1024*1024+1) + + _, err := zon.Decode(longLine) + if err == nil { + t.Error("Expected error for long line") + } + if !strings.Contains(err.Error(), "E302") { + t.Errorf("Expected E302 error, got: %v", err) + } +} + +// TestCaseInsensitiveNullBooleanAliases tests case-insensitive null/boolean aliases. +func TestCaseInsensitiveNullBooleanAliases(t *testing.T) { + zonData := "a:TRUE\nb:False\nc:NONE\nd:nil" + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + if resultMap["a"] != true { + t.Errorf("a: expected true, got %v", resultMap["a"]) + } + if resultMap["b"] != false { + t.Errorf("b: expected false, got %v", resultMap["b"]) + } + if resultMap["c"] != nil { + t.Errorf("c: expected nil, got %v", resultMap["c"]) + } + if resultMap["d"] != nil { + t.Errorf("d: expected nil, got %v", resultMap["d"]) + } +} + +// TestReconstructNestedObjectsFromDottedKeys tests reconstruction of nested objects. +func TestReconstructNestedObjectsFromDottedKeys(t *testing.T) { + zonData := "config.db.host:localhost\nconfig.db.port:5432" + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + config := resultMap["config"].(map[string]any) + db := config["db"].(map[string]any) + if db["host"] != "localhost" { + t.Errorf("config.db.host: expected localhost, got %v", db["host"]) + } + if db["port"] != float64(5432) { + t.Errorf("config.db.port: expected 5432, got %v", db["port"]) + } +} + +// TestUnwrapPureLists tests unwrapping pure lists (data key). +func TestUnwrapPureLists(t *testing.T) { + zonData := `data:@(2):id,name +1,Alice +2,Bob` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + // Should return array directly, not { data: [...] } + arr, ok := result.([]any) + if !ok { + t.Errorf("Expected array, got %T", result) + } + if len(arr) != 2 { + t.Errorf("Expected 2 items, got %d", len(arr)) + } +} + +// TestEmptyStringsInTableCells tests empty strings in table cells. +func TestEmptyStringsInTableCells(t *testing.T) { + zonData := `users:@(2):id,name +1,"" +2,Bob` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + user1 := users[0].(map[string]any) + if user1["name"] != "" { + t.Errorf("user1.name: expected empty string, got %v", user1["name"]) + } +} diff --git a/conformance_encoder_test.go b/conformance_encoder_test.go new file mode 100644 index 0000000..623bb4d --- /dev/null +++ b/conformance_encoder_test.go @@ -0,0 +1,242 @@ +// Package zon_test provides conformance tests based on FORMAL_SPEC.md §11.1 Encoder Checklist. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon_test + +import ( + "math" + "regexp" + "strings" + "testing" + + zon "github.com/ZON-Format/zon-go" +) + +// TestEmitUTF8WithLF tests that encoder emits UTF-8 with LF line endings. +func TestEmitUTF8WithLF(t *testing.T) { + data := map[string]any{"a": float64(1), "b": float64(2)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // Should use LF, not CRLF + if strings.Contains(encoded, "\r\n") { + t.Errorf("Should not contain CRLF, got: %s", encoded) + } +} + +// TestEncodeBoolsAsTF tests that booleans are encoded as T/F. +func TestEncodeBoolsAsTF(t *testing.T) { + data := map[string]any{"active": true, "archived": false} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "active:T") { + t.Errorf("Expected active:T, got: %s", encoded) + } + if !strings.Contains(encoded, "archived:F") { + t.Errorf("Expected archived:F, got: %s", encoded) + } + if strings.Contains(encoded, "true") || strings.Contains(encoded, "false") { + t.Errorf("Should not contain true/false, got: %s", encoded) + } +} + +// TestEncodeNullAsNull tests that null is encoded as "null". +func TestEncodeNullAsNull(t *testing.T) { + data := map[string]any{"value": nil} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "value:null") { + t.Errorf("Expected value:null, got: %s", encoded) + } +} + +// TestEmitCanonicalNumbers tests canonical number formatting. +func TestEmitCanonicalNumbers(t *testing.T) { + data := map[string]any{"int": float64(42), "float": 3.14, "big": float64(1000000)} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // No scientific notation + if !strings.Contains(encoded, "1000000") { + t.Errorf("Expected 1000000, got: %s", encoded) + } + if strings.Contains(encoded, "1e6") || strings.Contains(encoded, "1e+6") { + t.Errorf("Should not use scientific notation, got: %s", encoded) + } + + // Has decimal for floats + if !strings.Contains(encoded, "3.14") { + t.Errorf("Expected 3.14, got: %s", encoded) + } +} + +// TestNormalizeNaNInfinity tests that NaN/Infinity are normalized to null. +func TestNormalizeNaNInfinity(t *testing.T) { + data := map[string]any{ + "nan": math.NaN(), + "inf": math.Inf(1), + "negInf": math.Inf(-1), + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "nan:null") { + t.Errorf("Expected nan:null, got: %s", encoded) + } + if !strings.Contains(encoded, "inf:null") { + t.Errorf("Expected inf:null, got: %s", encoded) + } + if !strings.Contains(encoded, "negInf:null") { + t.Errorf("Expected negInf:null, got: %s", encoded) + } +} + +// TestDetectUniformArraysAsTable tests uniform arrays → table format detection. +func TestDetectUniformArraysAsTable(t *testing.T) { + data := map[string]any{ + "users": []any{ + map[string]any{"id": float64(1), "name": "Alice"}, + map[string]any{"id": float64(2), "name": "Bob"}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // Should have table marker + matched, _ := regexp.MatchString(`users:@\(\d+\)`, encoded) + if !matched { + t.Errorf("Expected table marker, got: %s", encoded) + } + if !strings.Contains(encoded, "id,name") && !strings.Contains(encoded, "name,id") { + t.Errorf("Expected column headers, got: %s", encoded) + } +} + +// TestEmitTableHeadersWithCountAndColumns tests table headers with count and columns. +func TestEmitTableHeadersWithCountAndColumns(t *testing.T) { + data := map[string]any{ + "items": []any{ + map[string]any{"x": float64(1), "y": float64(2)}, + map[string]any{"x": float64(3), "y": float64(4)}, + map[string]any{"x": float64(5), "y": float64(6)}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if !strings.Contains(encoded, "items:@(3):") { + t.Errorf("Expected items:@(3):, got: %s", encoded) + } +} + +// TestSortColumnsAlphabetically tests that columns are sorted alphabetically. +func TestSortColumnsAlphabetically(t *testing.T) { + data := map[string]any{ + "records": []any{ + map[string]any{"z": float64(1), "a": float64(2), "m": float64(3)}, + }, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // Columns should be sorted: a, m, z + matched, _ := regexp.MatchString(`records:@\(1\):a,m,z`, encoded) + if !matched { + t.Errorf("Expected sorted columns a,m,z, got: %s", encoded) + } +} + +// TestQuoteStringsWithSpecialChars tests quoting strings with special characters. +func TestQuoteStringsWithSpecialChars(t *testing.T) { + data := map[string]any{ + "comma": "a,b", + "colon": "x:y", + "quote": `say "hi"`, + } + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // Comma should be quoted + if !strings.Contains(encoded, `"a,b"`) { + t.Errorf("Expected quoted comma, got: %s", encoded) + } + + // v2.0.5: Colons are allowed unquoted + if !strings.Contains(encoded, "x:y") && !strings.Contains(encoded, "colon:x:y") { + t.Errorf("Expected colon in value, got: %s", encoded) + } +} + +// TestEscapeQuotesInStrings tests escaping quotes in strings. +func TestEscapeQuotesInStrings(t *testing.T) { + data := map[string]any{"text": `he said "hello"`} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // Uses escaped quotes in JSON-style + if !strings.Contains(encoded, `\"hello\"`) && !strings.Contains(encoded, `""hello""`) { + t.Errorf("Expected escaped quotes, got: %s", encoded) + } +} + +// TestProduceDeterministicOutput tests deterministic output. +func TestProduceDeterministicOutput(t *testing.T) { + data := map[string]any{"b": float64(2), "a": float64(1), "c": float64(3)} + + encoded1, _ := zon.Encode(data) + encoded2, _ := zon.Encode(data) + + if encoded1 != encoded2 { + t.Errorf("Output should be deterministic\n%s\n!=\n%s", encoded1, encoded2) + } +} + +// TestHandleEmptyObjects tests empty object handling. +func TestHandleEmptyObjects(t *testing.T) { + data := map[string]any{} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + // Empty object is empty string in ZON + if encoded != "" { + t.Errorf("Expected empty string for empty object, got: %s", encoded) + } +} + +// TestHandleEmptyArrays tests empty array handling. +func TestHandleEmptyArrays(t *testing.T) { + data := map[string]any{"items": []any{}} + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + if encoded == "" { + t.Error("Expected non-empty output for object with empty array") + } +} diff --git a/constants.go b/constants.go new file mode 100644 index 0000000..acb6c0e --- /dev/null +++ b/constants.go @@ -0,0 +1,50 @@ +// Package zon provides a ZON (Zero Overhead Notation) encoder and decoder. +// ZON is a compact, human-readable format for encoding JSON data, +// optimized for LLM token efficiency. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon + +// ZON Protocol Constants v1.0.5 + +// Format markers +const ( + // TableMarker is the @ symbol indicating table structure + TableMarker = '@' + // MetaSeparator is the colon separating keys and values + MetaSeparator = ':' +) + +// Reserved tokens (for future use) +const ( + // GasToken is a placeholder token + GasToken = '_' + // LiquidToken is a variable token + LiquidToken = '^' +) + +// DefaultAnchorInterval is the default interval for large datasets +const DefaultAnchorInterval = 100 + +// Security limits (DOS prevention) +const ( + // MaxDocumentSize is the maximum document size in bytes (100 MB) + MaxDocumentSize = 100 * 1024 * 1024 + // MaxLineLength is the maximum line length in bytes (1 MB) + MaxLineLength = 1024 * 1024 + // MaxArrayLength is the maximum array length (1 million items) + MaxArrayLength = 1_000_000 + // MaxObjectKeys is the maximum number of object keys (100K keys) + MaxObjectKeys = 100_000 + // MaxNestingDepth is the maximum nesting depth (100 levels) + MaxNestingDepth = 100 +) + +// Legacy compatibility (v1.x) +const ( + // LegacyTableMarker for backward compatibility + LegacyTableMarker = '@' + // InlineThresholdRows for metadata flattening + InlineThresholdRows = 0 +) diff --git a/decoder.go b/decoder.go new file mode 100644 index 0000000..73d2e52 --- /dev/null +++ b/decoder.go @@ -0,0 +1,829 @@ +// Package zon provides ZON encoding and decoding functionality. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon + +import ( + "encoding/json" + "regexp" + "strconv" + "strings" +) + +// DecodeOptions contains options for decoding. +type DecodeOptions struct { + Strict bool // Enable strict validation (default: true) +} + +// Decoder decodes ZON format to Go data structures. +type Decoder struct { + strict bool + currentLine int +} + +// NewDecoder creates a new ZON decoder with the given options. +func NewDecoder(options *DecodeOptions) *Decoder { + strict := true + if options != nil { + strict = options.Strict + } + return &Decoder{ + strict: strict, + currentLine: 0, + } +} + +// tableInfo holds information about a table being parsed. +type tableInfo struct { + cols []string + omittedCols []string + rows []map[string]any + prevVals map[string]any + rowIndex int + expectedRows int +} + +// Decode decodes ZON format string to original data structure. +func (d *Decoder) Decode(zonStr string) (any, error) { + if zonStr == "" { + return map[string]any{}, nil + } + + // Security: Check document size + if len(zonStr) > MaxDocumentSize { + return nil, NewDecodeError( + ErrDocumentTooLarge, + "Document size exceeds maximum (100MB)", + 0, "", + ) + } + + lines := strings.Split(strings.TrimSpace(zonStr), "\n") + if len(lines) == 0 { + return map[string]any{}, nil + } + + // Special case: Root-level ZON list + if len(lines) == 1 { + line := strings.TrimSpace(lines[0]) + if strings.HasPrefix(line, "[") { + return d.parseZonNode(line, 0) + } + + // Check for colon-less object/array pattern + hasBlock := regexp.MustCompile(`^[a-zA-Z0-9_]+\s*[\{\[]`).MatchString(line) + if !strings.Contains(line, string(MetaSeparator)) && !strings.HasPrefix(line, string(TableMarker)) && !hasBlock { + return d.parsePrimitive(line), nil + } + } + + // Main decode loop + metadata := make(map[string]any) + tables := make(map[string]*tableInfo) + var currentTable *tableInfo + var currentTableName string + + for _, line := range lines { + trimmedLine := strings.TrimRight(line, " \t\r") + + // Security: Check line length + if len(trimmedLine) > MaxLineLength { + return nil, NewDecodeError( + ErrLineTooLong, + "Line length exceeds maximum (1MB)", + d.currentLine, "", + ) + } + + // Skip blank lines + if trimmedLine == "" { + continue + } + + // Table header (Anonymous or Legacy): @... + if strings.HasPrefix(trimmedLine, string(TableMarker)) { + tableName, tInfo, err := d.parseTableHeader(trimmedLine) + if err != nil { + return nil, err + } + currentTableName = tableName + currentTable = tInfo + tables[currentTableName] = currentTable + } else if currentTable != nil && currentTable.rowIndex < currentTable.expectedRows { + // Table row + row, err := d.parseTableRow(trimmedLine, currentTable) + if err != nil { + return nil, err + } + currentTable.rows = append(currentTable.rows, row) + + // If we've read all rows, exit table mode + if currentTable.rowIndex >= currentTable.expectedRows { + currentTable = nil + } + } else { + // Metadata line OR Named Table + splitIdx, splitChar := d.findSplitPoint(trimmedLine) + + if splitIdx != -1 { + var key, val string + if splitChar == ':' { + key = strings.TrimSpace(trimmedLine[:splitIdx]) + val = strings.TrimSpace(trimmedLine[splitIdx+1:]) + } else { + // Split at { or [ (include it in value) + key = strings.TrimSpace(trimmedLine[:splitIdx]) + val = strings.TrimSpace(trimmedLine[splitIdx:]) + } + + // Check if it's a named table start + if strings.HasPrefix(val, string(TableMarker)) { + _, tInfo, err := d.parseTableHeader(val) + if err != nil { + return nil, err + } + currentTableName = key + currentTable = tInfo + tables[currentTableName] = currentTable + } else { + currentTable = nil + parsedVal, err := d.parseValue(val) + if err != nil { + return nil, err + } + metadata[key] = parsedVal + } + } + } + } + + // Recombine tables into metadata + for tableName, table := range tables { + // Strict mode: validate row count + if d.strict && len(table.rows) != table.expectedRows { + return nil, NewDecodeError( + ErrRowCountMismatch, + "Row count mismatch in table '"+tableName+"': expected "+strconv.Itoa(table.expectedRows)+", got "+strconv.Itoa(len(table.rows)), + 0, "Table: "+tableName, + ) + } + metadata[tableName] = d.reconstructTable(table) + } + + // Unflatten dotted keys + result := d.unflatten(metadata) + + // Unwrap pure lists: if only key is 'data', return the list directly + if resultMap, ok := result.(map[string]any); ok { + if len(resultMap) == 1 { + if data, ok := resultMap["data"]; ok { + if arr, ok := data.([]any); ok { + return arr, nil + } + } + } + } + + return result, nil +} + +// findSplitPoint finds the split point in a metadata line. +func (d *Decoder) findSplitPoint(line string) (int, byte) { + splitIdx := -1 + var splitChar byte + depth := 0 + inQuote := false + + for i := 0; i < len(line); i++ { + char := line[i] + if char == '"' { + inQuote = !inQuote + } + if !inQuote { + if char == '{' || char == '[' { + depth++ + if depth == 1 && splitIdx == -1 { + // We just entered a block + splitIdx = i + splitChar = char + break + } + } + if char == '}' || char == ']' { + depth-- + } + if char == ':' && depth == 0 { + splitIdx = i + splitChar = ':' + break + } + } + } + + return splitIdx, splitChar +} + +// parseTableHeader parses a table header line. +func (d *Decoder) parseTableHeader(line string) (string, *tableInfo, error) { + // Try v2.0 format with name: @name(count)[col][col]:columns + v2NamedPattern := regexp.MustCompile(`^@(\w+)\((\d+)\)(\[\w+\])*:(.+)$`) + if matches := v2NamedPattern.FindStringSubmatch(line); matches != nil { + tableName := matches[1] + count, _ := strconv.Atoi(matches[2]) + omittedStr := matches[3] + colsStr := matches[4] + + var omittedCols []string + if omittedStr != "" { + omittedPattern := regexp.MustCompile(`\[(\w+)\]`) + omittedMatches := omittedPattern.FindAllStringSubmatch(omittedStr, -1) + for _, m := range omittedMatches { + omittedCols = append(omittedCols, m[1]) + } + } + + cols := strings.Split(colsStr, ",") + for i := range cols { + cols[i] = strings.TrimSpace(cols[i]) + } + + return tableName, &tableInfo{ + cols: cols, + omittedCols: omittedCols, + rows: []map[string]any{}, + prevVals: make(map[string]any), + rowIndex: 0, + expectedRows: count, + }, nil + } + + // Try v2.1 format (anonymous/value): @(count)[col]:columns + v2ValuePattern := regexp.MustCompile(`^@\((\d+)\)(\[\w+\])*:(.+)$`) + if matches := v2ValuePattern.FindStringSubmatch(line); matches != nil { + count, _ := strconv.Atoi(matches[1]) + omittedStr := matches[2] + colsStr := matches[3] + + var omittedCols []string + if omittedStr != "" { + omittedPattern := regexp.MustCompile(`\[(\w+)\]`) + omittedMatches := omittedPattern.FindAllStringSubmatch(omittedStr, -1) + for _, m := range omittedMatches { + omittedCols = append(omittedCols, m[1]) + } + } + + cols := strings.Split(colsStr, ",") + for i := range cols { + cols[i] = strings.TrimSpace(cols[i]) + } + + return "data", &tableInfo{ + cols: cols, + omittedCols: omittedCols, + rows: []map[string]any{}, + prevVals: make(map[string]any), + rowIndex: 0, + expectedRows: count, + }, nil + } + + // Try v2.0 format (anonymous): @count[col][col]:columns + v2Pattern := regexp.MustCompile(`^@(\d+)(\[\w+\])*:(.+)$`) + if matches := v2Pattern.FindStringSubmatch(line); matches != nil { + count, _ := strconv.Atoi(matches[1]) + omittedStr := matches[2] + colsStr := matches[3] + + var omittedCols []string + if omittedStr != "" { + omittedPattern := regexp.MustCompile(`\[(\w+)\]`) + omittedMatches := omittedPattern.FindAllStringSubmatch(omittedStr, -1) + for _, m := range omittedMatches { + omittedCols = append(omittedCols, m[1]) + } + } + + cols := strings.Split(colsStr, ",") + for i := range cols { + cols[i] = strings.TrimSpace(cols[i]) + } + + return "data", &tableInfo{ + cols: cols, + omittedCols: omittedCols, + rows: []map[string]any{}, + prevVals: make(map[string]any), + rowIndex: 0, + expectedRows: count, + }, nil + } + + // Fallback to v1.x format: @tablename(count):cols + v1Pattern := regexp.MustCompile(`^@(\w+)\((\d+)\):(.+)$`) + if matches := v1Pattern.FindStringSubmatch(line); matches != nil { + tableName := matches[1] + count, _ := strconv.Atoi(matches[2]) + colsStr := matches[3] + + cols := strings.Split(colsStr, ",") + for i := range cols { + cols[i] = strings.TrimSpace(cols[i]) + } + + return tableName, &tableInfo{ + cols: cols, + rows: []map[string]any{}, + prevVals: make(map[string]any), + rowIndex: 0, + expectedRows: count, + }, nil + } + + return "", nil, NewDecodeError(ErrMalformedHeader, "Invalid table header: "+line, 0, "") +} + +// parseTableRow parses a table row. +func (d *Decoder) parseTableRow(line string, table *tableInfo) (map[string]any, error) { + tokens := d.splitByDelimiter(line, ',') + + // Strict mode: validate field count + coreFieldCount := len(tokens) + sparseFieldCount := 0 + + // Count sparse fields + for i := len(table.cols); i < len(tokens); i++ { + tok := tokens[i] + if strings.Contains(tok, ":") && !d.isURL(tok) && !d.isTimestamp(tok) { + sparseFieldCount++ + } + } + + // In strict mode, core fields must match column count + if d.strict && coreFieldCount < len(table.cols) && sparseFieldCount == 0 { + return nil, NewDecodeError( + ErrFieldCountMismatch, + "Field count mismatch on row "+strconv.Itoa(table.rowIndex+1)+": expected "+strconv.Itoa(len(table.cols))+" fields, got "+strconv.Itoa(coreFieldCount), + d.currentLine, + truncateString(line, 50), + ) + } + + // Pad if needed + for len(tokens) < len(table.cols) { + tokens = append(tokens, "") + } + + row := make(map[string]any) + tokenIdx := 0 + + // Parse core columns + for _, col := range table.cols { + if tokenIdx < len(tokens) { + tok := tokens[tokenIdx] + val, err := d.parseValue(tok) + if err != nil { + return nil, err + } + row[col] = val + tokenIdx++ + } + } + + // Parse optional fields (sparse encoding) + for tokenIdx < len(tokens) { + tok := tokens[tokenIdx] + if strings.Contains(tok, ":") && !d.isURL(tok) && !d.isTimestamp(tok) { + colonIdx := strings.Index(tok, ":") + key := strings.TrimSpace(tok[:colonIdx]) + val := strings.TrimSpace(tok[colonIdx+1:]) + + // Validate key is a simple identifier + if regexp.MustCompile(`^[a-zA-Z_]\w*$`).MatchString(key) { + parsedVal, err := d.parseValue(val) + if err != nil { + return nil, err + } + row[key] = parsedVal + } + } + tokenIdx++ + } + + // Reconstruct omitted sequential columns + for _, col := range table.omittedCols { + row[col] = table.rowIndex + 1 + } + + table.rowIndex++ + return row, nil +} + +// isURL checks if string is a URL. +func (d *Decoder) isURL(s string) bool { + return strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://") || strings.HasPrefix(s, "/") +} + +// isTimestamp checks if string is a timestamp with colons. +func (d *Decoder) isTimestamp(s string) bool { + if matched, _ := regexp.MatchString(`^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}`, s); matched { + return true + } + if matched, _ := regexp.MatchString(`^\d{2}:\d{2}:\d{2}`, s); matched { + return true + } + return false +} + +// reconstructTable reconstructs table from parsed rows. +func (d *Decoder) reconstructTable(table *tableInfo) []any { + result := make([]any, len(table.rows)) + for i, row := range table.rows { + result[i] = d.unflatten(row) + } + return result +} + +// parseZonNode parses a ZON nested format. +func (d *Decoder) parseZonNode(text string, depth int) (any, error) { + if depth > MaxNestingDepth { + return nil, NewDecodeError(ErrNestingTooDeep, "Maximum nesting depth exceeded (100)", 0, "") + } + + trimmed := strings.TrimSpace(text) + if trimmed == "" { + return nil, nil + } + + // Dict: {k:v,k:v} + if strings.HasPrefix(trimmed, "{") && strings.HasSuffix(trimmed, "}") { + content := strings.TrimSpace(trimmed[1 : len(trimmed)-1]) + if content == "" { + return map[string]any{}, nil + } + + obj := make(map[string]any) + pairs := d.splitByDelimiter(content, ',') + + // Security: Check object key count + if len(pairs) > MaxObjectKeys { + return nil, NewDecodeError( + ErrTooManyKeys, + "Object key count exceeds maximum (100K keys)", + 0, "", + ) + } + + for _, pair := range pairs { + keyStr, valStr := d.findKeyValueSplit(pair) + if keyStr == "" && valStr == "" { + continue + } + + key := d.parsePrimitive(keyStr) + val, err := d.parseZonNode(valStr, depth+1) + if err != nil { + return nil, err + } + if keyStr, ok := key.(string); ok { + obj[keyStr] = val + } else { + obj[strings.TrimSpace(pair)] = nil + } + } + + return obj, nil + } + + // List: [v,v] + if strings.HasPrefix(trimmed, "[") && strings.HasSuffix(trimmed, "]") { + content := strings.TrimSpace(trimmed[1 : len(trimmed)-1]) + if content == "" { + return []any{}, nil + } + + items := d.splitByDelimiter(content, ',') + + // Security: Check array length + if len(items) > MaxArrayLength { + return nil, NewDecodeError( + ErrArrayTooLarge, + "Array length exceeds maximum (1M items)", + 0, "", + ) + } + + result := make([]any, len(items)) + for i, item := range items { + val, err := d.parseZonNode(item, depth+1) + if err != nil { + return nil, err + } + result[i] = val + } + return result, nil + } + + // Leaf node (primitive) + return d.parsePrimitive(trimmed), nil +} + +// findKeyValueSplit finds the key-value split point in a pair. +func (d *Decoder) findKeyValueSplit(pair string) (string, string) { + splitIdx := -1 + var splitChar byte + inQuote := false + var quoteChar byte + depth := 0 + + for i := 0; i < len(pair); i++ { + char := pair[i] + + if char == '\\' && i+1 < len(pair) { + i++ + continue + } + + if char == '"' || char == '\'' { + if !inQuote { + inQuote = true + quoteChar = char + } else if char == quoteChar { + inQuote = false + } + } else if !inQuote { + if char == ':' { + if depth == 0 { + splitIdx = i + splitChar = ':' + break + } + } else if char == '{' || char == '[' { + if depth == 0 && splitIdx == -1 { + splitIdx = i + splitChar = char + break + } + depth++ + } else if char == '}' || char == ']' { + depth-- + } + } + } + + if splitIdx != -1 { + if splitChar == ':' { + return strings.TrimSpace(pair[:splitIdx]), strings.TrimSpace(pair[splitIdx+1:]) + } + // Split at { or [ (include it in value) + return strings.TrimSpace(pair[:splitIdx]), strings.TrimSpace(pair[splitIdx:]) + } + + return "", "" +} + +// splitByDelimiter splits text by delimiter, respecting quotes and nesting. +func (d *Decoder) splitByDelimiter(text string, delim rune) []string { + var parts []string + var current strings.Builder + inQuote := false + var quoteChar rune + depth := 0 + + for i, char := range text { + // Handle escaped characters + if char == '\\' && i+1 < len(text) { + current.WriteRune(char) + // We'll get the next character in the next iteration + continue + } + + if i > 0 && text[i-1] == '\\' { + current.WriteRune(char) + continue + } + + if char == '"' || char == '\'' { + if !inQuote { + inQuote = true + quoteChar = char + } else if char == quoteChar { + inQuote = false + quoteChar = 0 + } + current.WriteRune(char) + } else if !inQuote { + if char == '{' || char == '[' { + depth++ + current.WriteRune(char) + } else if char == '}' || char == ']' { + depth-- + current.WriteRune(char) + } else if char == delim && depth == 0 { + parts = append(parts, current.String()) + current.Reset() + } else { + current.WriteRune(char) + } + } else { + current.WriteRune(char) + } + } + + if current.Len() > 0 { + parts = append(parts, current.String()) + } + + return parts +} + +// parsePrimitive parses a primitive value. +func (d *Decoder) parsePrimitive(val string) any { + trimmed := strings.TrimSpace(val) + valLower := strings.ToLower(trimmed) + + // Booleans + if valLower == "t" || valLower == "true" { + return true + } + if valLower == "f" || valLower == "false" { + return false + } + + // Null + if valLower == "null" || valLower == "none" || valLower == "nil" { + return nil + } + + // Quoted string (JSON style) + if strings.HasPrefix(trimmed, `"`) { + var s string + if err := json.Unmarshal([]byte(trimmed), &s); err == nil { + return s + } + } + + // Try number + if trimmed != "" { + if i, err := strconv.ParseInt(trimmed, 10, 64); err == nil { + return float64(i) + } + if f, err := strconv.ParseFloat(trimmed, 64); err == nil { + return f + } + } + + // String + return trimmed +} + +// parseValue parses a cell value. +func (d *Decoder) parseValue(val string) (any, error) { + trimmed := strings.TrimSpace(val) + + // Quoted string (JSON style) + if strings.HasPrefix(trimmed, `"`) { + var decoded any + if err := json.Unmarshal([]byte(trimmed), &decoded); err == nil { + // If decoded value is a string that looks like a ZON structure, parse it recursively + if s, ok := decoded.(string); ok { + stripped := strings.TrimSpace(s) + if strings.HasPrefix(stripped, "{") || strings.HasPrefix(stripped, "[") { + return d.parseZonNode(stripped, 0) + } + } + return decoded, nil + } + + // Fallback: CSV unquoting + if strings.HasSuffix(trimmed, `"`) { + unquoted := trimmed[1 : len(trimmed)-1] + unquoted = strings.ReplaceAll(unquoted, `""`, `"`) + + // Try to parse unquoted value as JSON + var decoded any + if err := json.Unmarshal([]byte(unquoted), &decoded); err == nil { + if s, ok := decoded.(string); ok { + stripped := strings.TrimSpace(s) + if strings.HasPrefix(stripped, "{") || strings.HasPrefix(stripped, "[") { + return d.parseZonNode(stripped, 0) + } + } + return decoded, nil + } + + // Check for ZON structure in unquoted string + stripped := strings.TrimSpace(unquoted) + if strings.HasPrefix(stripped, "{") || strings.HasPrefix(stripped, "[") { + return d.parseZonNode(stripped, 0) + } + + return unquoted, nil + } + } + + // Booleans (case-insensitive) + valLower := strings.ToLower(trimmed) + if valLower == "t" || valLower == "true" { + return true, nil + } + if valLower == "f" || valLower == "false" { + return false, nil + } + + // Null + if valLower == "null" || valLower == "none" || valLower == "nil" { + return nil, nil + } + + // Check for ZON-style nested structures + if strings.HasPrefix(trimmed, "{") || strings.HasPrefix(trimmed, "[") { + return d.parseZonNode(trimmed, 0) + } + + // Try number + if trimmed != "" { + if i, err := strconv.ParseInt(trimmed, 10, 64); err == nil { + return float64(i), nil + } + if f, err := strconv.ParseFloat(trimmed, 64); err == nil { + return f, nil + } + } + + // Double-encoded JSON string fallback + if strings.HasPrefix(trimmed, `"`) && strings.HasSuffix(trimmed, `"`) { + var s string + if err := json.Unmarshal([]byte(trimmed), &s); err == nil { + return s, nil + } + } + + return trimmed, nil +} + +// unflatten unflattens dictionary with dotted keys. +func (d *Decoder) unflatten(data map[string]any) any { + result := make(map[string]any) + + for key, value := range data { + if !strings.Contains(key, ".") { + result[key] = value + continue + } + + parts := strings.Split(key, ".") + + // SECURITY: Prevent prototype pollution + skip := false + for _, p := range parts { + if p == "__proto__" || p == "constructor" || p == "prototype" { + skip = true + break + } + } + if skip { + continue + } + + target := result + for i := 0; i < len(parts)-1; i++ { + part := parts[i] + + if _, ok := target[part]; !ok { + target[part] = make(map[string]any) + } + + if nested, ok := target[part].(map[string]any); ok { + target = nested + } else { + break + } + } + + finalKey := parts[len(parts)-1] + target[finalKey] = value + } + + return result +} + +// truncateString truncates a string to the specified length. +func truncateString(s string, maxLen int) string { + if len(s) <= maxLen { + return s + } + return s[:maxLen] + "..." +} + +// Decode decodes ZON format string to original data. +// Default is strict mode. +func Decode(data string) (any, error) { + return NewDecoder(&DecodeOptions{Strict: true}).Decode(data) +} + +// DecodeWithOptions decodes ZON format string with custom options. +func DecodeWithOptions(data string, options *DecodeOptions) (any, error) { + return NewDecoder(options).Decode(data) +} diff --git a/doc.go b/doc.go new file mode 100644 index 0000000..e808bc6 --- /dev/null +++ b/doc.go @@ -0,0 +1,33 @@ +// Package zon provides a ZON (Zero Overhead Notation) encoder and decoder. +// +// ZON is a compact, human-readable format for encoding JSON data, +// optimized for LLM token efficiency. It achieves 35-50% token reduction +// vs JSON through tabular encoding, single-character primitives, and +// intelligent compression while maintaining 100% data fidelity. +// +// Basic usage: +// +// import "github.com/ZON-Format/zon-go" +// +// // Encode data to ZON format +// data := map[string]any{ +// "users": []any{ +// map[string]any{"id": 1, "name": "Alice", "active": true}, +// map[string]any{"id": 2, "name": "Bob", "active": false}, +// }, +// } +// zonStr, err := zon.Encode(data) +// +// // Decode ZON format back to data +// decoded, err := zon.Decode(zonStr) +// +// The encoder automatically detects uniform arrays of objects and encodes +// them in tabular format for maximum token efficiency: +// +// users:@(2):active,id,name +// T,1,Alice +// F,2,Bob +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon diff --git a/docs/api-reference.md b/docs/api-reference.md new file mode 100644 index 0000000..0e36fa1 --- /dev/null +++ b/docs/api-reference.md @@ -0,0 +1,316 @@ +# ZON API Reference + +Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) + +Complete API documentation for `zon-go` v1.0.5. + +## Installation + +```bash +go get github.com/ZON-Format/zon-go +``` + +--- + +## Encoding Functions + +### `Encode(data any) (string, error)` + +Encodes Go data to ZON format. + +**Parameters:** +- `data` (`any`) - Go data to encode (maps, slices, primitives) + +**Returns:** `(string, error)` - ZON-formatted string or error + +**Example:** +```go +import zon "github.com/ZON-Format/zon-go" + +data := map[string]any{ + "users": []any{ + map[string]any{"id": 1, "name": "Alice", "active": true}, + map[string]any{"id": 2, "name": "Bob", "active": false}, + }, +} + +encoded, err := zon.Encode(data) +if err != nil { + log.Fatal(err) +} +fmt.Println(encoded) +``` + +**Output:** +```zon +users:@(2):active,id,name +T,1,Alice +F,2,Bob +``` + +**Supported Types:** +- ✅ Maps (`map[string]any`) +- ✅ Slices (`[]any`) +- ✅ Strings +- ✅ Numbers (int, int64, float64) +- ✅ Booleans (`T`/`F`) +- ✅ Nil (`null`) + +**Encoding Behavior:** +- **Uniform arrays** → Table format (`@(N):columns`) +- **Nested objects** → Quoted notation (`"{key:value}"`) +- **Primitive arrays** → Inline format (`"[a,b,c]"`) +- **Booleans** → `T`/`F` (single character) +- **Null** → `null` + +--- + +## Decoding Functions + +### `Decode(zonString string) (any, error)` + +Decodes a ZON format string back to the original Go data structure. +Uses strict mode by default. + +**Parameters:** +- `zonString` (`string`): The ZON-formatted string to decode + +**Returns:** `(any, error)` - Decoded data or error + +**Example:** +```go +import zon "github.com/ZON-Format/zon-go" + +zonData := `users:@(2):id,name +1,Alice +2,Bob` + +decoded, err := zon.Decode(zonData) +if err != nil { + log.Fatal(err) +} +fmt.Println(decoded) +``` + +**Output:** +```go +map[string]any{ + "users": []any{ + map[string]any{"id": 1, "name": "Alice"}, + map[string]any{"id": 2, "name": "Bob"}, + }, +} +``` + +### `DecodeWithOptions(zonString string, options *DecodeOptions) (any, error)` + +Decodes ZON with custom options. + +**Parameters:** +- `zonString` (`string`): The ZON-formatted string to decode +- `options` (`*DecodeOptions`): Decoding options + +**DecodeOptions:** +```go +type DecodeOptions struct { + Strict bool // Enable strict validation (default: true) +} +``` + +**Example:** +```go +// Non-strict mode - allows row/field count mismatches +decoded, err := zon.DecodeWithOptions(zonData, &zon.DecodeOptions{Strict: false}) +``` + +--- + +## Error Types + +### `DecodeError` + +Error type for decoding failures and strict mode validation errors. + +**Fields:** +- `Code` (`string`): Error code (e.g., "E001", "E002") +- `Message` (`string`): Error description +- `Line` (`int`): Line number (0 if unknown) +- `Column` (`int`): Column position (0 if unknown) +- `Context` (`string`): Context snippet + +**Example:** +```go +import zon "github.com/ZON-Format/zon-go" + +decoded, err := zon.Decode(invalidZon) +if err != nil { + if decodeErr, ok := err.(*zon.DecodeError); ok { + fmt.Println(decodeErr.Code) // "E001" + fmt.Println(decodeErr.Message) // "Row count mismatch..." + fmt.Println(decodeErr.Context) // "Table: users" + } +} +``` + +### Common Error Codes + +| Code | Description | Example | +|------|-------------|---------| +| `E001` | Row count mismatch | Declared `@(3)` but only 2 rows provided | +| `E002` | Field count mismatch | Declared 3 columns but row has 2 values | +| `E301` | Document size exceeds 100MB | Prevents memory exhaustion | +| `E302` | Line length exceeds 1MB | Prevents buffer overflow | +| `E303` | Array length exceeds 1M items | Prevents excessive iteration | +| `E304` | Object key count exceeds 100K | Prevents hash collision | + +--- + +## Constants + +### Security Limits + +```go +const ( + MaxDocumentSize = 100 * 1024 * 1024 // 100 MB + MaxLineLength = 1024 * 1024 // 1 MB + MaxArrayLength = 1_000_000 // 1 million items + MaxObjectKeys = 100_000 // 100K keys + MaxNestingDepth = 100 // 100 levels +) +``` + +### Error Codes + +```go +const ( + ErrRowCountMismatch = "E001" + ErrFieldCountMismatch = "E002" + ErrDocumentTooLarge = "E301" + ErrLineTooLong = "E302" + ErrArrayTooLarge = "E303" + ErrTooManyKeys = "E304" +) +``` + +--- + +## Complete Examples + +### Example 1: Simple Object + +```go +data := map[string]any{ + "name": "ZON Format", + "version": "1.0.5", + "active": true, + "score": 98.5, +} + +encoded, _ := zon.Encode(data) +// active:T +// name:ZON Format +// score:98.5 +// version:"1.0.5" + +decoded, _ := zon.Decode(encoded) +// map[string]any{"name": "ZON Format", "version": "1.0.5", "active": true, "score": 98.5} +``` + +### Example 2: Uniform Table + +```go +data := map[string]any{ + "employees": []any{ + map[string]any{"id": 1, "name": "Alice", "dept": "Eng", "salary": 85000}, + map[string]any{"id": 2, "name": "Bob", "dept": "Sales", "salary": 72000}, + map[string]any{"id": 3, "name": "Carol", "dept": "HR", "salary": 65000}, + }, +} + +encoded, _ := zon.Encode(data) +// employees:@(3):dept,id,name,salary +// Eng,1,Alice,85000 +// Sales,2,Bob,72000 +// HR,3,Carol,65000 + +decoded, _ := zon.Decode(encoded) +// Identical to original! +``` + +### Example 3: Mixed Structure + +```go +data := map[string]any{ + "metadata": map[string]any{"version": "1.0", "env": "prod"}, + "users": []any{ + map[string]any{"id": 1, "name": "Alice"}, + map[string]any{"id": 2, "name": "Bob"}, + }, + "tags": []any{"nodejs", "typescript", "llm"}, +} + +encoded, _ := zon.Encode(data) +decoded, _ := zon.Decode(encoded) +// Identical to original! +``` + +--- + +## Round-Trip Compatibility + +ZON **guarantees lossless round-trips**: + +```go +import zon "github.com/ZON-Format/zon-go" + +func testRoundTrip(data any) bool { + encoded, err := zon.Encode(data) + if err != nil { + return false + } + decoded, err := zon.Decode(encoded) + if err != nil { + return false + } + return reflect.DeepEqual(data, decoded) +} + +// All these pass: +testRoundTrip(map[string]any{"name": "test", "value": 123}) // ✅ +testRoundTrip([]any{1, 2, 3, 4, 5}) // ✅ +testRoundTrip([]any{map[string]any{"id": 1}}) // ✅ +testRoundTrip(nil) // ✅ +testRoundTrip("hello") // ✅ +``` + +**Verified:** +- ✅ 94/94 unit tests pass +- ✅ All datasets verified +- ✅ Zero data loss + +--- + +## CLI Usage + +Build and install: +```bash +go install github.com/ZON-Format/zon-go/cmd/zon@latest +``` + +Commands: +```bash +# Encode JSON to ZON +zon encode data.json > data.zonf + +# Decode ZON to JSON +zon decode data.zonf > output.json +``` + +--- + +## See Also + +- [Syntax Cheatsheet](./syntax-cheatsheet.md) - Quick reference +- [Format Specification](../SPEC.md) - Formal grammar +- [LLM Best Practices](./llm-best-practices.md) - Usage guide +- [GitHub Repository](https://github.com/ZON-Format/zon-go) diff --git a/docs/llm-best-practices.md b/docs/llm-best-practices.md new file mode 100644 index 0000000..92181d6 --- /dev/null +++ b/docs/llm-best-practices.md @@ -0,0 +1,336 @@ +# Using ZON with LLMs - Best Practices + +Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) + +Guide for maximizing ZON's effectiveness in LLM applications. + +## Why ZON for LLMs? + +LLM API costs are directly tied to token count. ZON reduces tokens by **23.8% vs JSON** while achieving **100% retrieval accuracy**. + +**Key Benefits:** +- 💰 **Lower costs**: Fewer tokens = lower API bills +- 🎯 **Better accuracy**: 100% vs JSON's 91.7% +- 📊 **Self-documenting**: Explicit headers `@(N):columns` +- 🔍 **Human-readable**: Easy to debug and verify + +--- + +## Sending ZON as Input + +### Basic Pattern + +Wrap ZON data in code blocks with format label: + +````markdown +Here's the user data in ZON format: + +```zon +users:@(3):active,id,name,role +T,1,Alice,admin +T,2,Bob,user +F,3,Carol,guest +``` + +Question: How many active users are there? +```` + +**Why this works:** +- ✅ Code blocks prevent formatting issues +- ✅ `zon` label helps model recognize format +- ✅ Explicit headers (`@(3):columns`) give clear schema + +### Alternative: No Code Block + +For simple queries, code blocks aren't required: + +``` +Data: +users:@(3):id,name,active +1,Alice,T +2,Bob,F +3,Carol,T + +Question: List all active users. +``` + +--- + +## Prompting Strategies + +### Strategy 1: Show the Format (No Explanation) + +**Best approach** - Let the model infer the structure: + +```` +```zon +products:@(4):category,id,name,price,stock +Electronics,1,Laptop,999,45 +Books,2,Python Guide,29.99,120 +Electronics,3,Mouse,19.99,200 +Books,4,JavaScript Basics,24.95,85 +``` + +Find products with stock below 100. +```` + +**Why it works:** The explicit headers (`@(4):category,id,name,price,stock`) are self-documenting. + +### Strategy 2: Minimal Context + +For complex queries, add brief context: + +```` +Data format: ZON (tabular) +@(N) = row count +Column names listed in header + +```zon +logs:@(100):level,message,timestamp,userId +ERROR,Database timeout,2025-01-15T10:30:00Z,1001 +WARN,High memory usage,2025-01-15T10:31:15Z,1002 +ERROR,API rate limit,2025-01-15T10:32:45Z,1001 +... +``` + +How many ERROR logs are from userId 1001? +```` + +--- + +## Common Use Cases + +### 1. Data Retrieval Questions + +**Perfect for ZON** - table format excels here: + +```` +```zon +employees:@(20):active,department,id,name,salary +T,Engineering,1,Alice Chen,95000 +T,Sales,2,Bob Smith,75000 +F,Marketing,3,Carol Lee,68000 +... +``` + +Questions: +1. What's the average salary in Engineering? +2. How many inactive employees are there? +3. List all Sales department employees. +```` + +### 2. Aggregation Tasks + +```` +```zon +transactions:@(1000):amount,category,date,userId +45.99,groceries,2025-01-10,1001 +120.00,electronics,2025-01-10,1002 +23.50,groceries,2025-01-11,1001 +... +``` + +Calculate total spending by category for userId 1001. +```` + +### 3. Filtering and Search + +```` +```zon +products:@(500):category,inStock,name,price,rating +Electronics,T,Laptop Pro,1299,4.5 +Books,F,Python Guide,29.99,4.8 +Electronics,T,USB Mouse,19.99,4.2 +... +``` + +Find all in-stock Electronics with rating above 4.0. +```` + +### 4. Structure Awareness + +```` +```zon +metadata:"{version:1.0.5,env:production,deployed:2025-01-15}" +users:@(5):id,name,active +1,Alice,T +2,Bob,F +3,Carol,T +4,Dan,T +5,Eve,F +config:"{database:{host:localhost,port:5432},cache:{ttl:3600}}" +``` + +Questions: +- What are the top-level keys? +- How many users are in the dataset? +- What's the database port? +```` + +--- + +## Validation and Error Handling + +### Ask Model to Validate + +```` +```zon +users:@(3):id,name,active +1,Alice,T +2,Bob,F +``` + +Before answering: verify the data has exactly 3 rows as declared. +Then answer: How many users are active? +```` + +### Handle Missing Data + +```` +```zon +products:@(4):id,name,price,stock +1,Laptop,999,45 +2,Mouse,19.99,null +3,Keyboard,79.99,0 +4,Monitor,299,15 +``` + +Note: `null` means missing value. +Question: Which products have unknown stock levels? +```` + +--- + +## Optimizing Token Usage + +### Tip 1: Use Compact Field Names + +```zon +# Good ✅ (shorter column names) +u:@(100):id,n,e,a +1,Alice,alice@ex.com,T +2,Bob,bob@ex.com,F + +# Acceptable ❌ (verbose names) +users:@(100):userId,fullName,emailAddress,isActive +1,Alice,alice@ex.com,true +2,Bob,bob@ex.com,false +``` + +**Token savings:** ~20% with compact names + +### Tip 2: Boolean Shorthand + +ZON uses `T`/`F` instead of `true`/`false`: + +```zon +users:@(100):id,name,active,verified +1,Alice,T,T +2,Bob,F,T +3,Carol,T,F +``` + +**Token savings:** ~40% on boolean fields + +### Tip 3: Null Handling + +ZON uses explicit `null`: + +```zon +data:@(50):id,value,note +1,100,null +2,null,Missing value +3,200,null +``` + +**Token savings:** Consistent with JSON, but unambiguous type. + +--- + +## Testing LLM Comprehension + +### Benchmark Your Model + +Test with simple queries first: + +```` +```zon +test:@(3):id,value +1,100 +2,200 +3,300 +``` + +1. How many rows? (Answer: 3) +2. What's the sum of values? (Answer: 600) +3. What's the average? (Answer: 200) +```` + +If model gets these right → ready for complex queries! + +### Common Failure Modes + +1. **Counting mismatch**: Model counts incorrectly + - **Fix**: Add explicit count in question: "The data has @(N) rows..." + +2. **Type confusion**: Model treats `T` as string not boolean + - **Fix**: Remind: "`T`=true, `F`=false" + +3. **Missing columns**: Model assumes column exists + - **Fix**: Headers are explicit - validate first + +--- + +## Complete Example: E-Commerce Query + +```` +Here's today's sales data in ZON format: + +```zon +orders:@(245):amount,category,customerId,orderId,status +129.99,electronics,C1001,ORD5001,shipped +45.50,books,C1002,ORD5002,pending +89.99,electronics,C1001,ORD5003,shipped +23.99,books,C1003,ORD5004,delivered +199.99,electronics,C1004,ORD5005,shipped +... +``` + +Questions: +1. How many orders are from customer C1001? +2. What's the total revenue from electronics? +3. How many orders are still pending? +4. What's the average order value? + +Please analyze the data and provide numerical answers. +```` + +**Why this works:** +- Clear format with `@(245)` count +- Explicit column headers +- Self-documenting structure +- No ambiguity + +--- + +## Quick Reference + +### Do's ✅ +- Use code blocks for formatting +- Include `@(N)` row counts +- List column names explicitly +- Use `T`/`F` for booleans +- Use `null` for null values + +### Don'ts ❌ +- Don't explain ZON syntax (show, don't tell) +- Don't mix formats (stick to ZON) +- Don't omit row counts +- Don't use verbose field names unnecessarily + +--- + +**See also:** +- [Syntax Cheatsheet](./syntax-cheatsheet.md) - Quick reference +- [API Reference](./api-reference.md) - encode/decode functions +- [Format Specification](../SPEC.md) - Formal grammar diff --git a/docs/syntax-cheatsheet.md b/docs/syntax-cheatsheet.md new file mode 100644 index 0000000..b188cbe --- /dev/null +++ b/docs/syntax-cheatsheet.md @@ -0,0 +1,276 @@ +# ZON Syntax Cheatsheet + +Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) + +Quick reference for ZON format syntax. Cross-referenced with actual implementation in v1.0.5. + +## Basic Types + +### Primitives + +```zon +# String (unquoted when safe) +name:Alice + +# Number +score:98.5 +count:42 + +# Boolean (T/F) +active:T +disabled:F + +# Null +value:null +``` + +### Objects + +```zon +# Simple object +name:ZON Format +version:1.0.5 +active:T +score:98.5 +``` + +**JSON equivalent:** +```json +{ + "name": "ZON Format", + "version": "1.0.5", + "active": true, + "score": 98.5 +} +``` + +### Nested Objects + +**Colon-less Syntax (v2.0.5):** +```zon +# Colon is optional if value starts with { or [ +config{database{host:localhost,port:5432},cache{ttl:3600,enabled:T}} +``` + +**Legacy Quoted (v1.x):** +```zon +config:"{database:{host:localhost,port:5432}}" +``` + +--- + +## Arrays + +### Primitive Arrays (Inline) + +```zon +tags:"[nodejs,typescript,llm]" +numbers:"[1,2,3,4,5]" +flags:"[T,F,T]" +``` + +### Tabular Arrays (Uniform Objects) + +**Most efficient form - ZON's specialty** + +```zon +users:@(3):active,id,name,role +T,1,Alice,admin +T,2,Bob,user +F,3,Carol,guest +``` + +**Breakdown:** +- `@(3)` = 3 rows +- `:active,id,name,role` = column headers +- Data rows follow + +**JSON equivalent:** +```json +{ + "users": [ + { "id": 1, "name": "Alice", "role": "admin", "active": true }, + { "id": 2, "name": "Bob", "role": "user", "active": true }, + { "id": 3, "name": "Carol", "role": "guest", "active": false } + ] +} +``` + +### Empty Containers + +```zon +# Empty object +metadata:"{}" + +# Empty array +tags:"[]" +``` + +--- + +## Quoting Rules + +### When Strings NEED Quotes + +1. **Contains special characters**: + - Commas: `"hello, world"` + - Colons: `"key:value"` + - Brackets: `"[test]"` + - Braces: `"{test}"` + +2. **Looks like a literal**: + - `"true"` (string, not boolean) + - `"123"` (string, not number) + - `"false"` (string, not boolean) + - `"null"` (string, not null) + +3. **Leading/trailing spaces**: + - `" padded "` + +4. **Empty string**: + - `""` (MUST quote, otherwise parses as `null`) + +### Safe Unquoted Strings + +```zon +# Alphanumeric + dash, underscore, dot +name:john-doe +file:data_v1.json +host:api.example.com +``` + +--- + +## Table Headers + +### Basic Header (with count) + +```zon +users:@(2):id,name,active +1,Alice,T +2,Bob,F +``` + +**Best practice**: Always include count `@(N)` for explicit schema + +--- + +## Type Conversions + +| ZON | JSON | Notes | +|-----|------|-------| +| `T` | `true` | Boolean true | +| `F` | `false` | Boolean false | +| `null` | `null` | Null value | +| `42` | `42` | Number (integer) | +| `3.14` | `3.14` | Number (float) | +| `hello` | `"hello"` | Unquoted string | +| `"hello"` | `"hello"` | Quoted string | + +--- + +## Common Patterns + +### Config with nested data + +```zon +environment:production +version:"1.0.5" +database:"{host:db.example.com,port:5432,ssl:T}" +features:"{darkMode:F,betaAccess:T}" +``` + +### Mixed structure (tables + metadata) + +```zon +created:"2025-11-28" +total:150 +users:@(3):id,name,status +1,Alice,active +2,Bob,inactive +3,Carol,active +``` + +### Root-level table + +```zon +@(2):id,name,active +1,Alice,T +2,Bob,F +``` + +--- + +## Escape Sequences + +Within quoted strings: +- `\"` - Double quote +- `\\` - Backslash +- `\n` - Newline +- `\r` - Carriage return +- `\t` - Tab + +**Example:** +```zon +message:"Line 1\nLine 2" +path:"C:\\Users\\data" +``` + +--- + +## Complete Example + +**JSON:** +```json +{ + "metadata": { "version": "1.0.5", "env": "production" }, + "users": [ + { "id": 1, "name": "Alice", "active": true, "loginCount": 42 }, + { "id": 2, "name": "Bob", "active": true, "loginCount": 17 }, + { "id": 3, "name": "Carol", "active": false, "loginCount": 3 } + ], + "config": { "database": { "host": "localhost", "port": 5432 } } +} +``` + +**ZON:** +```zon +metadata{version:1.0.5,env:production} +users:@(3):active,id,loginCount,name +T,1,42,Alice +T,2,17,Bob +F,3,3,Carol +config.database{host:localhost,port:5432} +``` + +**Token savings: ~42%** + +--- + +## Tips for LLMs + +1. **Use code blocks**: Wrap ZON in ` ```zon` for syntax highlighting +2. **No hints needed**: Format is self-documenting +3. **Explicit headers**: `@(N)` count helps LLMs validate +4. **Column names**: Listed once, clear schema + +**Example prompt:** +```` +Here's the data in ZON format: + +```zon +users:@(3):id,name,active +1,Alice,T +2,Bob,F +3,Carol,T +``` + +Question: How many active users are there? +```` + +--- + +**See also:** +- [Format Specification](../SPEC.md) - Formal grammar +- [API Reference](./api-reference.md) - encode/decode functions +- [LLM Best Practices](./llm-best-practices.md) - Usage guide diff --git a/encoder.go b/encoder.go new file mode 100644 index 0000000..f08329a --- /dev/null +++ b/encoder.go @@ -0,0 +1,780 @@ +// Package zon provides ZON encoding and decoding functionality. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon + +import ( + "encoding/json" + "fmt" + "math" + "regexp" + "sort" + "strconv" + "strings" +) + +// Encoder encodes Go data structures to ZON format. +type Encoder struct { + anchorInterval int + safeStrRegex *regexp.Regexp +} + +// NewEncoder creates a new ZON encoder with the given anchor interval. +func NewEncoder(anchorInterval int) *Encoder { + if anchorInterval <= 0 { + anchorInterval = DefaultAnchorInterval + } + return &Encoder{ + anchorInterval: anchorInterval, + safeStrRegex: regexp.MustCompile(`^[a-zA-Z0-9_\-\.]+$`), + } +} + +// Encode encodes data to ZON format. +func (e *Encoder) Encode(data any) (string, error) { + visited := make(map[uintptr]bool) + return e.encode(data, visited) +} + +func (e *Encoder) encode(data any, visited map[uintptr]bool) (string, error) { + // Extract primary stream (table data) and metadata + streamData, metadata, streamKey := e.extractPrimaryStream(data) + + // Fallback for simple/empty data + if streamData == nil && len(metadata) == 0 { + switch v := data.(type) { + case map[string]any: + if len(v) == 0 { + return "", nil + } + return e.formatZonNode(v, visited) + case []any: + return e.formatZonNode(v, visited) + default: + b, err := json.Marshal(data) + if err != nil { + return "", fmt.Errorf("failed to encode value of type %T: %w", data, err) + } + return string(b), nil + } + } + + // Special case: Detect schema uniformity for lists of dicts + if arr, ok := data.([]any); ok && len(arr) > 0 { + allObjects := true + for _, item := range arr { + if _, ok := item.(map[string]any); !ok { + allObjects = false + break + } + } + if allObjects { + irregularityScore := e.calculateIrregularity(arr) + if irregularityScore > 0.6 { + return e.formatZonNode(data, visited) + } + } + } + + // If streamKey is empty (pure list input), use default key + if streamData != nil && streamKey == "" { + streamKey = "data" + } + + var output []string + + // Write metadata + if len(metadata) > 0 { + metaLines := e.writeMetadata(metadata, visited) + output = append(output, metaLines...) + } + + // Write table + if streamData != nil && streamKey != "" { + if len(output) > 0 { + output = append(output, "") + } + tableLines, err := e.writeTable(streamData, streamKey, visited) + if err != nil { + return "", err + } + output = append(output, tableLines...) + } + + return strings.Join(output, "\n"), nil +} + +// extractPrimaryStream finds the main table in the data. +func (e *Encoder) extractPrimaryStream(data any) ([]any, map[string]any, string) { + if arr, ok := data.([]any); ok { + // Only promote to table if it contains objects + if len(arr) > 0 { + if _, isObj := arr[0].(map[string]any); isObj { + return arr, map[string]any{}, "" + } + } + // Root-level array of primitives + if len(arr) > 0 { + allPrimitives := true + for _, item := range arr { + switch item.(type) { + case map[string]any, []any: + allPrimitives = false + } + } + if allPrimitives { + return nil, map[string]any{}, "" + } + } + return nil, map[string]any{}, "" + } + + if obj, ok := data.(map[string]any); ok { + // Find largest list of objects + type candidate struct { + key string + arr []any + score int + } + var candidates []candidate + + for k, v := range obj { + if arr, ok := v.([]any); ok && len(arr) > 0 { + if firstObj, ok := arr[0].(map[string]any); ok { + score := len(arr) * len(firstObj) + candidates = append(candidates, candidate{k, arr, score}) + } + } + } + + if len(candidates) > 0 { + // Sort by score descending, then by key alphabetically + sort.Slice(candidates, func(i, j int) bool { + if candidates[i].score != candidates[j].score { + return candidates[i].score > candidates[j].score + } + return candidates[i].key < candidates[j].key + }) + + key := candidates[0].key + stream := candidates[0].arr + meta := make(map[string]any) + for k, v := range obj { + if k != key { + meta[k] = v + } + } + return stream, meta, key + } + } + + if obj, ok := data.(map[string]any); ok { + return nil, obj, "" + } + return nil, map[string]any{}, "" +} + +// writeMetadata writes metadata in YAML-like format. +func (e *Encoder) writeMetadata(metadata map[string]any, visited map[uintptr]bool) []string { + var lines []string + + // Flatten top-level objects (depth 1) + flattened := e.flatten(metadata, "", ".", 1, 0, visited) + + // Sort keys alphabetically + keys := make([]string, 0, len(flattened)) + for k := range flattened { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, key := range keys { + val := flattened[key] + valStr, _ := e.formatValue(val, visited) + + // Colon-less syntax for root metadata if value starts with { or [ + if strings.HasPrefix(valStr, "{") || strings.HasPrefix(valStr, "[") { + lines = append(lines, key+valStr) + } else { + lines = append(lines, key+string(MetaSeparator)+valStr) + } + } + + return lines +} + +// writeTable writes table in v2.0.0 compact format. +func (e *Encoder) writeTable(stream []any, key string, visited map[uintptr]bool) ([]string, error) { + if len(stream) == 0 { + return nil, nil + } + + // Flatten each row + flatStream := make([]map[string]any, len(stream)) + for i, row := range stream { + if rowMap, ok := row.(map[string]any); ok { + flatStream[i] = e.flatten(rowMap, "", ".", 0, 0, visited) + } else { + flatStream[i] = make(map[string]any) + } + } + + // Get all column names + colSet := make(map[string]bool) + for _, d := range flatStream { + for k := range d { + colSet[k] = true + } + } + + cols := make([]string, 0, len(colSet)) + for k := range colSet { + cols = append(cols, k) + } + sort.Strings(cols) + + // Analyze column sparsity + columnStats := e.analyzeColumnSparsity(flatStream, cols) + var coreColumns, optionalColumns []string + for _, stat := range columnStats { + if stat.presence >= 0.7 { + coreColumns = append(coreColumns, stat.name) + } else { + optionalColumns = append(optionalColumns, stat.name) + } + } + + // Decide encoding strategy + useSparseEncoding := len(optionalColumns) > 0 && len(optionalColumns) <= 5 + + if useSparseEncoding { + return e.writeSparseTable(flatStream, coreColumns, optionalColumns, len(stream), key, visited) + } + return e.writeStandardTable(flatStream, cols, len(stream), key, visited) +} + +// writeStandardTable writes a standard compact table. +func (e *Encoder) writeStandardTable(flatStream []map[string]any, cols []string, rowCount int, key string, visited map[uintptr]bool) ([]string, error) { + var lines []string + + // Build header + var header string + if key != "" && key != "data" { + header = fmt.Sprintf("%s%c%c(%d)", key, MetaSeparator, TableMarker, rowCount) + } else { + header = fmt.Sprintf("%c%d", TableMarker, rowCount) + } + + header += string(MetaSeparator) + strings.Join(cols, ",") + lines = append(lines, header) + + // Write rows + for _, row := range flatStream { + var tokens []string + for _, col := range cols { + val, ok := row[col] + if !ok || val == nil { + tokens = append(tokens, "null") + } else { + valStr, err := e.formatValue(val, visited) + if err != nil { + return nil, err + } + tokens = append(tokens, valStr) + } + } + lines = append(lines, strings.Join(tokens, ",")) + } + + return lines, nil +} + +// writeSparseTable writes a sparse table for semi-uniform data. +func (e *Encoder) writeSparseTable(flatStream []map[string]any, coreColumns, optionalColumns []string, rowCount int, key string, visited map[uintptr]bool) ([]string, error) { + var lines []string + + // Build header + var header string + if key != "" && key != "data" { + header = fmt.Sprintf("%s%c%c(%d)", key, MetaSeparator, TableMarker, rowCount) + } else { + header = fmt.Sprintf("%c%d", TableMarker, rowCount) + } + + header += string(MetaSeparator) + strings.Join(coreColumns, ",") + lines = append(lines, header) + + // Write rows + for _, row := range flatStream { + var tokens []string + + // Core columns + for _, col := range coreColumns { + valStr, _ := e.formatValue(row[col], visited) + tokens = append(tokens, valStr) + } + + // Optional columns as key:value if present + for _, col := range optionalColumns { + if val, ok := row[col]; ok && val != nil { + valStr, _ := e.formatValue(val, visited) + tokens = append(tokens, fmt.Sprintf("%s:%s", col, valStr)) + } + } + + lines = append(lines, strings.Join(tokens, ",")) + } + + return lines, nil +} + +// columnStat holds column sparsity information. +type columnStat struct { + name string + presence float64 +} + +// analyzeColumnSparsity analyzes column sparsity to determine core vs optional. +func (e *Encoder) analyzeColumnSparsity(data []map[string]any, cols []string) []columnStat { + stats := make([]columnStat, len(cols)) + for i, col := range cols { + presenceCount := 0 + for _, row := range data { + if val, ok := row[col]; ok && val != nil { + presenceCount++ + } + } + stats[i] = columnStat{ + name: col, + presence: float64(presenceCount) / float64(len(data)), + } + } + return stats +} + +// calculateIrregularity calculates the irregularity score for array of objects. +func (e *Encoder) calculateIrregularity(data []any) float64 { + if len(data) == 0 { + return 0 + } + + // Get all unique keys across all objects + allKeys := make(map[string]bool) + keySets := make([]map[string]bool, len(data)) + + for i, item := range data { + if obj, ok := item.(map[string]any); ok { + keySet := make(map[string]bool) + for k := range obj { + keySet[k] = true + allKeys[k] = true + } + keySets[i] = keySet + } else { + keySets[i] = make(map[string]bool) + } + } + + if len(allKeys) == 0 { + return 0 + } + + // Calculate key overlap score + var totalOverlap float64 + var comparisons int + + for i := 0; i < len(keySets); i++ { + for j := i + 1; j < len(keySets); j++ { + keys1 := keySets[i] + keys2 := keySets[j] + + // Count shared keys + shared := 0 + for k := range keys1 { + if keys2[k] { + shared++ + } + } + + // Jaccard similarity + union := len(keys1) + len(keys2) - shared + var similarity float64 + if union > 0 { + similarity = float64(shared) / float64(union) + } else { + similarity = 1 + } + + totalOverlap += similarity + comparisons++ + } + } + + if comparisons == 0 { + return 0 + } + + avgSimilarity := totalOverlap / float64(comparisons) + return 1 - avgSimilarity +} + +// csvQuote quotes a string for CSV (RFC 4180). +func (e *Encoder) csvQuote(s string) string { + escaped := strings.ReplaceAll(s, `"`, `""`) + return `"` + escaped + `"` +} + +// formatZonNode formats nested structure using ZON syntax. +func (e *Encoder) formatZonNode(val any, visited map[uintptr]bool) (string, error) { + switch v := val.(type) { + case nil: + return "null", nil + case bool: + if v { + return "T", nil + } + return "F", nil + case float64: + return e.formatNumber(v), nil + case float32: + return e.formatNumber(float64(v)), nil + case int: + return strconv.Itoa(v), nil + case int64: + return strconv.FormatInt(v, 10), nil + case int32: + return strconv.FormatInt(int64(v), 10), nil + case uint: + return strconv.FormatUint(uint64(v), 10), nil + case uint64: + return strconv.FormatUint(v, 10), nil + case string: + return e.formatString(v), nil + case map[string]any: + return e.formatObject(v, visited) + case []any: + return e.formatArray(v, visited) + default: + // Try JSON marshaling for other types + b, err := json.Marshal(v) + if err != nil { + return "", err + } + return string(b), nil + } +} + +// formatObject formats a map as ZON object. +func (e *Encoder) formatObject(obj map[string]any, visited map[uintptr]bool) (string, error) { + if len(obj) == 0 { + return "{}", nil + } + + // Sort keys + keys := make([]string, 0, len(obj)) + for k := range obj { + keys = append(keys, k) + } + sort.Strings(keys) + + var items []string + for _, k := range keys { + // Format key + kStr := k + if regexp.MustCompile(`[,:\{\}\[\]"]`).MatchString(kStr) { + b, _ := json.Marshal(kStr) + kStr = string(b) + } + + // Format value recursively + vStr, err := e.formatZonNode(obj[k], visited) + if err != nil { + return "", err + } + + // Colon-less Objects/Arrays + if strings.HasPrefix(vStr, "{") || strings.HasPrefix(vStr, "[") { + items = append(items, kStr+vStr) + } else { + items = append(items, kStr+":"+vStr) + } + } + return "{" + strings.Join(items, ",") + "}", nil +} + +// formatArray formats a slice as ZON array. +func (e *Encoder) formatArray(arr []any, visited map[uintptr]bool) (string, error) { + if len(arr) == 0 { + return "[]", nil + } + + var items []string + for _, item := range arr { + itemStr, err := e.formatZonNode(item, visited) + if err != nil { + return "", err + } + items = append(items, itemStr) + } + return "[" + strings.Join(items, ",") + "]", nil +} + +// formatNumber formats a number canonically without scientific notation. +func (e *Encoder) formatNumber(n float64) string { + // Handle special values + if math.IsNaN(n) || math.IsInf(n, 0) { + return "null" + } + + // Check if integer + if n == math.Trunc(n) { + return strconv.FormatInt(int64(n), 10) + } + + // Format float without scientific notation + s := strconv.FormatFloat(n, 'f', -1, 64) + + // Ensure decimal point for floats + if !strings.Contains(s, ".") { + s += ".0" + } + + return s +} + +// formatString formats a string with minimal quoting. +func (e *Encoder) formatString(s string) string { + // Always JSON-stringify strings with newlines + if strings.Contains(s, "\n") || strings.Contains(s, "\r") { + b, _ := json.Marshal(s) + return string(b) + } + + // ISO Date Detection + if e.isISODate(s) { + return s + } + + // Check if needs type protection + if e.needsTypeProtection(s) { + b, _ := json.Marshal(s) + return string(b) + } + + // Quote empty strings or whitespace-only strings + if strings.TrimSpace(s) == "" { + b, _ := json.Marshal(s) + return string(b) + } + + // Quote if contains structural delimiters + if regexp.MustCompile(`[,\{\}\[\]"]`).MatchString(s) { + b, _ := json.Marshal(s) + return string(b) + } + + return s +} + +// formatValue formats a value with minimal quoting. +func (e *Encoder) formatValue(val any, visited map[uintptr]bool) (string, error) { + if val == nil { + return "null", nil + } + + switch v := val.(type) { + case bool: + if v { + return "T", nil + } + return "F", nil + case float64: + return e.formatNumber(v), nil + case float32: + return e.formatNumber(float64(v)), nil + case int: + return strconv.Itoa(v), nil + case int64: + return strconv.FormatInt(v, 10), nil + case int32: + return strconv.FormatInt(int64(v), 10), nil + case uint: + return strconv.FormatUint(uint64(v), 10), nil + case uint64: + return strconv.FormatUint(v, 10), nil + case string: + return e.formatTableCellString(v), nil + case map[string]any, []any: + return e.formatZonNode(v, visited) + default: + b, err := json.Marshal(v) + if err != nil { + return "", err + } + return string(b), nil + } +} + +// formatTableCellString formats a string for use in a table cell. +func (e *Encoder) formatTableCellString(s string) string { + // Always JSON-stringify strings with newlines + if strings.Contains(s, "\n") || strings.Contains(s, "\r") { + b, _ := json.Marshal(s) + return e.csvQuote(string(b)) + } + + // ISO Date Detection + if e.isISODate(s) { + return s + } + + // Check if needs type protection + if e.needsTypeProtection(s) { + b, _ := json.Marshal(s) + return e.csvQuote(string(b)) + } + + // Check if it needs CSV quoting (delimiters) + if e.needsQuotes(s) { + return e.csvQuote(s) + } + + return s +} + +// isISODate checks if string is an ISO 8601 date/datetime. +func (e *Encoder) isISODate(s string) bool { + // ISO 8601 full datetime with timezone + if matched, _ := regexp.MatchString(`^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(Z|[+-]\d{2}:\d{2})$`, s); matched { + return true + } + // ISO 8601 date only + if matched, _ := regexp.MatchString(`^\d{4}-\d{2}-\d{2}$`, s); matched { + return true + } + // Simple time + if matched, _ := regexp.MatchString(`^\d{2}:\d{2}:\d{2}$`, s); matched { + return true + } + return false +} + +// needsTypeProtection determines if string needs quoting to preserve as string. +func (e *Encoder) needsTypeProtection(s string) bool { + sLower := strings.ToLower(s) + + // Reserved words + reserved := []string{"t", "f", "true", "false", "null", "none", "nil"} + for _, r := range reserved { + if sLower == r { + return true + } + } + + // Gas/Liquid tokens + if s == string(GasToken) || s == string(LiquidToken) { + return true + } + + // Leading/trailing whitespace must be preserved + if strings.TrimSpace(s) != s { + return true + } + + // Control characters need JSON escaping + for _, c := range s { + if c < 32 { + return true + } + } + + // Pure integer + if matched, _ := regexp.MatchString(`^-?\d+$`, s); matched { + return true + } + + // Pure decimal + if matched, _ := regexp.MatchString(`^-?\d+\.\d+$`, s); matched { + return true + } + + // Scientific notation + if matched, _ := regexp.MatchString(`(?i)^-?\d+(\.\d+)?e[+-]?\d+$`, s); matched { + return true + } + + // If it starts/ends with digit but has non-numeric chars, check carefully + if len(s) > 0 && ((s[0] >= '0' && s[0] <= '9') || (s[len(s)-1] >= '0' && s[len(s)-1] <= '9')) { + // Try parsing - if it parses cleanly and matches, it's a number + if n, err := strconv.ParseFloat(s, 64); err == nil && strconv.FormatFloat(n, 'f', -1, 64) == s { + return true + } + } + + return false +} + +// needsQuotes determines if a string needs quotes. +func (e *Encoder) needsQuotes(s string) bool { + if s == "" { + return true + } + + // Reserved tokens need quoting + reserved := []string{"T", "F", "null", string(GasToken), string(LiquidToken)} + for _, r := range reserved { + if s == r { + return true + } + } + + // Quote if it looks like a number + if matched, _ := regexp.MatchString(`^-?\d+$`, s); matched { + return true + } + if _, err := strconv.ParseFloat(s, 64); err == nil { + return true + } + + // Quote if leading/trailing whitespace + if strings.TrimSpace(s) != s { + return true + } + + // Only quote if contains delimiter or control chars + if regexp.MustCompile(`[,\n\r\t"\[\]|;]`).MatchString(s) { + return true + } + + return false +} + +// flatten flattens nested dictionary with depth limit. +func (e *Encoder) flatten(d map[string]any, parent, sep string, maxDepth, currentDepth int, visited map[uintptr]bool) map[string]any { + result := make(map[string]any) + + for k, v := range d { + newKey := k + if parent != "" { + newKey = parent + sep + k + } + + if nested, ok := v.(map[string]any); ok && currentDepth < maxDepth { + // Recursively flatten this level + flattened := e.flatten(nested, newKey, sep, maxDepth, currentDepth+1, visited) + for fk, fv := range flattened { + result[fk] = fv + } + } else { + result[newKey] = v + } + } + + return result +} + +// Encode encodes data to ZON format using a default encoder. +func Encode(data any) (string, error) { + return NewEncoder(DefaultAnchorInterval).Encode(data) +} diff --git a/errors.go b/errors.go new file mode 100644 index 0000000..a5a7ec5 --- /dev/null +++ b/errors.go @@ -0,0 +1,75 @@ +// Package zon provides ZON encoding and decoding functionality. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon + +import ( + "fmt" +) + +// DecodeError represents an error that occurred during ZON decoding. +type DecodeError struct { + Code string // Error code (e.g., "E001", "E002") + Message string // Detailed error message + Line int // Line number where error occurred (0 if unknown) + Column int // Column position (0 if unknown) + Context string // Relevant context snippet +} + +// Error implements the error interface. +func (e *DecodeError) Error() string { + msg := fmt.Sprintf("[%s] %s", e.Code, e.Message) + if e.Line > 0 { + msg += fmt.Sprintf(" (line %d)", e.Line) + } + if e.Context != "" { + msg += fmt.Sprintf("\n Context: %s", e.Context) + } + return msg +} + +// NewDecodeError creates a new DecodeError with the given parameters. +func NewDecodeError(code, message string, line int, context string) *DecodeError { + return &DecodeError{ + Code: code, + Message: message, + Line: line, + Context: context, + } +} + +// Error codes for ZON decoding errors +const ( + // ErrRowCountMismatch indicates the row count doesn't match the declared count + ErrRowCountMismatch = "E001" + // ErrFieldCountMismatch indicates the field count doesn't match the column count + ErrFieldCountMismatch = "E002" + // ErrMalformedHeader indicates an invalid table header format + ErrMalformedHeader = "E003" + // ErrDocumentTooLarge indicates the document exceeds the maximum size + ErrDocumentTooLarge = "E301" + // ErrLineTooLong indicates a line exceeds the maximum length + ErrLineTooLong = "E302" + // ErrArrayTooLarge indicates an array exceeds the maximum length + ErrArrayTooLarge = "E303" + // ErrTooManyKeys indicates an object has too many keys + ErrTooManyKeys = "E304" + // ErrNestingTooDeep indicates the nesting depth exceeds the maximum + ErrNestingTooDeep = "E305" +) + +// EncodeError represents an error that occurred during ZON encoding. +type EncodeError struct { + Message string +} + +// Error implements the error interface. +func (e *EncodeError) Error() string { + return e.Message +} + +// NewEncodeError creates a new EncodeError with the given message. +func NewEncodeError(message string) *EncodeError { + return &EncodeError{Message: message} +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..539832b --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/ZON-Format/zon-go + +go 1.21 diff --git a/security_limits_test.go b/security_limits_test.go new file mode 100644 index 0000000..86343ba --- /dev/null +++ b/security_limits_test.go @@ -0,0 +1,148 @@ +// Package zon_test provides security limit tests (DOS Prevention). +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon_test + +import ( + "strconv" + "strings" + "testing" + + zon "github.com/ZON-Format/zon-go" +) + +// TestDocumentSizeLimit tests document size limit. +func TestDocumentSizeLimit(t *testing.T) { + // Test normal document under limit + doc := strings.Repeat("test:value\n", 1000) + _, err := zon.Decode(doc) + if err != nil { + t.Errorf("Should allow documents under 100MB: %v", err) + } +} + +// TestLineLengthLimit tests line length limit. +func TestLineLengthLimit(t *testing.T) { + // Test long line exceeds 1MB + longLine := "key:" + strings.Repeat("x", zon.MaxLineLength+1) + + _, err := zon.Decode(longLine) + if err == nil { + t.Error("Expected error for long line") + } + decodeErr, ok := err.(*zon.DecodeError) + if !ok { + t.Errorf("Expected DecodeError, got %T", err) + } + if decodeErr.Code != zon.ErrLineTooLong { + t.Errorf("Expected E302, got %s", decodeErr.Code) + } +} + +// TestLineLengthUnderLimit tests lines under limit. +func TestLineLengthUnderLimit(t *testing.T) { + line := "key:" + strings.Repeat("x", 1000) + result, err := zon.Decode(line) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + if resultMap["key"] == nil { + t.Error("Expected key to be defined") + } +} + +// TestArrayLengthLimitDefined tests that array length limit is defined. +func TestArrayLengthLimitDefined(t *testing.T) { + // Verify the limit exists in constants + if zon.MaxArrayLength != 1_000_000 { + t.Errorf("Expected MaxArrayLength 1000000, got %d", zon.MaxArrayLength) + } +} + +// TestObjectKeyLimitDefined tests that object key limit is defined. +func TestObjectKeyLimitDefined(t *testing.T) { + // Verify the limit exists in constants + if zon.MaxObjectKeys != 100_000 { + t.Errorf("Expected MaxObjectKeys 100000, got %d", zon.MaxObjectKeys) + } +} + +// TestObjectsUnderKeyLimit tests objects under 100K keys. +func TestObjectsUnderKeyLimit(t *testing.T) { + var keys []string + for i := 0; i < 100; i++ { + keys = append(keys, "key"+strconv.Itoa(i)+":"+strconv.Itoa(i)) + } + zonData := `data:"{` + strings.Join(keys, ",") + `}"` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + dataMap := resultMap["data"].(map[string]any) + if len(dataMap) != 100 { + t.Errorf("Expected 100 keys, got %d", len(dataMap)) + } +} + +// TestNestingDepthExceeds100 tests that nesting exceeds 100 levels. +func TestNestingDepthExceeds100(t *testing.T) { + nested := strings.Repeat("[", 150) + strings.Repeat("]", 150) + + _, err := zon.Decode(nested) + if err == nil { + t.Error("Expected error for deep nesting") + } + if !strings.Contains(err.Error(), "Maximum nesting depth exceeded") { + t.Errorf("Expected nesting depth error, got: %v", err) + } +} + +// TestNestingDepthUnder100 tests nesting under 100 levels. +func TestNestingDepthUnder100(t *testing.T) { + nested := strings.Repeat("[", 50) + strings.Repeat("]", 50) + + result, err := zon.Decode(nested) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + if result == nil { + t.Error("Expected non-nil result") + } +} + +// TestCombinedLimitsWithNormalData tests normal data within all limits. +func TestCombinedLimitsWithNormalData(t *testing.T) { + zonData := `metadata:"{version:1.0.5,env:prod}" +users:@(3):id,name +1,Alice +2,Bob +3,Carol +tags:"[nodejs,typescript,llm]"` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + if len(users) != 3 { + t.Errorf("Expected 3 users, got %d", len(users)) + } + + metadata := resultMap["metadata"].(map[string]any) + if metadata["version"] != "1.0.5" { + t.Errorf("Expected version 1.0.5, got %v", metadata["version"]) + } + + tags := resultMap["tags"].([]any) + if len(tags) != 3 { + t.Errorf("Expected 3 tags, got %d", len(tags)) + } +} diff --git a/security_test.go b/security_test.go new file mode 100644 index 0000000..baa1b5e --- /dev/null +++ b/security_test.go @@ -0,0 +1,193 @@ +// Package zon_test provides security and robustness tests. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon_test + +import ( + "strings" + "testing" + + zon "github.com/ZON-Format/zon-go" +) + +// TestRejectProtoKeys tests rejection of __proto__ keys. +func TestRejectProtoKeys(t *testing.T) { + malicious := `users:@(1):id,__proto__.polluted +1,true` + + _, err := zon.Decode(malicious) + // Decoding should succeed but not pollute prototype + if err != nil { + return // Error is acceptable + } + + // Verify that object prototype is not polluted + testObj := make(map[string]any) + if _, ok := testObj["polluted"]; ok { + t.Error("Prototype pollution detected via __proto__") + } +} + +// TestRejectConstructorPrototypeKeys tests rejection of constructor.prototype keys. +func TestRejectConstructorPrototypeKeys(t *testing.T) { + malicious := `users:@(1):id,constructor.prototype.polluted +1,true` + + _, err := zon.Decode(malicious) + // Decoding should succeed but not pollute prototype + if err != nil { + return // Error is acceptable + } + + // Verify that object prototype is not polluted + testObj := make(map[string]any) + if _, ok := testObj["polluted"]; ok { + t.Error("Prototype pollution detected via constructor.prototype") + } +} + +// TestDOSDeepNesting tests denial of service via deep nesting. +func TestDOSDeepNesting(t *testing.T) { + depth := 150 + deepZon := strings.Repeat("[", depth) + "]" + strings.Repeat("]", depth-1) + + _, err := zon.Decode(deepZon) + if err == nil { + t.Error("Expected error for deep nesting") + } + if !strings.Contains(err.Error(), "Maximum nesting depth exceeded") { + t.Errorf("Expected nesting depth error, got: %v", err) + } +} + +// TestCircularReferenceInEncoder tests circular reference detection in encoder. +func TestCircularReferenceInEncoder(t *testing.T) { + // Note: In Go, we can't create true circular references with map[string]any + // like we can in JavaScript. The test verifies the encoder handles edge cases. + + // Create a deeply nested structure instead + data := map[string]any{ + "name": "loop", + "nested": map[string]any{ + "deep": map[string]any{ + "value": "test", + }, + }, + } + + _, err := zon.Encode(data) + if err != nil { + t.Errorf("Should handle nested structures: %v", err) + } +} + +// TestNestedStructuresEncode tests encoding of nested structures. +func TestNestedStructuresEncode(t *testing.T) { + data := map[string]any{ + "a": map[string]any{ + "b": map[string]any{ + "c": "value", + }, + }, + } + + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + decodedMap := decoded.(map[string]any) + a := decodedMap["a"].(map[string]any) + b := a["b"].(map[string]any) + if b["c"] != "value" { + t.Errorf("Expected 'value', got %v", b["c"]) + } +} + +// TestMalformedInput tests handling of malformed input. +func TestMalformedInput(t *testing.T) { + tests := []struct { + name string + input string + }{ + {"Unclosed bracket", "[1,2,3"}, + {"Unclosed brace", "{a:1,b:2"}, + {"Extra closing", "[1,2,3]]"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // These may or may not error, but should not panic + defer func() { + if r := recover(); r != nil { + t.Errorf("Panicked on input %q: %v", tt.input, r) + } + }() + zon.Decode(tt.input) + }) + } +} + +// TestSpecialCharactersInKeys tests special characters in keys. +func TestSpecialCharactersInKeys(t *testing.T) { + data := map[string]any{ + "normal": "value1", + "with-dash": "value2", + "with_under": "value3", + "with.dot": "value4", + } + + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + decodedMap := decoded.(map[string]any) + if decodedMap["normal"] != "value1" { + t.Errorf("normal: expected value1, got %v", decodedMap["normal"]) + } + if decodedMap["with-dash"] != "value2" { + t.Errorf("with-dash: expected value2, got %v", decodedMap["with-dash"]) + } +} + +// TestUnicodeStrings tests Unicode string handling. +func TestUnicodeStrings(t *testing.T) { + data := map[string]any{ + "chinese": "王小明", + "emoji": "✅🚀", + "arabic": "مرحبا", + } + + encoded, err := zon.Encode(data) + if err != nil { + t.Fatalf("Encode error: %v", err) + } + + decoded, err := zon.Decode(encoded) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + decodedMap := decoded.(map[string]any) + if decodedMap["chinese"] != "王小明" { + t.Errorf("chinese: expected 王小明, got %v", decodedMap["chinese"]) + } + if decodedMap["emoji"] != "✅🚀" { + t.Errorf("emoji: expected ✅🚀, got %v", decodedMap["emoji"]) + } + if decodedMap["arabic"] != "مرحبا" { + t.Errorf("arabic: expected مرحبا, got %v", decodedMap["arabic"]) + } +} diff --git a/strict_mode_test.go b/strict_mode_test.go new file mode 100644 index 0000000..3de904c --- /dev/null +++ b/strict_mode_test.go @@ -0,0 +1,314 @@ +// Package zon_test provides strict mode validation tests. +// +// Copyright (c) 2025 ZON-FORMAT (Roni Bhakta) +// License: MIT +package zon_test + +import ( + "strings" + "testing" + + zon "github.com/ZON-Format/zon-go" +) + +// TestRowCountMismatchStrictMode tests row count mismatch in strict mode. +func TestRowCountMismatchStrictMode(t *testing.T) { + zonData := `users:@(3):id,name +1,Alice +2,Bob` + + _, err := zon.Decode(zonData) + if err == nil { + t.Error("Expected error for row count mismatch") + } + + decodeErr, ok := err.(*zon.DecodeError) + if !ok { + t.Errorf("Expected DecodeError, got %T", err) + } + if decodeErr.Code != zon.ErrRowCountMismatch { + t.Errorf("Expected E001, got %s", decodeErr.Code) + } + if !strings.Contains(err.Error(), "Row count mismatch") { + t.Errorf("Expected row count mismatch error, got: %v", err) + } +} + +// TestRowCountMismatchNonStrictMode tests row count mismatch in non-strict mode. +func TestRowCountMismatchNonStrictMode(t *testing.T) { + zonData := `users:@(3):id,name +1,Alice +2,Bob` + + result, err := zon.DecodeWithOptions(zonData, &zon.DecodeOptions{Strict: false}) + if err != nil { + t.Fatalf("Should allow fewer rows in non-strict mode: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + if len(users) != 2 { + t.Errorf("Expected 2 users, got %d", len(users)) + } +} + +// TestRowCountMatches tests that matching row count passes. +func TestRowCountMatches(t *testing.T) { + zonData := `users:@(2):id,name +1,Alice +2,Bob` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + if len(users) != 2 { + t.Errorf("Expected 2 users, got %d", len(users)) + } + + user1 := users[0].(map[string]any) + if user1["id"] != float64(1) { + t.Errorf("user1.id: expected 1, got %v", user1["id"]) + } + if user1["name"] != "Alice" { + t.Errorf("user1.name: expected Alice, got %v", user1["name"]) + } +} + +// TestFieldCountMismatchStrictMode tests field count mismatch in strict mode. +func TestFieldCountMismatchStrictMode(t *testing.T) { + zonData := `users:@(2):id,name,role +1,Alice +2,Bob,admin` + + _, err := zon.Decode(zonData) + if err == nil { + t.Error("Expected error for field count mismatch") + } + + decodeErr, ok := err.(*zon.DecodeError) + if !ok { + t.Errorf("Expected DecodeError, got %T", err) + } + if decodeErr.Code != zon.ErrFieldCountMismatch { + t.Errorf("Expected E002, got %s", decodeErr.Code) + } + if !strings.Contains(err.Error(), "Field count mismatch") { + t.Errorf("Expected field count mismatch error, got: %v", err) + } +} + +// TestFieldCountMismatchNonStrictMode tests field count mismatch in non-strict mode. +func TestFieldCountMismatchNonStrictMode(t *testing.T) { + zonData := `users:@(2):id,name,role +1,Alice +2,Bob,admin` + + result, err := zon.DecodeWithOptions(zonData, &zon.DecodeOptions{Strict: false}) + if err != nil { + t.Fatalf("Should allow missing fields in non-strict mode: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + if len(users) != 2 { + t.Errorf("Expected 2 users, got %d", len(users)) + } + + user1 := users[0].(map[string]any) + if user1["id"] != float64(1) { + t.Errorf("user1.id: expected 1, got %v", user1["id"]) + } + if user1["name"] != "Alice" { + t.Errorf("user1.name: expected Alice, got %v", user1["name"]) + } + + user2 := users[1].(map[string]any) + if user2["id"] != float64(2) { + t.Errorf("user2.id: expected 2, got %v", user2["id"]) + } + if user2["role"] != "admin" { + t.Errorf("user2.role: expected admin, got %v", user2["role"]) + } +} + +// TestFieldCountMatchesStrictMode tests that matching field count passes. +func TestFieldCountMatchesStrictMode(t *testing.T) { + zonData := `users:@(2):id,name,role +1,Alice,user +2,Bob,admin` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + if len(users) != 2 { + t.Errorf("Expected 2 users, got %d", len(users)) + } + + user1 := users[0].(map[string]any) + if user1["id"] != float64(1) { + t.Errorf("user1.id: expected 1, got %v", user1["id"]) + } + if user1["role"] != "user" { + t.Errorf("user1.role: expected user, got %v", user1["role"]) + } +} + +// TestSparseFieldsInStrictMode tests sparse fields in strict mode. +func TestSparseFieldsInStrictMode(t *testing.T) { + zonData := `users:@(2):id,name +1,Alice,role:admin,score:98 +2,Bob` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + + user1 := users[0].(map[string]any) + if user1["id"] != float64(1) { + t.Errorf("user1.id: expected 1, got %v", user1["id"]) + } + if user1["name"] != "Alice" { + t.Errorf("user1.name: expected Alice, got %v", user1["name"]) + } + if user1["role"] != "admin" { + t.Errorf("user1.role: expected admin, got %v", user1["role"]) + } + if user1["score"] != float64(98) { + t.Errorf("user1.score: expected 98, got %v", user1["score"]) + } + + user2 := users[1].(map[string]any) + if user2["id"] != float64(2) { + t.Errorf("user2.id: expected 2, got %v", user2["id"]) + } + if user2["name"] != "Bob" { + t.Errorf("user2.name: expected Bob, got %v", user2["name"]) + } +} + +// TestErrorCodeInErrorObject tests error code in error object. +func TestErrorCodeInErrorObject(t *testing.T) { + zonData := `users:@(2):id,name +1,Alice` + + _, err := zon.Decode(zonData) + if err == nil { + t.Fatal("Expected error") + } + + decodeErr, ok := err.(*zon.DecodeError) + if !ok { + t.Errorf("Expected DecodeError, got %T", err) + } + if decodeErr.Code != zon.ErrRowCountMismatch { + t.Errorf("Expected E001, got %s", decodeErr.Code) + } +} + +// TestContextInErrorMessage tests context in error message. +func TestContextInErrorMessage(t *testing.T) { + zonData := `users:@(2):id,name +1,Alice` + + _, err := zon.Decode(zonData) + if err == nil { + t.Fatal("Expected error") + } + + decodeErr, ok := err.(*zon.DecodeError) + if !ok { + t.Errorf("Expected DecodeError, got %T", err) + } + if decodeErr.Context == "" { + t.Error("Expected context to be set") + } + if !strings.Contains(err.Error(), "Table: users") { + t.Errorf("Expected 'Table: users' in error, got: %v", err) + } +} + +// TestStrictModeEnabledByDefault tests that strict mode is enabled by default. +func TestStrictModeEnabledByDefault(t *testing.T) { + zonData := `users:@(2):id,name +1,Alice` + + // Should throw because default is strict: true + _, err := zon.Decode(zonData) + if err == nil { + t.Error("Expected error in default strict mode") + } +} + +// TestExplicitStrictMode tests explicit strict mode. +func TestExplicitStrictMode(t *testing.T) { + zonData := `users:@(2):id,name +1,Alice` + + _, err := zon.DecodeWithOptions(zonData, &zon.DecodeOptions{Strict: true}) + if err == nil { + t.Error("Expected error in explicit strict mode") + } +} + +// TestMultipleTablesIndependentValidation tests multiple tables independent validation. +func TestMultipleTablesIndependentValidation(t *testing.T) { + zonData := `users:@(2):id,name +1,Alice +2,Bob +products:@(1):id,title +100,Widget` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + if len(users) != 2 { + t.Errorf("Expected 2 users, got %d", len(users)) + } + + products := resultMap["products"].([]any) + if len(products) != 1 { + t.Errorf("Expected 1 product, got %d", len(products)) + } +} + +// TestValidDataAcrossMultipleTables tests valid data across multiple tables. +func TestValidDataAcrossMultipleTables(t *testing.T) { + zonData := `users:@(2):id,name +1,Alice +2,Bob +products:@(2):id,title +100,Widget +200,Gadget` + + result, err := zon.Decode(zonData) + if err != nil { + t.Fatalf("Decode error: %v", err) + } + + resultMap := result.(map[string]any) + users := resultMap["users"].([]any) + if len(users) != 2 { + t.Errorf("Expected 2 users, got %d", len(users)) + } + + products := resultMap["products"].([]any) + if len(products) != 2 { + t.Errorf("Expected 2 products, got %d", len(products)) + } +}