|
| 1 | +# postgresql-cst-parser |
| 2 | + |
| 3 | +**Note: This parser is not an official PostgreSQL project but an independent, unofficial tool.** |
| 4 | + |
| 5 | +## Overview |
| 6 | + |
| 7 | +`postgresql-cst-parser` is a PostgreSQL-specific Concrete Syntax Tree (CST) parser written in pure Rust. This document describes the parser's features, development motivation, usage, and implementation details. |
| 8 | + |
| 9 | +## Key Features |
| 10 | + |
| 11 | +- **PostgreSQL 17 Support**: Supports the latest PostgreSQL 17 syntax. |
| 12 | +- **Structured CST Output**: The generated CST strictly follows the structure defined in PostgreSQL's [gram.y](https://github.com/postgres/postgres/blob/REL_17_0/src/backend/parser/gram.y) file. |
| 13 | +- **Utilizing `cstree`**: Uses the `cstree` crate for building syntax trees. |
| 14 | +- **PL/pgSQL**: Currently not supported. |
| 15 | + |
| 16 | +## Development Motivation |
| 17 | + |
| 18 | +This project was developed because we needed a library that can be used from Rust, supports all PostgreSQL syntax, and (being written in pure Rust) can be used with wasm-bindgen. |
| 19 | + |
| 20 | +## Usage |
| 21 | + |
| 22 | +You can use it as follows: |
| 23 | + |
| 24 | +```rust |
| 25 | +use postgresql_cst_parser::{parse, syntax_kind::SyntaxKind}; |
| 26 | + |
| 27 | +fn main() { |
| 28 | + // Parse SQL query and get the syntax tree |
| 29 | + let sql = "SELECT tbl.a as a, tbl.b from TBL tbl WHERE tbl.a > 0;"; |
| 30 | + let root = parse(sql).unwrap(); |
| 31 | + |
| 32 | + // Example 1: Extract all column references from the query |
| 33 | + let column_refs: Vec<String> = root |
| 34 | + .descendants() |
| 35 | + .filter(|node| node.kind() == SyntaxKind::columnref) |
| 36 | + .map(|node| node.text().to_string()) |
| 37 | + .collect(); |
| 38 | + |
| 39 | + println!("Column references: {:?}", column_refs); // ["tbl.a", "tbl.b", "tbl.a"] |
| 40 | + |
| 41 | + // Example 2: Find the WHERE condition |
| 42 | + if let Some(where_clause) = root |
| 43 | + .descendants() |
| 44 | + .find(|node| node.kind() == SyntaxKind::where_clause) |
| 45 | + { |
| 46 | + println!("WHERE condition: {}", where_clause.text()); |
| 47 | + } |
| 48 | + |
| 49 | + // Example 3: Get the selected table name |
| 50 | + if let Some(relation_expr) = root |
| 51 | + .descendants() |
| 52 | + .find(|node| node.kind() == SyntaxKind::relation_expr) |
| 53 | + { |
| 54 | + if let Some(name_node) = relation_expr |
| 55 | + .descendants() |
| 56 | + .find(|node| node.kind() == SyntaxKind::ColId) |
| 57 | + { |
| 58 | + println!("Table name: {}", name_node.text()); |
| 59 | + } |
| 60 | + } |
| 61 | + |
| 62 | + // Example 4: Parse complex SQL and extract specific nodes |
| 63 | + let complex_sql = "WITH data AS (SELECT id, value FROM source WHERE value > 10) |
| 64 | + SELECT d.id, d.value, COUNT(*) OVER (PARTITION BY d.id) |
| 65 | + FROM data d JOIN other o ON d.id = o.id |
| 66 | + ORDER BY d.value DESC LIMIT 10;"; |
| 67 | + |
| 68 | + let complex_root = parse(complex_sql).unwrap(); |
| 69 | + |
| 70 | + // Extract CTEs (Common Table Expressions) |
| 71 | + let ctes: Vec<_> = complex_root |
| 72 | + .descendants() |
| 73 | + .filter(|node| node.kind() == SyntaxKind::common_table_expr) |
| 74 | + .collect(); |
| 75 | + |
| 76 | + // Extract window functions |
| 77 | + let window_funcs: Vec<_> = complex_root |
| 78 | + .descendants() |
| 79 | + .filter(|node| node.kind() == SyntaxKind::over_clause) |
| 80 | + .collect(); |
| 81 | + |
| 82 | + println!("Number of CTEs: {}", ctes.len()); |
| 83 | + println!("Number of window functions: {}", window_funcs.len()); |
| 84 | +} |
| 85 | +``` |
| 86 | + |
| 87 | +Example of the generated syntax tree: |
| 88 | + |
| 89 | +```sql |
| 90 | +SELECT tbl.a as a from TBL tbl; |
| 91 | +``` |
| 92 | + |
| 93 | +``` |
| 94 | + |
| 95 | + |
| 96 | + |
| 97 | + |
| 98 | + |
| 99 | + |
| 100 | + |
| 101 | + |
| 102 | + |
| 103 | + |
| 104 | + |
| 105 | + |
| 106 | + |
| 107 | + |
| 108 | + |
| 109 | + |
| 110 | + |
| 111 | + |
| 112 | + |
| 113 | + |
| 114 | + |
| 115 | + |
| 116 | + |
| 117 | + |
| 118 | + |
| 119 | + |
| 120 | + |
| 121 | + |
| 122 | + |
| 123 | + |
| 124 | + |
| 125 | + |
| 126 | + |
| 127 | + |
| 128 | + |
| 129 | + |
| 130 | + |
| 131 | + |
| 132 | + |
| 133 | + |
| 134 | + |
| 135 | + |
| 136 | + |
| 137 | + |
| 138 | + |
| 139 | + |
| 140 | +``` |
| 141 | + |
| 142 | +If you'd like to try this parser directly, you can do so online [here](https://tanzaku.github.io/postgresql-cst-parser/). |
| 143 | + |
| 144 | +## Implementation |
| 145 | + |
| 146 | +This implementation uses PostgreSQL's [scan.l](https://github.com/postgres/postgres/blob/REL_17_0/src/backend/parser/scan.l) and [gram.y](https://github.com/postgres/postgres/blob/REL_17_0/src/backend/parser/gram.y) with patches from [libpg_query](https://github.com/pganalyze/libpg_query/tree/17-6.0.0/patches) applied. `scan.l` has been further rewritten for Rust, and based on `scan.l` and `gram.y`, a syntax parsing table has been created to build the parser. |
| 147 | + |
| 148 | +## License |
| 149 | + |
| 150 | +- `kwlist.h`, `parser.c`, `scan.l`, `gram.y` are under the PostgreSQL License. |
| 151 | +- `lexer_ported.rs` and `generated.rs` contain code ported from PostgreSQL, so the ported parts are under the PostgreSQL License. |
| 152 | +- This project applies patches from [libpg_query](https://github.com/pganalyze/libpg_query) to `scan.l` and `gram.y`, but the patches themselves are not included in this repository. |
| 153 | +- Other files are published under the MIT License. |
0 commit comments