Skip to content

Commit aaa32a8

Browse files
authored
feat: complete c parser (#13)
* fix: fix hardcode in export.go * feat: cxx/c parser skeleton with clangd-18 * feat: initial support for cxx collect 1. Since clangd does not support semanticTokens/range method, use semanticTokens/full + filtering to emulate. 2. Since the concept of package and module does not apply to C/C++, treat the whole repo as a single package/module. * fix: get json.Types and build graph * fix: fix comments * test: duplicate c functions * refactor: clang-format c tests * fix: pkgpath -> file. skip build. C allows symbols with the same name in a single module, provided either: * One is a weak symbol (decl) and one is a strong symbol (def) * They are both strong symbols, but never linked together. The first one works fine, but more changes are needed for the second one. testdata/cxxsimple illustrates the first scenario. Two instances of `myself` are present, one (weak) in `pair.h` and one (strong) in `pair.c`. The dependency is well defined in this scenario: 1. `pair.c:myself` depends on `pair.h:myself` 2. any other function using `myself` depends on both. To verify, run `./abcoder parse cxx testdata/cxxsimple > cxxsimple.json`. testdata/cxxduplicate is the second scenario. Two strong instances of `add` are present, each used in a different executable. clangd handles this with compile_commands.json. If clangd is invoked as below, the `main->add` dependency shall point to the `add` in `d1/add.c`. mkdir build && cd build && cmake .. bear -- make prog1 # generate compile_commands.json cd testdata/cduplicate && clangd-18 While clangd does the right job, the current implementation of scanning during collection does not take into account which files are included in a compilation (as specified in compile_commands.json). So `Collector.Collect` will incorrectly include `d2/add.c` even if it is not used, and mess up the dependencies. That is to say, even for the compilation `prog1 <- main.c, d1/add.c`, a dependency `main->d2/add.c:add` will be present.
1 parent 45c43af commit aaa32a8

File tree

17 files changed

+473
-12
lines changed

17 files changed

+473
-12
lines changed

lang/collect/collect.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"strings"
2323
"unicode"
2424

25+
"github.com/cloudwego/abcoder/lang/cxx"
2526
"github.com/cloudwego/abcoder/lang/log"
2627
. "github.com/cloudwego/abcoder/lang/lsp"
2728
"github.com/cloudwego/abcoder/lang/rust"
@@ -79,6 +80,8 @@ func switchSpec(l uniast.Language) LanguageSpec {
7980
switch l {
8081
case uniast.Rust:
8182
return &rust.RustSpec{}
83+
case uniast.Cxx:
84+
return &cxx.CxxSpec{}
8285
default:
8386
panic(fmt.Sprintf("unsupported language %s", l))
8487
}

lang/collect/export.go

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ func (c *Collector) fileLine(loc Location) uniast.FileLine {
4848
}
4949
}
5050

51-
func newModule(name string, dir string) *uniast.Module {
52-
ret := uniast.NewModule(name, dir, uniast.Rust)
51+
func newModule(name string, dir string, lang uniast.Language) *uniast.Module {
52+
ret := uniast.NewModule(name, dir, lang)
5353
return ret
5454
}
5555

@@ -67,7 +67,7 @@ func (c *Collector) Export(ctx context.Context) (*uniast.Repository, error) {
6767
if err != nil {
6868
return nil, err
6969
}
70-
repo.Modules[name] = newModule(name, rel)
70+
repo.Modules[name] = newModule(name, rel, c.Language)
7171
}
7272

7373
// not allow local symbols inside another symbol
@@ -83,11 +83,13 @@ func (c *Collector) Export(ctx context.Context) (*uniast.Repository, error) {
8383
}
8484

8585
// patch module
86-
for p, m := range repo.Modules {
87-
if p == "" || strings.Contains(p, "@") {
88-
continue
86+
if c.modPatcher != nil {
87+
for p, m := range repo.Modules {
88+
if p == "" || strings.Contains(p, "@") {
89+
continue
90+
}
91+
c.modPatcher.Patch(m)
8992
}
90-
c.modPatcher.Patch(m)
9193
}
9294

9395
return &repo, nil
@@ -140,7 +142,7 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol
140142
}
141143

142144
if repo.Modules[mod] == nil {
143-
repo.Modules[mod] = newModule(mod, "")
145+
repo.Modules[mod] = newModule(mod, "", c.Language)
144146
}
145147
module := repo.Modules[mod]
146148
if repo.Modules[mod].Packages[path] == nil {
@@ -284,7 +286,7 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol
284286
obj.GlobalVars = make([]uniast.Dependency, 0, len(deps))
285287
}
286288
obj.GlobalVars = uniast.InsertDependency(obj.GlobalVars, pdep)
287-
case lsp.SKStruct, lsp.SKTypeParameter, lsp.SKInterface, lsp.SKEnum:
289+
case lsp.SKStruct, lsp.SKTypeParameter, lsp.SKInterface, lsp.SKEnum, lsp.SKClass:
288290
if obj.Types == nil {
289291
obj.Types = make([]uniast.Dependency, 0, len(deps))
290292
}
@@ -298,7 +300,7 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol
298300
pkg.Functions[id.Name] = obj
299301

300302
// Type
301-
case lsp.SKStruct, lsp.SKTypeParameter, lsp.SKInterface, lsp.SKEnum:
303+
case lsp.SKStruct, lsp.SKTypeParameter, lsp.SKInterface, lsp.SKEnum, lsp.SKClass:
302304
obj := &uniast.Type{
303305
FileLine: fileLine,
304306
Content: content,
@@ -315,7 +317,7 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol
315317
continue
316318
}
317319
switch dep.Symbol.Kind {
318-
case lsp.SKStruct, lsp.SKTypeParameter, lsp.SKInterface, lsp.SKEnum:
320+
case lsp.SKStruct, lsp.SKTypeParameter, lsp.SKInterface, lsp.SKEnum, lsp.SKClass:
319321
obj.SubStruct = append(obj.SubStruct, uniast.NewDependency(*depid, c.fileLine(dep.Location)))
320322
default:
321323
log.Error("dep symbol %s not collected for \n", dep.Symbol, id)
@@ -368,6 +370,9 @@ func mapKind(kind lsp.SymbolKind) uniast.TypeKind {
368370
switch kind {
369371
case lsp.SKStruct:
370372
return "struct"
373+
// XXX: C++ should use class instead of struct
374+
case lsp.SKClass:
375+
return "struct"
371376
case lsp.SKTypeParameter:
372377
return "type-parameter"
373378
case lsp.SKInterface:

lang/cxx/lib.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright 2025 CloudWeGo Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// https://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package cxx
16+
17+
import (
18+
"time"
19+
20+
"github.com/cloudwego/abcoder/lang/uniast"
21+
"github.com/cloudwego/abcoder/lang/utils"
22+
)
23+
24+
// MaxWaitDuration is the upper bound CheckRepo applies to the wait duration it returns.
const MaxWaitDuration = 5 * time.Minute
25+
26+
func GetDefaultLSP() (lang uniast.Language, name string) {
27+
return uniast.Cxx, "clangd-18"
28+
}
29+
30+
func CheckRepo(repo string) (string, time.Duration) {
31+
openfile := ""
32+
// TODO: check if the project compiles.
33+
34+
// NOTICE: wait for Rust projects based on code files
35+
_, size := utils.CountFiles(repo, ".c", "build/")
36+
wait := 2*time.Second + time.Second*time.Duration(size/1024)
37+
if wait > MaxWaitDuration {
38+
wait = MaxWaitDuration
39+
}
40+
return openfile, wait
41+
}

lang/cxx/spec.go

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
// Copyright 2025 CloudWeGo Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// https://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package cxx
16+
17+
import (
18+
"fmt"
19+
"path/filepath"
20+
"strings"
21+
22+
lsp "github.com/cloudwego/abcoder/lang/lsp"
23+
"github.com/cloudwego/abcoder/lang/utils"
24+
)
25+
26+
// CxxSpec provides the C-specific answers (workspace layout, namespaces,
// file filtering, token classification) needed when collecting symbols.
type CxxSpec struct {
	// repo is the absolute repository root recorded by WorkSpace; NameSpace
	// uses it to tell repository files apart from external headers.
	repo string
}

// NewCxxSpec returns an empty CxxSpec. WorkSpace must be called before
// NameSpace so that the repository root is known.
func NewCxxSpec() *CxxSpec {
	return &CxxSpec{}
}

// WorkSpace records root as the repository root and returns the module map.
// The whole repository is treated as one module named "current", mapped to
// the absolute form of root.
//
// The absolute path (not the raw argument) is stored, so that NameSpace can
// match absolute file paths even when root was given as a relative path.
//
// XXX: maybe multi module support for C++?
func (c *CxxSpec) WorkSpace(root string) (map[string]string, error) {
	absPath, err := filepath.Abs(root)
	if err != nil {
		return nil, fmt.Errorf("failed to get absolute path: %w", err)
	}
	c.repo = absPath
	return map[string]string{"current": absPath}, nil
}

// NameSpace maps a file path to its module name and package path.
//
// returns: modname, pkgpath, error
//
// Multiple symbols with the same name could occur (for example in the Linux kernel).
// The identity is mod::pkg::name. So we use the pkg (the file name, relative
// to the repository root) to distinguish them.
//
// Files under the repository root belong to module "current". Anything else
// is an external lib: only the standard library (system headers under /usr)
// is supported and is reported as "cstdlib"; every other external path yields
// an error.
func (c *CxxSpec) NameSpace(path string) (string, string, error) {
	relpath, err := filepath.Rel(c.repo, path)
	// A relative path that climbs out through ".." is not inside the repo,
	// even if it shares a textual prefix with it (e.g. /repo vs /repo2).
	outside := relpath == ".." || strings.HasPrefix(relpath, ".."+string(filepath.Separator))
	if err == nil && !outside {
		return "current", relpath, nil
	}

	if strings.HasPrefix(path, "/usr") {
		// assume it is c system library
		return "cstdlib", "cstdlib", nil
	}
	return "", "", fmt.Errorf("external lib: %s", path)
}

// ShouldSkip reports whether path should be ignored: only ".c" and ".h"
// files are kept.
func (c *CxxSpec) ShouldSkip(path string) bool {
	isSource := strings.HasSuffix(path, ".c")
	isHeader := strings.HasSuffix(path, ".h")
	return !isSource && !isHeader
}
69+
70+
func (c *CxxSpec) IsDocToken(tok lsp.Token) bool {
71+
return tok.Type == "comment"
72+
}
73+
74+
func (c *CxxSpec) DeclareTokenOfSymbol(sym lsp.DocumentSymbol) int {
75+
for i, t := range sym.Tokens {
76+
if c.IsDocToken(t) {
77+
continue
78+
}
79+
for _, m := range t.Modifiers {
80+
if m == "declaration" {
81+
return i
82+
}
83+
}
84+
}
85+
return -1
86+
}
87+
88+
func (c *CxxSpec) IsEntityToken(tok lsp.Token) bool {
89+
return tok.Type == "class" || tok.Type == "function" || tok.Type == "variable"
90+
}
91+
92+
func (c *CxxSpec) IsStdToken(tok lsp.Token) bool {
93+
panic("TODO")
94+
}
95+
96+
func (c *CxxSpec) TokenKind(tok lsp.Token) lsp.SymbolKind {
97+
switch tok.Type {
98+
case "class":
99+
return lsp.SKStruct
100+
case "enum":
101+
return lsp.SKEnum
102+
case "enumMember":
103+
return lsp.SKEnumMember
104+
case "function", "macro":
105+
return lsp.SKFunction
106+
// rust spec does not treat parameter as a variable
107+
case "parameter":
108+
return lsp.SKVariable
109+
case "typeParameter":
110+
return lsp.SKTypeParameter
111+
// type: TODO
112+
case "interface", "concept", "method", "modifier", "namespace", "type":
113+
panic(fmt.Sprintf("Unsupported token type: %s at %+v\n", tok.Type, tok.Location))
114+
case "bracket", "comment", "label", "operator", "property", "unknown":
115+
return lsp.SKUnknown
116+
}
117+
panic(fmt.Sprintf("Weird token type: %s at %+v\n", tok.Type, tok.Location))
118+
}
119+
120+
func (c *CxxSpec) IsMainFunction(sym lsp.DocumentSymbol) bool {
121+
return sym.Kind == lsp.SKFunction && sym.Name == "main"
122+
}
123+
124+
func (c *CxxSpec) IsEntitySymbol(sym lsp.DocumentSymbol) bool {
125+
typ := sym.Kind
126+
return typ == lsp.SKFunction || typ == lsp.SKVariable || typ == lsp.SKClass
127+
128+
}
129+
130+
func (c *CxxSpec) IsPublicSymbol(sym lsp.DocumentSymbol) bool {
131+
id := c.DeclareTokenOfSymbol(sym)
132+
if id == -1 {
133+
return false
134+
}
135+
for _, m := range sym.Tokens[id].Modifiers {
136+
if m == "globalScope" {
137+
return true
138+
}
139+
}
140+
return false
141+
}
142+
143+
// TODO(cpp): support C++ OOP
144+
func (c *CxxSpec) HasImplSymbol() bool {
145+
return false
146+
}
147+
148+
func (c *CxxSpec) ImplSymbol(sym lsp.DocumentSymbol) (int, int, int) {
149+
panic("TODO")
150+
}
151+
152+
func (c *CxxSpec) FunctionSymbol(sym lsp.DocumentSymbol) (int, []int, []int, []int) {
153+
// No receiver and no type params for C
154+
if sym.Kind != lsp.SKFunction {
155+
return -1, nil, nil, nil
156+
}
157+
receiver := -1
158+
typeParams := []int{}
159+
inputParams := []int{}
160+
outputs := []int{}
161+
162+
// general format: RETURNVALUE NAME "(" PARAMS ")" BODY
163+
// --------
164+
// fnNameText
165+
// state machine phase 0 phase 1 phase 2: break
166+
// TODO: attributes may contain parens. also inline structs.
167+
168+
endRelOffset := 0
169+
lines := utils.CountLinesCached(sym.Text)
170+
phase := 0
171+
for i, tok := range sym.Tokens {
172+
switch phase {
173+
case 0:
174+
if tok.Type == "function" {
175+
offset := lsp.RelativePostionWithLines(*lines, sym.Location.Range.Start, tok.Location.Range.Start)
176+
endRelOffset = offset + strings.Index(sym.Text[offset:], ")")
177+
phase = 1
178+
continue
179+
}
180+
if c.IsEntityToken(tok) {
181+
outputs = append(outputs, i)
182+
}
183+
case 1:
184+
offset := lsp.RelativePostionWithLines(*lines, sym.Location.Range.Start, tok.Location.Range.Start)
185+
if offset > endRelOffset {
186+
phase = 2
187+
continue
188+
}
189+
if c.IsEntityToken(tok) {
190+
inputParams = append(inputParams, i)
191+
}
192+
}
193+
}
194+
return receiver, typeParams, inputParams, outputs
195+
}
196+
197+
func (c *CxxSpec) GetUnloadedSymbol(from lsp.Token, define lsp.Location) (string, error) {
198+
panic("TODO")
199+
}

0 commit comments

Comments
 (0)