MethodAtlas/.github/workflows/methodatlas-analysis.yml at main · Accenture/MethodAtlas · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
# MethodAtlas AI self-analysis — reusable workflow
#
# Classifies this project's own JUnit test methods for security relevance using
# MethodAtlas and a free AI provider (GitHub Models).  The resulting SARIF is
# uploaded to GitHub Code Scanning so security-relevant test methods appear in
# the Security tab and as inline annotations on pull request diffs.
#
# This workflow is called from pages.yml on every push to main.  Failures are
# treated as informational by the caller and do not block the Pages deployment.
#
# ═══════════════════════════════════════════════════════════════════════════════
# ADAPTING THIS WORKFLOW FOR YOUR OWN PROJECT
# ───────────────────────────────────────────
# This file is designed to be copied and adapted.  There are two sections to
# change:
#
#  1. "Obtain MethodAtlas" — swap the build-from-source steps for a download.
#     See the comment block inside that section for a ready-made snippet.
#
#  2. "Run MethodAtlas" — change the source path to your test directory and
#     optionally switch the AI model or provider.
#
# Everything else (GITHUB_TOKEN auth, SARIF upload, artifact retention) works
# unchanged in any GitHub repository.
# ═══════════════════════════════════════════════════════════════════════════════
name: MethodAtlas self-analysis

# Trigger: this workflow declares no event triggers of its own; it runs only
# when another workflow invokes it through workflow_call.
on:
  workflow_call:
    inputs:
      # Forwarded to MethodAtlas as the final positional argument of the
      # command in the "Run MethodAtlas AI classification" step.
      test-source-path:
        description: 'Path to the directory containing JUnit test sources'
        required: false
        type: string
        default: 'src/test/java'
      # Forwarded to MethodAtlas as the value of the -ai-model flag.
      ai-model:
        description: 'GitHub Models model identifier used for classification'
        required: false
        type: string
        default: 'gpt-4o-mini'
      # Exposed to the run step as LOCAL_OVERRIDE_FILE; it is passed on as
      # -override-file only if that file exists on disk.
      override-file:
        description: >
          Path (relative to this repository root) of a local classification
          override YAML file. When supplied and the file exists on disk,
          -override-file is passed to MethodAtlas so that human-reviewed
          corrections are applied on top of the AI classification.

          ⚠ SIMPLIFIED APPROACH — suitable only for small teams where
          developers and the security reviewer share the same repository. In
          organisations with a dedicated security or risk team, use
          security-overrides-repo instead so that override decisions are owned
          exclusively by that team and cannot be changed by developers via a PR
          to their own repository. See docs/ai/remote-overrides.md for a full
          comparison of strategies.

          Ignored when security-overrides-repo is also set (the remote security
          repo always takes precedence). Leave empty (the default) to run
          without a local override file.
        required: false
        type: string
        default: ''
      # A non-empty value enables the "Fetch security team override file"
      # checkout step (its `if:` condition tests this input).
      security-overrides-repo:
        description: >
          GitHub repository (owner/name) that holds the canonical override file
          maintained by the security or risk team, for example
          acme-corp/security-overrides. When set, this workflow checks out that
          repository using SECURITY_OVERRIDES_TOKEN and reads the file at the
          path given by security-overrides-path.

          This input takes precedence over override-file when both are supplied.
          The caller must pass a SECURITY_OVERRIDES_TOKEN secret — a
          fine-grained PAT or GitHub App installation token with contents:read
          on the security repository. Store it as an organisation-level secret
          so that individual development teams do not need to manage it.

          Leave empty (the default) to skip the remote-repo checkout step.
        required: false
        type: string
        default: ''
      # Used both as the sparse-checkout pattern and to locate the file under
      # .security-overrides/ in the run step.
      security-overrides-path:
        description: >
          Path within security-overrides-repo to the override YAML file,
          relative to the root of that repository. Only used when
          security-overrides-repo is non-empty. Defaults to
          methodatlas-overrides.yaml.
        required: false
        type: string
        default: 'methodatlas-overrides.yaml'
      # Passed as `ref` to the checkout of security-overrides-repo.
      security-overrides-ref:
        description: >
          Git ref (branch, tag, or full commit SHA) to check out from
          security-overrides-repo. Pin to a release tag for reproducible runs
          and to prevent an unreviewed commit from silently altering
          classifications. Leave empty to use the default branch.
        required: false
        type: string
        default: ''
    secrets:
      # Used as the checkout token for security-overrides-repo.
      SECURITY_OVERRIDES_TOKEN:
        description: >
          Fine-grained PAT or GitHub App installation token with contents:read
          on security-overrides-repo. Required when security-overrides-repo is
          set; ignored (and not required) otherwise.
        required: false

# Permissions required by this workflow.
# The calling workflow must grant at least these three permissions.
permissions:
  contents: read
  security-events: write   # required to upload SARIF to GitHub Code Scanning
  models: read             # required to call GitHub Models inference API

jobs:
  analyze:
    name: MethodAtlas security classification
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        name: Checkout repository
        with:
          # com.palantir.git-version needs the complete history and all tags,
          # so fetch everything rather than a truncated history.
          fetch-depth: 0

      # -----------------------------------------------------------------------
      # Obtain MethodAtlas
      #
      # PROJECT MAINTAINERS: MethodAtlas is built from the current commit so
      # the analysis always reflects the version under review.  Gradle
      # dependency caching keeps this fast on repeat runs.
      #
      # ───────────────────────────────────────────────────────────────────────
      # ADAPTING FOR YOUR OWN PROJECT
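      # Replace the build-from-source steps below with a single download step
      # (a Java runtime is presumably still needed to run the downloaded
      # launcher — verify before removing the JDK step):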
      #
      #   - name: Download MethodAtlas
      #     run: |
      #       # Resolve the latest release tag and strip the "release@" prefix
      #       VERSION=$(curl -fsSL https://api.github.com/repos/Accenture/MethodAtlas/releases/latest \
      #                 | jq -r '.tag_name | ltrimstr("release@")')
      #       curl -fsSL \
      #         "https://github.com/Accenture/MethodAtlas/releases/latest/download/methodatlas-${VERSION}.zip" \
      #         -o methodatlas.zip
      #       unzip -q methodatlas.zip
      #       echo "METHODATLAS=$(pwd)/methodatlas-${VERSION}/bin/methodatlas" >> "$GITHUB_ENV"
      # ───────────────────────────────────────────────────────────────────────
      # Installs the Temurin build of JDK 21 for the steps that follow.
      - uses: actions/setup-java@v4
        name: Set up JDK 21 (Temurin)
        with:
          distribution: "temurin"
          java-version: "21"

      # Prepares Gradle for the ./gradlew invocation in the build step.
      - uses: gradle/actions/setup-gradle@v4
        name: Setup Gradle

      # Node.js is needed by the TypeScript discovery plugin; the current LTS
      # line is requested rather than a pinned version.
      - uses: actions/setup-node@v4
        name: Set up Node.js (required by TypeScript discovery plugin)
        with:
          node-version: "lts/*"

      - name: Build MethodAtlas from source
        run: |
          # Build the runnable distribution under build/install/.
          ./gradlew installDist

          # Publish the absolute launcher path to later steps as $METHODATLAS.
          launcher="$PWD/build/install/methodatlas/bin/methodatlas"
          printf 'METHODATLAS=%s\n' "$launcher" >> "$GITHUB_ENV"

      # -----------------------------------------------------------------------
      # Fetch security team override file (optional — enterprise path)
      #
      # Runs only when security-overrides-repo is supplied.  Performs a
      # shallow, sparse checkout of the security team's repository into
      # .security-overrides/ using the SECURITY_OVERRIDES_TOKEN secret (a
      # fine-grained PAT or GitHub App token with contents:read on that repo).
      #
      # sparse-checkout in non-cone mode treats the pattern as a path glob,
      # so only the single override file is transferred — not the entire repo.
      #
      # When this step is skipped, the Run MethodAtlas step falls back to the
      # local override-file input (if set) or runs without any override file.
      # -----------------------------------------------------------------------
      # Checks out only the override file from the security team's repository
      # into .security-overrides/. Skipped entirely when the
      # security-overrides-repo input is empty.
      - name: Fetch security team override file
        if: ${{ inputs.security-overrides-repo != '' }}
        uses: actions/checkout@v4
        with:
          repository: ${{ inputs.security-overrides-repo }}
          path: .security-overrides
          token: ${{ secrets.SECURITY_OVERRIDES_TOKEN }}
          # Do not leave the security-repo token in
          # .security-overrides/.git/config: later steps only read the checked
          # out file and never need to talk to that repository again, so there
          # is no reason for them to be able to read the credential.
          persist-credentials: false
          # Non-cone mode treats the value as a path pattern, so only the
          # override file is materialised in the working tree.
          sparse-checkout: ${{ inputs.security-overrides-path }}
          sparse-checkout-cone-mode: false
          # An empty ref makes actions/checkout use the default branch.
          ref: ${{ inputs.security-overrides-ref }}
          fetch-depth: 1

      # -----------------------------------------------------------------------
      # Restore AI classification cache from a previous run
      #
      # MethodAtlas computes a SHA-256 content fingerprint (content_hash) for
      # each test class and reuses stored AI classifications for any class whose
      # hash matches an entry in the cache file.  Only classes that have changed
      # since the last run incur an AI provider call.
      #
      # Cache key strategy:
      #   Primary   — branch + commit SHA (unique per run; ensures the newest
      #                cache entry for the branch is always saved)
      #   Fallback  — branch prefix only  (picks the most recent run on the
      #                same branch when the exact commit key is absent)
      #   Fallback  — bare prefix          (cross-branch fallback; allows PRs to
      #                warm-start from a recent main-branch cache)
      #
      # On the very first run (cold cache) no file is restored; every class is
      # sent to the AI provider.  All subsequent runs on the same branch benefit
      # from incremental classification.
      # -----------------------------------------------------------------------
      - id: restore-ai-cache
        name: Restore AI classification cache
        uses: actions/cache/restore@v4
        with:
          # Exact key: one entry per branch + commit.
          key: methodatlas-ai-cache-${{ github.ref_name }}-${{ github.sha }}
          # Prefix fallbacks, tried in order: same branch first, then any
          # branch.
          restore-keys: |
            methodatlas-ai-cache-${{ github.ref_name }}-
            methodatlas-ai-cache-
          # The cache is stored gzip-compressed; the run step decompresses it.
          path: build/methodatlas-cache.json.gz

      # -----------------------------------------------------------------------
      # Run MethodAtlas — classification + credential detection (single pass)
      #
      # Uses GitHub Models (provider: github_models) authenticated with
      # GITHUB_TOKEN — automatically available in every GitHub Actions run,
      # no additional secrets or billing setup required.
      #
      # A SINGLE combined pass classifies test methods AND triages credentials in
      # one prompt per class. The unified AI cache (-ai-cache / -ai-cache-out)
      # carries BOTH the classifications and the credential verdicts, keyed by the
      # class content hash and tagged with the prompt-catalogue signature, so an
      # unchanged class is served from cache with zero AI calls and the SARIF still
      # reports its credentials. -ai-cache-out persists the cache regardless of the
      # output format, so the old CSV-then-SARIF two-pass workaround is gone.
      #
      # Flags:
      #   -ai / -ai-provider / -ai-model / -ai-api-key-env  AI provider selection
      #   -ai-taxonomy-mode optimized / -ai-confidence       classification tuning
      #   -content-hash   per-class fingerprint used as the cache key
      #   -drift-detect   flag @Tag("security") / AI disagreements in the SARIF
      #   -detect-secrets deterministic credential detection + folded AI triage
      #   -secrets-out    credential CSV (values masked by default)
      #   -ai-cache / -ai-cache-out   read and write the unified cache
      #   -override-file  (optional) apply human-reviewed corrections
      #   -sarif          one SARIF with classifications AND credential findings
      #
      # ───────────────────────────────────────────────────────────────────────
      # ADAPTING FOR YOUR OWN PROJECT
      # Change the last argument to the path of your test source directory.
      # To use a different model, set the ai-model input when calling this
      # workflow, or edit the default value in the on.workflow_call.inputs
      # section at the top of this file.
      # To use a different AI provider (Groq, Mistral, OpenAI, …), replace
      # -ai-provider and -ai-api-key-env — see docs/ai/providers.md for the
      # full list of supported providers and their authentication requirements.
      # To supply an override file, set the override-file input to the path of
      # your YAML override file relative to the repository root, for example:
      #   override-file: .methodatlas-overrides.yaml
      # ───────────────────────────────────────────────────────────────────────
      # Runs the single combined classification + credential-detection pass.
      #
      # Every workflow input reaches the script through an environment
      # variable (see `env:` at the end of this step) instead of being
      # expanded into the script text, so a crafted input value is always
      # treated as data and can never inject shell syntax.
      #
      # Outputs:
      #   build/reports/methodatlas-sarif.json        SARIF (stdout redirect)
      #   build/reports/methodatlas-credentials.csv   credential CSV
      #   build/methodatlas-cache.json.gz             updated unified AI cache
      - name: Run MethodAtlas AI classification
        run: |
          mkdir -p build/reports

          # Resolve the -override-file argument.
          #
          # 1. ENTERPRISE PATH — security-overrides-repo is set and the
          #    previous step checked the file out into .security-overrides/.
          #    Override decisions are owned exclusively by the security team,
          #    so the local override-file input is NEVER consulted on this
          #    path — not even when the remote file turns out to be missing.
          #    A missing file produces a warning annotation and the run
          #    continues without overrides.
          #
          # 2. SIMPLIFIED PATH — security-overrides-repo is empty and a local
          #    override-file was supplied and exists on disk.
          #    ⚠ Suitable for small teams only; in a larger organisation the
          #    security team should be the sole authority (use path 1 above).
          #    See docs/ai/remote-overrides.md for the full strategy comparison.
          #
          # If neither applies the workflow continues without any override
          # file — this is the safe default and produces no error.
          OVERRIDE_ARGS=()
          if [ -n "$SECURITY_REPO" ]; then
            SECURITY_OVERRIDE_FILE=".security-overrides/$SECURITY_PATH"
            if [ -f "$SECURITY_OVERRIDE_FILE" ]; then
              OVERRIDE_ARGS=("-override-file" "$SECURITY_OVERRIDE_FILE")
            else
              echo "::warning::security-overrides-repo is set but '$SECURITY_PATH' was not found in the checkout; running without an override file (the local override-file input is ignored)."
            fi
          elif [ -n "$LOCAL_OVERRIDE_FILE" ] && [ -f "$LOCAL_OVERRIDE_FILE" ]; then
            OVERRIDE_ARGS=("-override-file" "$LOCAL_OVERRIDE_FILE")
          fi

          # Decompress the restored unified cache (gzip keeps the Actions cache
          # small). If none was restored the .gz file is absent and the run starts
          # cold — every class calls the provider.
          if [ -f build/methodatlas-cache.json.gz ]; then
            gunzip -k build/methodatlas-cache.json.gz
          fi

          # Report cache status so the CI log explains the expected AI traffic.
          # The reported figure is simply the line count of the cache file.
          CACHE_ARGS=()
          if [ -f build/methodatlas-cache.json ]; then
            CACHED=$(wc -l < build/methodatlas-cache.json | tr -d ' ')
            echo "Unified AI cache restored ($CACHED cached classes)."
            echo "Only classes whose source changed will be sent to the AI provider."
            CACHE_ARGS=("-ai-cache" "build/methodatlas-cache.json")
          else
            echo "No AI cache found. All test classes will be sent to the AI provider."
          fi

          # ── Single combined pass ─────────────────────────────────────────
          # Classifies methods AND triages credentials in one prompt per class.
          # Changed classes call the provider; unchanged classes are served from
          # the restored unified cache (classifications AND credential verdicts).
          # The SARIF document is written to stdout and captured by the redirect.
          echo "Running classification + credential detection …"
          "$METHODATLAS" \
            -ai \
            -ai-provider github_models \
            -ai-model "$AI_MODEL" \
            -ai-api-key-env GITHUB_TOKEN \
            -ai-taxonomy-mode optimized \
            -ai-max-class-chars 20000 \
            -ai-max-retries 3 \
            -ai-confidence \
            -content-hash \
            -drift-detect \
            -detect-secrets \
            -secrets-out build/reports/methodatlas-credentials.csv \
            "${CACHE_ARGS[@]}" \
            -ai-cache-out build/methodatlas-cache.json \
            "${OVERRIDE_ARGS[@]}" \
            -sarif \
            "$TEST_SOURCE_PATH" \
            > build/reports/methodatlas-sarif.json

          # Compress the updated unified cache for the save step.
          gzip -c build/methodatlas-cache.json > build/methodatlas-cache.json.gz
          echo "Run complete: SARIF + credential CSV written; unified cache updated."
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          AI_MODEL: ${{ inputs.ai-model }}
          TEST_SOURCE_PATH: ${{ inputs.test-source-path }}
          LOCAL_OVERRIDE_FILE: ${{ inputs.override-file }}
          SECURITY_REPO: ${{ inputs.security-overrides-repo }}
          SECURITY_PATH: ${{ inputs.security-overrides-path }}

      # -----------------------------------------------------------------------
      # Save the unified AI cache for the next run
      #
      # Stored under a key that includes the commit SHA; the restore-keys prefix
      # lets the next run on the same branch fall back to it. Saved only on
      # success (never a partial cache) and compressed to minimise storage.
      # -----------------------------------------------------------------------
      - name: Save AI cache
        # Only a fully successful run may publish its cache.
        if: ${{ success() }}
        uses: actions/cache/save@v4
        with:
          # Same key format as the exact key of the restore step.
          key: methodatlas-ai-cache-${{ github.ref_name }}-${{ github.sha }}
          path: build/methodatlas-cache.json.gz

      # -----------------------------------------------------------------------
      # Publish results
      #
      # SARIF is uploaded to GitHub Code Scanning so findings appear in
      # Security → Code scanning and as inline annotations on PR diffs.
      # The same file is also retained as a downloadable workflow artifact for
      # 30 days so it can be inspected outside of the Code Scanning UI.
      #
      # ADAPTING FOR YOUR OWN PROJECT: both steps work unchanged.  The only
      # requirement is that your calling workflow has security-events: write
      # permission (needed for the upload-sarif action).
      # -----------------------------------------------------------------------
      # Sends the SARIF produced by the run step to Code Scanning under the
      # "methodatlas" category.
      - uses: github/codeql-action/upload-sarif@v3
        name: Upload SARIF to GitHub Code Scanning
        with:
          category: methodatlas
          sarif_file: build/reports/methodatlas-sarif.json

      # Keeps the SARIF file as a downloadable artifact; attempted even when
      # an earlier step failed.
      - name: Upload SARIF artifact
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          retention-days: 30
          path: build/reports/methodatlas-sarif.json
          name: methodatlas-sarif

      # -----------------------------------------------------------------------
      # Publish the credential CSV
      #
      # Credential findings are already embedded in the SARIF uploaded above
      # (under secret/<rule-id> rules); the masked CSV is retained as a separate
      # downloadable artifact for triage.
      # -----------------------------------------------------------------------
      # Keeps the credential CSV as a downloadable artifact; attempted even
      # when an earlier step failed.
      - name: Upload credential-detection CSV artifact
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          retention-days: 30
          path: build/reports/methodatlas-credentials.csv
          name: methodatlas-credentials