# --argjson: a large source tree yields thousands of matches, and a multi-hundred-KB
# --argjson string overflows Linux's per-argument limit (MAX_ARG_STRLEN, 128 KB),
# which would silently produce an empty/invalid SBOM on the (Linux) scanner image.
# Coverage filter (precision over noise). The free OSSKB often matches a
# widely-copied file to a downstream project that vendored the library rather than
# the canonical upstream, so a real source tree produces many one-off matches to
# unrelated forks. Group file matches by library NAME and promote a component only
# when at least SCANOSS_MIN_FILES distinct files agree on it; single-file fork noise
# is dropped. Within a kept group the version and PURL are the consensus (most common)
# value, which also fixes per-file version disagreement. Set SCANOSS_MIN_FILES=1 to
# disable the filter (keep every single-file match).
MIN_FILES="${SCANOSS_MIN_FILES:-2}"
# Anything other than a plain run of digits (including empty) falls back to 2.
# The value is therefore a short integer, so handing it to jq via --argjson is
# nowhere near the per-argument size limit described above.
case "$MIN_FILES" in ''|*[!0-9]*) MIN_FILES=2 ;; esac
COMPS_FILE="$WORK/comps.json"

# _scanoss_consolidate_components RAW_JSON OUT_JSON MIN_FILES
#
# Turns the raw scan result (an object keyed by file path, each value an array of
# match records) into a compact JSON array of components.
#   $1  path of the raw result JSON to read
#   $2  path the component array is written to (truncated first)
#   $3  minimum number of distinct files that must support a library name
# Only records with id == "file" are considered. Records are grouped by
# lower-cased name; a group is kept when the number of DISTINCT file paths in it
# reaches the threshold. Counting distinct paths (not match records) matters
# because one file may carry several match records for the same name, and those
# must not add up to "two files agree".
# jq diagnostics are discarded; the function returns jq's exit status so the
# caller decides whether a failure is fatal.
_scanoss_consolidate_components() {
  jq -c --argjson minfiles "$3" '
    # Stage 1: one flat record per file-level match, remembering its file path.
    [ to_entries[]
      | .key as $file
      | .value[]?
      | select((.id // "") == "file")
      | { file: $file,
          name: (.component // ((.purl[0] // "") | sub("^pkg:[^/]+/"; ""))),
          version: (
            (.component // "") as $c
            | (.version // "")
            # Drop a "<component>-" or "<component>_" prefix, then a leading v/V
            # that is directly followed by a digit.
            | ltrimstr($c + "-") | ltrimstr($c + "_")
            | sub("^[vV](?=[0-9])"; "")
          ),
          purl: (.purl[0] // null),
          cpe: (.cpe[0]? // null),
          matched: (.matched // ""),
          licenses: [ .licenses[]?.name // empty ] }
      | select((.name // "") != "")
    ]
    # Stage 2: one component per (case-insensitive) library name.
    | group_by(.name | ascii_downcase)
    | map(
        . as $g
        | ($g | map(.file) | unique | length) as $files
        | select($files >= $minfiles)
        | ($g | map(.version) | map(select(. != ""))) as $vers
        | ($g | map(.purl) | map(select(. != null))) as $purls
        # Consensus = the value carried by the largest number of records.
        | (if ($purls | length) > 0 then ($purls | group_by(.) | max_by(length) | .[0]) else null end) as $purl
        | (if ($vers | length) > 0 then ($vers | group_by(.) | max_by(length) | .[0]) else "" end) as $ver
        | { type: "library",
            name: ($g[0].name),
            version: $ver,
            purl: $purl,
            cpe: ($g | map(.cpe) | map(select(. != null)) | (.[0] // null)),
            licenses: ( [ $g[].licenses[]? ] | map(select(. != null and . != ""))
                        | unique | map({ license: { name: . } }) ),
            properties: ( [
              { name: "bomlens:layer",         value: "vendored" },
              { name: "bomlens:identifiedBy",  value: "scanoss" },
              { name: "bomlens:scanoss:files", value: ($files | tostring) },
              { name: "bomlens:scanoss:match", value: ($g[0].matched) },
              { name: "bomlens:scanoss:purl",  value: ($purl // "") }
            ] | map(select((.value // "") != "")) )
          }
        # Strip keys whose value is null, empty string or empty array.
        | with_entries(select(.value != null and .value != "" and .value != []))
      )
    | sort_by(.purl // ((.name // "") + "@" + (.version // "")))
  ' "$1" > "$2" 2>/dev/null
}

# Best effort: a failed transform must not abort the script here.
_scanoss_consolidate_components "$RAW" "$COMPS_FILE" "$MIN_FILES" || true
# Guard: ensure a valid JSON array even if the transform failed.
diff --git a/docs/guides/identify-vendored.ko.md b/docs/guides/identify-vendored.ko.md index 52240f9..833c7bc 100644 --- a/docs/guides/identify-vendored.ko.md +++ b/docs/guides/identify-vendored.ko.md @@ -67,6 +67,6 @@ scan-sbom.sh --project trelay --version 26.4.0 --target ./src --identify-vendore 버전은 근사값입니다. 파일 매치는 그 파일 내용이 처음 등장한 릴리스를 버전으로 보고하므로, 같은 라이브러리라도 파일마다 버전이 조금씩 다르게 나오거나 실제보다 한 단계 어긋난 릴리스로 보고될 수 있습니다. 버전(과 그로부터 도출된 CVE)은 최종 판정이 아니라 검토의 출발점으로 삼으세요. -귀속(어느 프로젝트인지)도 틀릴 수 있습니다. 여러 프로젝트가 흔히 복사하는 파일(예: zlib의 `deflate.c`)은 정식 upstream이 아니라 그것을 vendored한 다운스트림 프로젝트로 매치될 수 있어, 실제 zlib 사본이 다른 이름으로 보고되고 그 CVE를 놓칠 수 있습니다. 이는 지식 베이스의 랭킹·커버리지 한계이며 무료 OSSKB에서 더 두드러집니다. 더 정확한 귀속이 필요하면 `SCANOSS_API_URL`을 SCANOSS 상용·자체 호스팅 엔드포인트로 지정하세요. 또한 공개 저장소에 이미 게시된 소스를 스캔하면 그 저장소로 매치됩니다(자기 1st-party 파일이 자기 공개 프로젝트로 매치) — 의도한 용도인 비공개 공급사 소스에서는 발생하지 않습니다. +귀속(어느 프로젝트인지)도 틀릴 수 있습니다. 여러 프로젝트가 흔히 복사하는 파일(예: zlib의 `deflate.c`)은 정식 upstream이 아니라 그것을 vendored한 다운스트림 프로젝트로 매치될 수 있습니다. 이 노이즈를 줄이기 위해 BomLens는 **최소 두 개 이상의 파일이 지지하는 라이브러리만 보고**하고(`SCANOSS_MIN_FILES`로 조정, `1`이면 모두 유지) 버전·PURL은 그 파일들의 **다수결**로 정합화합니다. 그래서 단발성 포크 매치는 떨어지고, 여러 포크로 흩어진 라이브러리는 하나의 컴포넌트로 합쳐집니다. 다만 완전한 해결은 아니며, 실제 사본이 여전히 다른 이름으로 보고되고 그 CVE를 놓칠 수 있습니다. 이는 지식 베이스의 랭킹·커버리지 한계이며 무료 OSSKB에서 더 두드러집니다. 더 정확한 귀속이 필요하면 `SCANOSS_API_URL`을 SCANOSS 상용·자체 호스팅 엔드포인트로 지정하세요. 또한 공개 저장소에 이미 게시된 소스를 스캔하면 그 저장소로 매치됩니다(자기 1st-party 파일이 자기 공개 프로젝트로 매치) — 의도한 용도인 비공개 공급사 소스에서는 발생하지 않습니다. 결과는 사람 검토가 도움이 되는 best-effort 추정입니다. OSSKB 약관과 라이선스 설명은 [THIRD_PARTY_LICENSES.md](https://github.com/sktelecom/sbom-tools/blob/main/THIRD_PARTY_LICENSES.md)를 참조하세요. diff --git a/docs/guides/identify-vendored.md b/docs/guides/identify-vendored.md index 3615303..acf6ada 100644 --- a/docs/guides/identify-vendored.md +++ b/docs/guides/identify-vendored.md @@ -67,6 +67,6 @@ scan-sbom.sh --project trelay --version 26.4.0 --target ./src --identify-vendore Version precision is approximate. 
A file match reports the release where that file content first appeared, so different files of the same library can resolve to slightly different versions and a copied-in library may be reported a point release off. Treat the version (and any CVEs derived from it) as a starting point for review, not a final verdict. -Attribution can also point at the wrong project. A file that many projects copy (for example zlib's `deflate.c`) may match a downstream project that vendored it rather than the canonical upstream — so a real zlib copy can be reported under another name, and its CVEs missed. This is a ranking and coverage limit of the knowledge base; it is more pronounced on the free OSSKB. For higher-fidelity attribution, point `SCANOSS_API_URL` at a SCANOSS commercial or self-hosted endpoint. Relatedly, scanning source that is itself published in a public repository will match that repository (your own first-party files can match your own public project) — this does not occur for the intended case of private supplier source. +Attribution can also point at the wrong project. A file that many projects copy (for example zlib's `deflate.c`) may match a downstream project that vendored it rather than the canonical upstream. To cut that noise, BomLens reports a library only when at least two files agree on it (configurable with `SCANOSS_MIN_FILES`; set `1` to keep every match) and resolves the version and PURL from the consensus across those files, so scattered one-off fork matches are dropped and a library split across forks collapses to a single component. This helps but does not fully fix it — a real copy can still be reported under another name, and its CVEs missed. It is a ranking and coverage limit of the knowledge base, more pronounced on the free OSSKB; for higher-fidelity attribution, point `SCANOSS_API_URL` at a SCANOSS commercial or self-hosted endpoint. 
Relatedly, scanning source that is itself published in a public repository will match that repository (your own first-party files can match your own public project) — this does not occur for the intended case of private supplier source. Results are a best-effort estimate that benefits from human review. See the OSSKB terms and license notes in [THIRD_PARTY_LICENSES.md](https://github.com/sktelecom/sbom-tools/blob/main/THIRD_PARTY_LICENSES.md). diff --git a/docs/reference/cli.ko.md b/docs/reference/cli.ko.md index e775653..d6d6b81 100644 --- a/docs/reference/cli.ko.md +++ b/docs/reference/cli.ko.md @@ -50,6 +50,7 @@ BomLens의 전체 옵션과 분석 모드, CI/CD 통합 방법, 트러블슈팅 | `SBOM_FIRMWARE_IMAGE` | `ghcr.io/sktelecom/bomlens-firmware:latest` | 펌웨어 분석용 이미지 지정 | | `SCANOSS_API_URL` | OSSKB 무료 API | `--identify-vendored`의 엔드포인트. 에어갭·대량 사용 시 SCANOSS 상용·자체 호스팅 엔드포인트로 지정 | | `SCANOSS_API_KEY` | — | `SCANOSS_API_URL`이 요구하는 경우의 자격 증명 | +| `SCANOSS_MIN_FILES` | `2` | 라이브러리를 보고하기 위해 매치돼야 하는 최소 파일 수. 단발성 다운스트림 포크 노이즈를 거른다. `1`로 두면 단일 파일 매치도 모두 유지 | | `GIT_TOKEN` | — | 비공개 git 저장소 클론에 쓰는 토큰 | | `COSIGN_KEY` | — | `--sign`에 쓰는 서명 키 경로 | | `FETCH_LICENSE` | `true` | 소스 스캔 시 의존성 라이선스를 자동 조회. `false`면 조회를 생략해 속도를 높임 | diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 04b9154..ec88378 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -50,6 +50,7 @@ Environment variables adjust the behavior. | `SBOM_FIRMWARE_IMAGE` | `ghcr.io/sktelecom/bomlens-firmware:latest` | Image used for firmware analysis | | `SCANOSS_API_URL` | OSSKB free API | Endpoint for `--identify-vendored`. Point at a SCANOSS commercial or self-hosted endpoint for air-gapped or high-volume use | | `SCANOSS_API_KEY` | — | Credential for `SCANOSS_API_URL`, if the endpoint requires one | +| `SCANOSS_MIN_FILES` | `2` | Minimum number of files that must match a library before it is reported, to drop one-off downstream-fork noise. 
Set `1` to keep every single-file match | | `GIT_TOKEN` | — | Token for cloning private git repositories | | `COSIGN_KEY` | — | Path to the signing key used by `--sign` | | `FETCH_LICENSE` | `true` | Resolve dependency licenses during source scans. Set `false` to skip the lookup and run faster | diff --git a/tests/fixtures/scanoss-raw-managed.json b/tests/fixtures/scanoss-raw-managed.json index 1aafb24..3b61c73 100644 --- a/tests/fixtures/scanoss-raw-managed.json +++ b/tests/fixtures/scanoss-raw-managed.json @@ -10,6 +10,17 @@ "matched": "100%" } ], + "node_modules_copy/lodash/core.js": [ + { + "id": "file", + "component": "lodash", + "vendor": "lodash", + "version": "4.17.21", + "purl": ["pkg:github/lodash/lodash"], + "licenses": [{ "name": "MIT" }], + "matched": "100%" + } + ], "src/liblfds/lfds.c": [ { "id": "file", @@ -20,5 +31,16 @@ "licenses": [{ "name": "Unlicense" }], "matched": "100%" } + ], + "src/liblfds/btree.c": [ + { + "id": "file", + "component": "liblfds", + "vendor": "liblfds", + "version": "6.1.1", + "purl": ["pkg:github/liblfds/liblfds"], + "licenses": [{ "name": "Unlicense" }], + "matched": "100%" + } ] } diff --git a/tests/fixtures/scanoss-raw.json b/tests/fixtures/scanoss-raw.json index b563b5d..563c874 100644 --- a/tests/fixtures/scanoss-raw.json +++ b/tests/fixtures/scanoss-raw.json @@ -10,6 +10,17 @@ "matched": "100%" } ], + "src/openssl/aes_core.c": [ + { + "id": "file", + "component": "openssl", + "vendor": "openssl", + "version": "openssl-3.0.0", + "purl": ["pkg:github/openssl/openssl"], + "licenses": [{ "name": "Apache-2.0" }], + "matched": "100%" + } + ], "src/liblfds/lfds.c": [ { "id": "file", @@ -21,6 +32,17 @@ "matched": "100%" } ], + "src/liblfds/btree.c": [ + { + "id": "file", + "component": "liblfds", + "vendor": "liblfds", + "version": "6.1.1", + "purl": ["pkg:github/liblfds/liblfds"], + "licenses": [{ "name": "Unlicense" }], + "matched": "100%" + } + ], "src/util/helpers.c": [ { "id": "snippet", diff --git 
chmod +x "$WORK/bin/scanoss-py"

# identify — run identify-vendored.sh with the mock.
#   $1  raw fixture handed to the mock through SCANOSS_RAW_FIXTURE
#   $2  output path passed to identify-vendored.sh
# SCANOSS_MIN_FILES=1 keeps single-file matches so these cases (which probe CPE
# grammar, version forms, injection, volume — not coverage) behave 1:1 with the
# crafted raw. The coverage filter itself is exercised in its own section below.
identify() {
  # ${WORK:?} aborts instead of letting an unset WORK turn this into `rm -rf /src`.
  local raw="$1" out="$2" src="${WORK:?}/src"
  rm -rf "$src"; mkdir -p "$src"; echo 'int main(void){return 0;}' > "$src/m.c"
  SCANOSS_RAW_FIXTURE="$raw" SCANOSS_MIN_FILES=1 PATH="$WORK/bin:$PATH" \
    bash "$LIB/identify-vendored.sh" "$src" "$out" "1.0" >/dev/null 2>&1
}
# valid_cdx — true if it parses and components is an array.

# [...] the part of the script between these two excerpts is unchanged and not shown.

bash "$LIB/normalize-sbom.sh" "$WORK/s1.json" --stable >/dev/null 2>&1
bash "$LIB/normalize-sbom.sh" "$WORK/s2.json" --stable >/dev/null 2>&1
if diff -q "$WORK/s1.json" "$WORK/s2.json" >/dev/null 2>&1; then pass "two identical scans are byte-identical after --stable"; else fail "byte-stable diff" "$(diff "$WORK/s1.json" "$WORK/s2.json" | head)"; fi

echo "== coverage filter: single-file fork noise dropped, real lib consolidated =="
# Models the real-OSSKB failure mode: a widely-copied library (openssl) matches
# scattered downstream forks (same NAME, differing purl/version), plus one-off
# single-file matches to unrelated projects. The default SCANOSS_MIN_FILES=2
# should drop the one-off noise and consolidate openssl to one component with the
# CONSENSUS version/purl (not the minority wrong one).
cat > "$WORK/cov.json" <<'COV'
{
  "f1.c":[{"id":"file","component":"openssl","version":"3.0.0","purl":["pkg:github/openssl/openssl"],"matched":"100%"}],
  "f2.c":[{"id":"file","component":"openssl","version":"3.0.0","purl":["pkg:github/openssl/openssl"],"matched":"100%"}],
  "f3.c":[{"id":"file","component":"openssl","version":"3.0.0","purl":["pkg:github/openssl/openssl"],"matched":"100%"}],
  "f4.c":[{"id":"file","component":"openssl","version":"OpenSSL_0_9_1c","purl":["pkg:github/bilibili/openssl"],"matched":"100%"}],
  "g1.c":[{"id":"file","component":"globus-toolkit","version":"1.0","purl":["pkg:github/globus/globus-toolkit"],"matched":"100%"}],
  "h1.c":[{"id":"file","component":"halite","version":"0.4","purl":["pkg:github/x/halite"],"matched":"100%"}],
  "l1.c":[{"id":"file","component":"liblfds","version":"6.1.1","purl":["pkg:github/liblfds/liblfds"],"matched":"100%"}],
  "l2.c":[{"id":"file","component":"liblfds","version":"6.1.1","purl":["pkg:github/liblfds/liblfds"],"matched":"100%"}]
}
COV
mkdir -p "$WORK/srctree"; echo 'int x;' > "$WORK/srctree/a.c"
# SCANOSS_MIN_FILES is pinned to empty so a value exported in the caller's
# environment cannot leak in; empty makes the script fall back to its default.
SCANOSS_RAW_FIXTURE="$WORK/cov.json" SCANOSS_MIN_FILES="" PATH="$WORK/bin:$PATH" \
  bash "$LIB/identify-vendored.sh" "$WORK/srctree" "$WORK/cov-out.json" "1.0" >/dev/null 2>&1

# Explicit if/else (not `test && pass || fail`) so `fail` can never run just
# because `pass` returned non-zero.
names=$(jq -r '[.components[].name] | sort | join(",")' "$WORK/cov-out.json" 2>/dev/null)
if [ "$names" = "liblfds,openssl" ]; then
  pass "default threshold keeps only multi-file libs (openssl, liblfds); fork noise dropped"
else
  fail "coverage filter result='$names', expected liblfds,openssl"
fi

ov=$(jq -r '.components[]|select(.name=="openssl")|.version' "$WORK/cov-out.json" 2>/dev/null)
op=$(jq -r '.components[]|select(.name=="openssl")|.purl' "$WORK/cov-out.json" 2>/dev/null)
if [ "$ov" = "3.0.0" ]; then
  pass "openssl resolves to the consensus version (3.0.0, not the minority 0.9.1c)"
else
  fail "openssl version='$ov', expected 3.0.0"
fi
if [ "$op" = "pkg:github/openssl/openssl" ]; then
  pass "openssl resolves to the consensus PURL (canonical, not the fork)"
else
  fail "openssl purl='$op'"
fi

ofiles=$(jq -r '.components[]|select(.name=="openssl")|.properties[]|select(.name=="bomlens:scanoss:files")|.value' "$WORK/cov-out.json" 2>/dev/null)
if [ "$ofiles" = "4" ]; then
  pass "openssl carries its file-support count (bomlens:scanoss:files=4)"
else
  fail "openssl files='$ofiles', expected 4"
fi

# Disabling the filter keeps every single-file match (back-compat escape hatch).
SCANOSS_RAW_FIXTURE="$WORK/cov.json" SCANOSS_MIN_FILES=1 PATH="$WORK/bin:$PATH" \
  bash "$LIB/identify-vendored.sh" "$WORK/srctree" "$WORK/cov1.json" "1.0" >/dev/null 2>&1
n1=$(jq '[.components[]?]|length' "$WORK/cov1.json" 2>/dev/null)
if [ "$n1" = "4" ]; then
  pass "SCANOSS_MIN_FILES=1 disables the filter (all 4 names kept)"
else
  fail "min_files=1 produced $n1 components, expected 4"
fi

echo ""
echo "Results: ${PASS} passed, ${FAIL} failed"
[ "$FAIL" -eq 0 ]