diff --git a/PSEUDOCODE_DESCRIPTION.md b/PSEUDOCODE_DESCRIPTION.md index 5c8a018..160db63 100644 --- a/PSEUDOCODE_DESCRIPTION.md +++ b/PSEUDOCODE_DESCRIPTION.md @@ -1,14 +1,15 @@ ### The `auth_request` endpoint that Nginx talks to Nginx `proxy_pass`es to banjax, which makes a decision (Allow, Challenge, NginxBlock, or IptablesBlock) -based on the requested host and the client IP. In pseudocode, banjax's decision-making works like this: +based on the requested host, the client IP, and the client User-Agent. In pseudocode, banjax's +decision-making works like this: ```python if has_valid_password_cookie(): return access_granted() if password_protected_path[requested_host][requested_path]: - if password_protected_path_exceptions[requested_host]: + if password_protected_path_exceptions[requested_host][requested_path]: return access_granted() else: return send_or_validate_password_page() @@ -21,36 +22,59 @@ if decision == Challenge: if decision in [NginxBlock, IptablesBlock]: return access_denied() +decision = per_site_user_agent_decision_lists[requested_host][client_user_agent] +if decision == Allow: + return access_granted() +if decision == Challenge: + return send_or_validate_challenge() +if decision in [NginxBlock, IptablesBlock]: + return access_denied() + decision = global_decision_lists[client_ip] # [...] same as above +decision = global_user_agent_decision_lists[client_user_agent] +# [...] 
same as above + decision = expiring_decision_lists[client_ip] if decision == Allow: return access_granted() if decision == Challenge: - if sites_to_disable_baskerville[requested_host] and decision.from_baskerville): - # skip challenge, go to sitewide_sha_inv_list check below - return send_or_validate_challenge() + if per_site_sha_inv_path_exceptions[requested_host][requested_path]: + return access_granted() + if sites_to_disable_baskerville[requested_host] and decision.from_baskerville: + pass # skip challenge, fall through to sitewide_sha_inv_list check below + else: + return send_or_validate_challenge() if decision in [NginxBlock, IptablesBlock]: - return access_denied() + if sites_to_disable_baskerville[requested_host] and decision.from_baskerville: + pass # skip block, fall through to sitewide_sha_inv_list check below + else: + return access_denied() if sitewide_sha_inv_list[requested_host]: - # sharing exception path with password protected paths - if password_protected_path_exceptions[requested_host]: + fail_action = sitewide_sha_inv_list[requested_host].fail_action # Block or Allow + if password_protected_path_exceptions[requested_host][requested_path]: return access_granted() else: - return send_or_validate_challenge() + return send_or_validate_challenge(fail_action) # if nothing matched above return access_granted() ``` -The decision lists are populated from: - * the config file, which is read at startup and on SIGHUP [XXX todo]. See `per_site_decision_lists` +The IP-based decision lists are populated from: + * the config file, which is read at startup and reloaded on SIGHUP. See `per_site_decision_lists` and `global_decision_lists`. This is useful for allowlisting or blocklisting known good or bad IPs. * the regex-based rate-limiting rules explained in more detail below. * commands received over the Kafka connection. This is how Baskerville communicates with banjax. 
+The User-Agent decision lists (`per_site_user_agent_decision_lists` and `global_user_agent_decision_lists`) +are static, loaded from the config file only. Each entry is matched either as a plain substring or as a regex +pattern; an entry is treated as a regex when it contains any regex metacharacter (including `.`). Regex patterns are compiled once, at +config load time. Within a list, decisions are checked in severity order (IptablesBlock → NginxBlock → Challenge → Allow) and the first +matching pattern wins, so a block entry takes precedence over an allow entry that matches the same User-Agent. + `access_granted()` returns a response with a header: `X-Accel-Redirect: @access_granted` which instructs Nginx to perform an internal redirect to the location block named `@access_granted`. That block should `proxy_pass` to the upstream origin site. @@ -106,9 +130,9 @@ else: return 401, challenge page + new cookie ``` -Note that this will currently serve an unlimited number of challenges to a bot that isn't solving them. -banjax's predecessor would eventually block this kind of bot at the iptables level, but there are some -intel-gathering benefits in not doing that. We will probably want to rate-limit this, though. [XXX todo] +If a client fails too many challenges (exceeding `too_many_failed_challenges_threshold`), they are +blocked at the iptables level (or nginx level if the IP is in a per-site allowlist). This rate-limiting +applies to both SHA-inverse and password challenges. ### Regex-based rate-limits @@ -149,7 +173,6 @@ for log_line in lines(log_file): The actual code has some extra stuff to deal with adding/removing iptables rules and clearing stale decisions from the Nginx cache. -[XXX todo] banjax's predecessor never unblocked an IP after it triggered a rule (it would stay in effect -until ATS restarted). We probably want to add an explicit time limit somewhere (per rule or global?). Also, note -to self to be careful here: when I delete a regex-triggered Decision, it should probably restore any Decision -that might have been loaded from the config file. 
+Regex-triggered decisions are stored in the expiring decision list with a TTL controlled by +`expiring_decision_ttl_seconds`. After the TTL expires, the decision is removed and any static +decision from the config file takes effect again. diff --git a/banjax-config.yaml b/banjax-config.yaml index d30eb1e..3025159 100644 --- a/banjax-config.yaml +++ b/banjax-config.yaml @@ -147,3 +147,20 @@ dnet: dnext1 dnet_to_partition: dnext1: 0 dnext2: 1 + +per_site_user_agent_decision_lists: + "localhost": + nginx_block: + - "AhrefsBot" + challenge: + - "Macintosh.*Firefox/\\d+" + allow: + - "Googlebot" + +global_user_agent_decision_lists: + nginx_block: + - "SemrushBot" + challenge: + - ".*Firefox/\\d+" + allow: + - "Googlebot" diff --git a/banjax_base_test.go b/banjax_base_test.go index 14d2440..91a8245 100644 --- a/banjax_base_test.go +++ b/banjax_base_test.go @@ -24,6 +24,7 @@ const fixtureConfigTestShaInv = "./fixtures/banjax-config-test-sha-inv.yaml" const fixtureConfigTestRegexBanner = "./fixtures/banjax-config-test-regex-banner.yaml" const fixtureConfigTestReloadCIDR = "./fixtures/banjax-config-test-reload-cidr.yaml" const fixtureConfigTestPersiteFail = "./fixtures/banjax-config-test-persite-fail.yaml" +const fixtureConfigTestUA = "./fixtures/banjax-config-test-ua.yaml" var tmpDir string var configFile string @@ -194,6 +195,17 @@ func ClientIP(ip string) http.Header { return http.Header{"X-Client-IP": {ip}} } +func ClientUserAgent(ua string) http.Header { + return http.Header{"X-Client-User-Agent": {ua}} +} + +func ClientIPAndUserAgent(ip string, ua string) http.Header { + return http.Header{ + "X-Client-IP": {ip}, + "X-Client-User-Agent": {ua}, + } +} + func randomIP() string { octets := []string{} for i := 0; i < 4; i++ { diff --git a/banjax_integration_test.go b/banjax_integration_test.go index 5fcd550..efe75bf 100644 --- a/banjax_integration_test.go +++ b/banjax_integration_test.go @@ -405,3 +405,59 @@ func TestRegexesWithRatesAllowList(t *testing.T) { {"GET", 
prefix + "/blockme/", 200, ClientIP("20.20.20.20"), nil}, }) } + +func TestGlobalUserAgentDecisionLists(t *testing.T) { + defer reloadConfig(fixtureConfigTest, 1, t) + + reloadConfig(fixtureConfigTestUA, 1, t) + + /* + global_user_agent_decision_lists: + nginx_block: + - "AhrefsBot" + - "SemrushBot" + challenge: + - "Macintosh.*Firefox/\\d+" + */ + prefix := "/auth_request?path=" + httpTester(t, []TestResource{ + {"GET", "/info", 200, nil, []string{"2025-01-01"}}, + // AhrefsBot is globally nginx_blocked (403) + {"GET", prefix + "/ua_ahref", 403, ClientUserAgent("Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)"), nil}, + // SemrushBot is globally nginx_blocked (403) + {"GET", prefix + "/ua_semrush", 403, ClientUserAgent("Mozilla/5.0 (compatible; SemrushBot/7.0; +http://www.semrush.com/bot.html)"), nil}, + // Firefox on Mac is globally challenged (429) + {"GET", prefix + "/ua_firefox_mac", 429, ClientUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:149.0) Gecko/20100101 Firefox/149.0"), nil}, + // Firefox on Windows does not match the Macintosh pattern — allowed (200) + {"GET", prefix + "/ua_firefox_win", 200, ClientUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:149.0) Gecko/20100101 Firefox/149.0"), nil}, + // Googlebot has no UA rule — allowed (200) + {"GET", prefix + "/ua_googlebot", 200, ClientUserAgent("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"), nil}, + }) +} + +func TestPerSiteUserAgentDecisionLists(t *testing.T) { + defer reloadConfig(fixtureConfigTest, 1, t) + + reloadConfig(fixtureConfigTestUA, 1, t) + + /* + per_site_user_agent_decision_lists: + "localhost:8081": + allow: + - "GPTBot" + + global_decision_lists: + challenge: + - 8.8.8.8 + */ + prefix := "/auth_request?path=" + httpTester(t, []TestResource{ + {"GET", "/info", 200, nil, []string{"2025-01-01"}}, + // 8.8.8.8 is in global challenge IP list — should be challenged without a UA override + {"GET", prefix + "/ua_ip_challenge", 
429, ClientIP("8.8.8.8"), nil}, + // GPTBot from 8.8.8.8: per-site UA allow overrides the global IP challenge + {"GET", prefix + "/ua_gptbot_override", 200, ClientIPAndUserAgent("8.8.8.8", "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"), nil}, + // AhrefsBot from 8.8.8.8: global IP challenge fires before global UA block (per-site UA has no AhrefsBot rule) + {"GET", prefix + "/ua_ahref_challenged_ip", 429, ClientIPAndUserAgent("8.8.8.8", "Mozilla/5.0 (compatible; AhrefsBot/7.0)"), nil}, + }) +} diff --git a/fixtures/banjax-config-test-ua.yaml b/fixtures/banjax-config-test-ua.yaml new file mode 100644 index 0000000..3dcba0d --- /dev/null +++ b/fixtures/banjax-config-test-ua.yaml @@ -0,0 +1,55 @@ +config_version: 2025-01-01_00:00:00 +global_decision_lists: + allow: + - 20.20.20.20 + nginx_block: + - 70.80.90.100 + challenge: + - 8.8.8.8 +iptables_ban_seconds: 10 +iptables_unbanner_seconds: 5 +kafka_brokers: + - "localhost:9092" +kafka_security_protocol: 'ssl' +kafka_ssl_ca: "/etc/banjax/caroot.pem" +kafka_ssl_key: "/etc/banjax/key.pem" +kafka_ssl_key_password: password +kafka_report_topic: 'banjax_report_topic' +kafka_command_topic: 'banjax_command_topic' +password_protected_paths: + "localhost": + - wp-admin +password_protected_path_exceptions: + "localhost": + - wp-admin/admin-ajax.php +password_hashes: + "localhost:8081": "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" + "localhost": "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" +sitewide_sha_inv_list: + example.com: block + foobar.com: no_block +server_log_file: /var/log/banjax/banjax-format.log +banning_log_file: /etc/banjax/ban_ip_list.log +expiring_decision_ttl_seconds: 10 +too_many_failed_challenges_interval_seconds: 10 +too_many_failed_challenges_threshold: 6 +password_cookie_ttl_seconds: 14400 +sha_inv_cookie_ttl_seconds: 14400 +hmac_secret: secret +gin_log_file: /var/log/banjax/gin.log +metrics_log_file: /var/log/banjax/metrics.log 
+standalone_testing: true + +# UA blocking: AhrefsBot globally blocked, Firefox on Mac globally challenged +# GPTBot is allowed on localhost:8081 (per-site override of the global IP challenge for 8.8.8.8) +global_user_agent_decision_lists: + nginx_block: + - "AhrefsBot" + - "SemrushBot" + challenge: + - "Macintosh.*Firefox/\\d+" + +per_site_user_agent_decision_lists: + "localhost:8081": + allow: + - "GPTBot" diff --git a/internal/config.go b/internal/config.go index d1297aa..79ca213 100644 --- a/internal/config.go +++ b/internal/config.go @@ -80,6 +80,8 @@ type Config struct { SitesToShaInvPathExceptions map[string][]string `yaml:"sha_inv_path_exceptions"` DNet string `yaml:"dnet"` DNetToPartition map[string]int `yaml:"dnet_to_partition"` + PerSiteUserAgentDecisionLists map[string]map[string][]string `yaml:"per_site_user_agent_decision_lists"` + GlobalUserAgentDecisionLists map[string][]string `yaml:"global_user_agent_decision_lists"` } type RegexWithRate struct { diff --git a/internal/decision.go b/internal/decision.go index 766f958..d2e8261 100644 --- a/internal/decision.go +++ b/internal/decision.go @@ -161,6 +161,20 @@ func (l *StaticDecisionLists) CheckGlobal(config *Config, clientIp string) (Deci } } +func (l *StaticDecisionLists) CheckPerSiteUserAgent(site string, userAgent string) (Decision, bool) { + c := l.content.Load() + rules, ok := c.perSiteUserAgentDecisionLists[site] + if !ok { + return 0, false + } + return checkUADecision(rules, userAgent) +} + +func (l *StaticDecisionLists) CheckGlobalUserAgent(userAgent string) (Decision, bool) { + c := l.content.Load() + return checkUADecision(c.globalUserAgentDecisionLists, userAgent) +} + func (l *StaticDecisionLists) CheckSitewideShaInv(site string) (FailAction, bool) { c := l.content.Load() @@ -245,15 +259,19 @@ type staticDecisionLists struct { sitewideShaInvList siteToFailAction globalDecisionListsIPFilter decisionToIPFilter perSiteDecisionListsIPFilter siteToDecisionToIPFilter + 
perSiteUserAgentDecisionLists perSiteUAPatternToDecision + globalUserAgentDecisionLists globalUAPatternToDecision } func newStaticDecisionLists() staticDecisionLists { return staticDecisionLists{ - globalDecisionLists: make(ipAddrToDecision), - perSiteDecisionLists: make(siteToIPAddrToDecision), - sitewideShaInvList: make(siteToFailAction), - globalDecisionListsIPFilter: make(decisionToIPFilter), - perSiteDecisionListsIPFilter: make(siteToDecisionToIPFilter), + globalDecisionLists: make(ipAddrToDecision), + perSiteDecisionLists: make(siteToIPAddrToDecision), + sitewideShaInvList: make(siteToFailAction), + globalDecisionListsIPFilter: make(decisionToIPFilter), + perSiteDecisionListsIPFilter: make(siteToDecisionToIPFilter), + perSiteUserAgentDecisionLists: make(perSiteUAPatternToDecision), + globalUserAgentDecisionLists: make(globalUAPatternToDecision), } } @@ -333,6 +351,22 @@ func newStaticDecisionListsFromConfig(config *Config) (staticDecisionLists, erro out.sitewideShaInvList[site] = failAction } + if len(config.GlobalUserAgentDecisionLists) > 0 { + globalUA, err := buildGlobalUAPatternToDecision(config.GlobalUserAgentDecisionLists) + if err != nil { + return staticDecisionLists{}, fmt.Errorf("failed to build global user agent decision lists: %w", err) + } + out.globalUserAgentDecisionLists = globalUA + } + + if len(config.PerSiteUserAgentDecisionLists) > 0 { + perSiteUA, err := buildPerSiteUAPatternToDecision(config.PerSiteUserAgentDecisionLists) + if err != nil { + return staticDecisionLists{}, fmt.Errorf("failed to build per-site user agent decision lists: %w", err) + } + out.perSiteUserAgentDecisionLists = perSiteUA + } + log.Printf("global decisions: %v\n", out.globalDecisionLists) log.Printf("per-site decisions: %v\n", out.perSiteDecisionLists) diff --git a/internal/http_server.go b/internal/http_server.go index 0705252..282dc3c 100644 --- a/internal/http_server.go +++ b/internal/http_server.go @@ -338,7 +338,9 @@ func addOurXHeadersForTesting(c 
*gin.Context) { } c.Request.Header.Set("X-Requested-Host", c.Request.Host) c.Request.Header.Set("X-Requested-Path", c.Query("path")) - c.Request.Header.Set("X-Client-User-Agent", "mozilla") + if c.Request.Header.Get("X-Client-User-Agent") == "" { + c.Request.Header.Set("X-Client-User-Agent", "mozilla") + } c.Next() } @@ -761,6 +763,12 @@ const ( PerSiteShaInvPathException SiteWideChallenge SiteWideChallengeException + PerSiteUAAccessGranted + PerSiteUAChallenge + PerSiteUABlock + GlobalUAAccessGranted + GlobalUAChallenge + GlobalUABlock NoMention NotSet ) @@ -781,6 +789,12 @@ var DecisionListResultToString = map[DecisionListResult]string{ PerSiteShaInvPathException: "PerSiteShaInvPathException", SiteWideChallenge: "SiteWideChallenge", SiteWideChallengeException: "SiteWideChallengeException", + PerSiteUAAccessGranted: "PerSiteUAAccessGranted", + PerSiteUAChallenge: "PerSiteUAChallenge", + PerSiteUABlock: "PerSiteUABlock", + GlobalUAAccessGranted: "GlobalUAAccessGranted", + GlobalUAChallenge: "GlobalUAChallenge", + GlobalUABlock: "GlobalUABlock", NoMention: "NoMention", NotSet: "NotSet", } @@ -807,6 +821,7 @@ type DecisionForNginxResult struct { PasswordChallengeResult *PasswordChallengeResult // these are pointers so they can be optionally nil ShaChallengeResult *ShaChallengeResult TooManyFailedChallengesResult *RateLimitResult + ClientUserAgent string } func decisionForNginx( @@ -856,6 +871,7 @@ func decisionForNginx2( clientIp := c.Request.Header.Get("X-Client-IP") requestedHost := c.Request.Header.Get("X-Requested-Host") requestedPath := c.Request.Header.Get("X-Requested-Path") + clientUserAgent := c.Request.Header.Get("X-Client-User-Agent") requestedProtectedPath := CleanRequestedPath(requestedPath) // log.Println("clientIp: ", clientIp, " requestedHost: ", requestedHost, " requestedPath: ", requestedPath) @@ -864,6 +880,7 @@ func decisionForNginx2( decisionForNginxResult.RequestedHost = requestedHost decisionForNginxResult.RequestedPath = requestedPath 
decisionForNginxResult.DecisionListResult = NotSet + decisionForNginxResult.ClientUserAgent = clientUserAgent // check if user has a valid password cookie, if so, allow them through passwordCookie, passwordCookieErr := c.Cookie(PasswordCookieName) @@ -946,6 +963,33 @@ func decisionForNginx2( } } + uaDecision, foundInPerSiteUA := staticDecisionLists.CheckPerSiteUserAgent(requestedHost, clientUserAgent) + if foundInPerSiteUA { + switch uaDecision { + case Allow: + accessGranted(c, config, DecisionListResultToString[PerSiteUAAccessGranted], -1.0, "", IntegrityCheckPayloadWrapper{}) + decisionForNginxResult.DecisionListResult = PerSiteUAAccessGranted + return + case Challenge: + sendOrValidateShaChallengeResult := sendOrValidateShaChallenge( + config, + c, + banner, + failedChallengeStates, + Block, // FailAction + staticDecisionLists, + ) + decisionForNginxResult.DecisionListResult = PerSiteUAChallenge + decisionForNginxResult.ShaChallengeResult = &sendOrValidateShaChallengeResult.ShaChallengeResult + decisionForNginxResult.TooManyFailedChallengesResult = &sendOrValidateShaChallengeResult.TooManyFailedChallengesResult + return + case NginxBlock, IptablesBlock: + accessDenied(c, config, DecisionListResultToString[PerSiteUABlock], -1.0, "", IntegrityCheckPayloadWrapper{}) + decisionForNginxResult.DecisionListResult = PerSiteUABlock + return + } + } + decision, foundInGlobalList := staticDecisionLists.CheckGlobal(config, clientIp) if foundInGlobalList { switch decision { @@ -976,6 +1020,33 @@ func decisionForNginx2( } } + uaDecisionGlobal, foundInGlobalUA := staticDecisionLists.CheckGlobalUserAgent(clientUserAgent) + if foundInGlobalUA { + switch uaDecisionGlobal { + case Allow: + accessGranted(c, config, DecisionListResultToString[GlobalUAAccessGranted], -1.0, "", IntegrityCheckPayloadWrapper{}) + decisionForNginxResult.DecisionListResult = GlobalUAAccessGranted + return + case Challenge: + sendOrValidateShaChallengeResult := sendOrValidateShaChallenge( + config, + c, + 
banner, + failedChallengeStates, + Block, // FailAction + staticDecisionLists, + ) + decisionForNginxResult.DecisionListResult = GlobalUAChallenge + decisionForNginxResult.ShaChallengeResult = &sendOrValidateShaChallengeResult.ShaChallengeResult + decisionForNginxResult.TooManyFailedChallengesResult = &sendOrValidateShaChallengeResult.TooManyFailedChallengesResult + return + case NginxBlock, IptablesBlock: + accessDenied(c, config, DecisionListResultToString[GlobalUABlock], -1.0, "", IntegrityCheckPayloadWrapper{}) + decisionForNginxResult.DecisionListResult = GlobalUABlock + return + } + } + // i think this needs to point to a struct {decision: Decision, expires: Time}. // when we insert something into the list, really we might just be extending the expiry time and/or // changing the decision. diff --git a/internal/user_agent_decision.go b/internal/user_agent_decision.go new file mode 100644 index 0000000..11cdd26 --- /dev/null +++ b/internal/user_agent_decision.go @@ -0,0 +1,96 @@ +// Copyright (c) 2025, eQualit.ie inc. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +package internal + +import ( + "fmt" + "regexp" + "strings" +) + +// uaPattern holds a pre-compiled optional regex alongside the raw pattern string. +// If compiledRegex is nil, substring matching is used. +type uaPattern struct { + raw string + compiledRegex *regexp.Regexp +} + +func newUAPattern(raw string) (uaPattern, error) { + // Attempt to detect if the pattern is intended as a regex by trying to compile it. + // Simple substrings that happen to be valid regex (e.g. "GPTBot") compile fine and + // strings.Contains will be used for them since they contain no metacharacters. + // We only use regex when the string contains metacharacters. 
+ if containsRegexMetachar(raw) { + compiled, err := regexp.Compile(raw) + if err != nil { + return uaPattern{}, fmt.Errorf("invalid UA regex pattern %q: %w", raw, err) + } + return uaPattern{raw: raw, compiledRegex: compiled}, nil + } + return uaPattern{raw: raw}, nil +} + +func containsRegexMetachar(s string) bool { + return strings.ContainsAny(s, `\.+*?[]{}()|^$`) +} + +func matchUserAgent(p uaPattern, userAgent string) bool { + if p.compiledRegex != nil { + return p.compiledRegex.MatchString(userAgent) + } + return strings.Contains(userAgent, p.raw) +} + +// globalUAPatternToDecision maps decision → []pattern for global UA rules. +type globalUAPatternToDecision map[Decision][]uaPattern + +// perSiteUAPatternToDecision maps site → decision → []pattern for per-site UA rules. +type perSiteUAPatternToDecision map[string]globalUAPatternToDecision + +// checkUADecision iterates decisions in severity order and returns the first match. +func checkUADecision(rules globalUAPatternToDecision, userAgent string) (Decision, bool) { + for _, d := range []Decision{IptablesBlock, NginxBlock, Challenge, Allow} { + for _, p := range rules[d] { + if matchUserAgent(p, userAgent) { + return d, true + } + } + } + return 0, false +} + +// buildGlobalUAPatternToDecision builds a globalUAPatternToDecision from the raw config map. +func buildGlobalUAPatternToDecision(raw map[string][]string) (globalUAPatternToDecision, error) { + out := make(globalUAPatternToDecision) + for decisionString, patterns := range raw { + decision, err := ParseDecision(decisionString) + if err != nil { + return nil, fmt.Errorf("user_agent_decision_lists: %w", err) + } + for _, rawPattern := range patterns { + p, err := newUAPattern(rawPattern) + if err != nil { + return nil, err + } + out[decision] = append(out[decision], p) + } + } + return out, nil +} + +// buildPerSiteUAPatternToDecision builds a perSiteUAPatternToDecision from the raw config map. 
+func buildPerSiteUAPatternToDecision(raw map[string]map[string][]string) (perSiteUAPatternToDecision, error) { + out := make(perSiteUAPatternToDecision) + for site, decisionToPatterns := range raw { + global, err := buildGlobalUAPatternToDecision(decisionToPatterns) + if err != nil { + return nil, fmt.Errorf("per_site_user_agent_decision_lists[%s]: %w", site, err) + } + out[site] = global + } + return out, nil +} diff --git a/internal/user_agent_decision_test.go b/internal/user_agent_decision_test.go new file mode 100644 index 0000000..11ab82a --- /dev/null +++ b/internal/user_agent_decision_test.go @@ -0,0 +1,187 @@ +// Copyright (c) 2025, eQualit.ie inc. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +package internal + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// --- uaPattern / matchUserAgent --- + +func TestMatchUserAgent_Substring(t *testing.T) { + p, err := newUAPattern("GPTBot") + assert.Nil(t, err) + assert.Nil(t, p.compiledRegex) + + assert.True(t, matchUserAgent(p, "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)")) + assert.False(t, matchUserAgent(p, "Mozilla/5.0 (compatible; Googlebot/2.1)")) +} + +func TestMatchUserAgent_Regex(t *testing.T) { + p, err := newUAPattern(`Macintosh.*Firefox/\d+`) + assert.Nil(t, err) + assert.NotNil(t, p.compiledRegex) + + assert.True(t, matchUserAgent(p, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:149.0) Gecko/20100101 Firefox/149.0")) + assert.False(t, matchUserAgent(p, "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:149.0) Gecko/20100101 Firefox/149.0")) +} + +func TestMatchUserAgent_RegexCaseInsensitive(t *testing.T) { + p, err := newUAPattern(`(?i)scrapy|mechanize`) + assert.Nil(t, err) + assert.NotNil(t, p.compiledRegex) + + assert.True(t, matchUserAgent(p, "Scrapy/2.11.2 (+https://scrapy.org)")) + assert.True(t, matchUserAgent(p, 
"Python-Mechanize/0.4.9")) + assert.False(t, matchUserAgent(p, "Mozilla/5.0 (compatible; Googlebot/2.1)")) +} + +func TestNewUAPattern_InvalidRegex(t *testing.T) { + _, err := newUAPattern(`(?invalid`) + assert.NotNil(t, err) +} + +// --- checkUADecision severity order --- + +func TestCheckUADecision_SeverityOrder(t *testing.T) { + // Both Allow and NginxBlock match "TestBot" — NginxBlock should win (higher severity) + allowPattern, _ := newUAPattern("TestBot") + blockPattern, _ := newUAPattern("TestBot") + rules := globalUAPatternToDecision{ + Allow: []uaPattern{allowPattern}, + NginxBlock: []uaPattern{blockPattern}, + } + + decision, ok := checkUADecision(rules, "TestBot/1.0") + assert.True(t, ok) + assert.Equal(t, NginxBlock, decision) +} + +func TestCheckUADecision_NoMatch(t *testing.T) { + p, _ := newUAPattern("AhrefsBot") + rules := globalUAPatternToDecision{ + NginxBlock: []uaPattern{p}, + } + + _, ok := checkUADecision(rules, "Mozilla/5.0 (compatible; Googlebot/2.1)") + assert.False(t, ok) +} + +// --- StaticDecisionLists UA checks via config --- + +const uaDecisionListsConfString = ` +global_user_agent_decision_lists: + nginx_block: + - "AhrefsBot" + - "SemrushBot" + challenge: + - "(?i)scrapy|mechanize" + allow: + - "Googlebot" +per_site_user_agent_decision_lists: + "example.com": + allow: + - "GPTBot" + nginx_block: + - "AhrefsBot" + "other.com": + challenge: + - "Macintosh.*Firefox/\\d+" +` + +func TestCheckGlobalUserAgent(t *testing.T) { + config := loadConfigString(uaDecisionListsConfString) + lists, err := NewStaticDecisionLists(config) + assert.Nil(t, err) + + decision, ok := lists.CheckGlobalUserAgent("Mozilla/5.0 (compatible; AhrefsBot/7.0)") + assert.True(t, ok) + assert.Equal(t, NginxBlock, decision) + + decision, ok = lists.CheckGlobalUserAgent("Mozilla/5.0 (compatible; SemrushBot/7.0)") + assert.True(t, ok) + assert.Equal(t, NginxBlock, decision) + + decision, ok = lists.CheckGlobalUserAgent("Scrapy/2.11.2 (+https://scrapy.org)") + 
assert.True(t, ok) + assert.Equal(t, Challenge, decision) + + decision, ok = lists.CheckGlobalUserAgent("Mozilla/5.0 (compatible; Googlebot/2.1)") + assert.True(t, ok) + assert.Equal(t, Allow, decision) + + _, ok = lists.CheckGlobalUserAgent("Mozilla/5.0 (compatible; GPTBot/1.0)") + assert.False(t, ok) +} + +func TestCheckPerSiteUserAgent(t *testing.T) { + config := loadConfigString(uaDecisionListsConfString) + lists, err := NewStaticDecisionLists(config) + assert.Nil(t, err) + + // GPTBot is allowed on example.com (per-site override) + decision, ok := lists.CheckPerSiteUserAgent("example.com", "Mozilla/5.0 (compatible; GPTBot/1.0)") + assert.True(t, ok) + assert.Equal(t, Allow, decision) + + // AhrefsBot is blocked on example.com + decision, ok = lists.CheckPerSiteUserAgent("example.com", "Mozilla/5.0 (compatible; AhrefsBot/7.0)") + assert.True(t, ok) + assert.Equal(t, NginxBlock, decision) + + // Firefox on Mac is challenged on other.com + decision, ok = lists.CheckPerSiteUserAgent("other.com", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:149.0) Gecko/20100101 Firefox/149.0") + assert.True(t, ok) + assert.Equal(t, Challenge, decision) + + // Firefox on Windows does not match the Macintosh pattern + _, ok = lists.CheckPerSiteUserAgent("other.com", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:149.0) Gecko/20100101 Firefox/149.0") + assert.False(t, ok) + + // No per-site rules for unknown.com + _, ok = lists.CheckPerSiteUserAgent("unknown.com", "Mozilla/5.0 (compatible; AhrefsBot/7.0)") + assert.False(t, ok) +} + +func TestCheckPerSiteUserAgent_NoRulesForSite(t *testing.T) { + config := loadConfigString(uaDecisionListsConfString) + lists, err := NewStaticDecisionLists(config) + assert.Nil(t, err) + + _, ok := lists.CheckPerSiteUserAgent("notconfigured.com", "anything") + assert.False(t, ok) +} + +// --- Config parsing with invalid decision --- + +const uaBadDecisionConfString = ` +global_user_agent_decision_lists: + bad_decision: + - "SomeBot" +` + +func 
TestNewStaticDecisionLists_InvalidUADecision(t *testing.T) { + config := loadConfigString(uaBadDecisionConfString) + _, err := NewStaticDecisionLists(config) + assert.NotNil(t, err) +} + +// --- Config parsing with invalid regex --- + +const uaBadRegexConfString = ` +global_user_agent_decision_lists: + nginx_block: + - "(?invalid" +` + +func TestNewStaticDecisionLists_InvalidUARegex(t *testing.T) { + config := loadConfigString(uaBadRegexConfString) + _, err := NewStaticDecisionLists(config) + assert.NotNil(t, err) +}