Skip to content

Commit 9a2e98f

Browse files
committed
document ranking pipeline
1 parent fe10bc0 commit 9a2e98f

File tree

12 files changed

+88
-16
lines changed

12 files changed

+88
-16
lines changed

crates/core/src/ranking/computer/mod.rs

+13
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,19 @@
1414
// You should have received a copy of the GNU Affero General Public License
1515
// along with this program. If not, see <https://www.gnu.org/licenses/>.
1616

17+
//! The ranking computer is responsible for computing the core ranking signals for
18+
//! each potential page in the result set. This module handles the initial ranking phase
19+
//! that runs independently on each search node in the distributed search cluster.
20+
//!
21+
//! The computer evaluates a set of core ranking signals for each candidate page,
22+
//! including text-based relevance scores like BM25 and authority scores (harmonic centrality).
23+
//! These signals are combined using a linear model to produce an initial ranking score.
24+
//! The top pages are passed to the coordinator node for the final ranking phase.
25+
//!
26+
//! The core signals computed here are designed to be fast to calculate while still
27+
//! providing strong relevance signals. More expensive ranking features are deferred
28+
//! to the final ranking phase on the coordinator.
29+
1730
use crate::query::optic::AsSearchableRule;
1831
use crate::query::{Query, MAX_TERMS_FOR_NGRAM_LOOKUPS};
1932
use crate::ranking::bm25f::MultiBm25FWeight;

crates/core/src/ranking/mod.rs

+6
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
// You should have received a copy of the GNU Affero General Public License
1515
// along with this program. If not, see <https://www.gnu.org/licenses/>.
1616

17+
//! The ranking module is responsible for ranking pages based on their relevance to a query.
18+
//!
19+
//! The core ranking signals are computed by the `computer` module, which runs independently
20+
//! on each search shard in the search cluster. Increasingly complex stages
21+
//! run in the ranking pipeline on the coordinator node to produce the final ranking.
22+
1723
pub mod bitvec_similarity;
1824
pub mod bm25;
1925
pub mod bm25f;

crates/core/src/ranking/models/cross_encoder.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// Stract is an open source web search engine.
2-
// Copyright (C) 2023 Stract ApS
2+
// Copyright (C) 2024 Stract ApS
33
//
44
// This program is free software: you can redistribute it and/or modify
55
// it under the terms of the GNU Affero General Public License as
@@ -29,6 +29,9 @@ use crate::models::bert::BertModel;
2929

3030
const TRUNCATE_INPUT: usize = 128;
3131

32+
/// A cross-encoder model for ranking pages.
33+
///
34+
/// Takes a query and a page body as input and returns a score for the page.
3235
pub struct CrossEncoderModel {
3336
tokenizer: tokenizers::Tokenizer,
3437
encoder: BertModel,

crates/core/src/ranking/models/lambdamart.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// Stract is an open source web search engine.
2-
// Copyright (C) 2023 Stract ApS
2+
// Copyright (C) 2024 Stract ApS
33
//
44
// This program is free software: you can redistribute it and/or modify
55
// it under the terms of the GNU Affero General Public License as
@@ -240,6 +240,9 @@ impl Header {
240240
}
241241
}
242242

243+
/// A LambdaMART model for ranking pages.
244+
///
245+
/// Designed for efficient inference of LightGBM-compatible models.
243246
pub struct LambdaMART {
244247
trees: Vec<Tree>,
245248
}

crates/core/src/ranking/models/linear.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// Stract is an open source web search engine.
2-
// Copyright (C) 2023 Stract ApS
2+
// Copyright (C) 2024 Stract ApS
33
//
44
// This program is free software: you can redistribute it and/or modify
55
// it under the terms of the GNU Affero General Public License as

crates/core/src/ranking/pipeline/mod.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,10 @@ impl<T> StageOrModifier<T>
6868
where
6969
T: RankableWebpage + Send + Sync,
7070
{
71-
fn top_n(&self) -> Top {
71+
fn top(&self) -> Top {
7272
match self {
73-
StageOrModifier::Stage(stage) => stage.top_n(),
74-
StageOrModifier::Modifier(modifier) => modifier.top_n(),
73+
StageOrModifier::Stage(stage) => stage.top(),
74+
StageOrModifier::Modifier(modifier) => modifier.top(),
7575
}
7676
}
7777

@@ -139,7 +139,7 @@ where
139139
let coefficients = query.signal_coefficients();
140140

141141
for stage_or_modifier in self.stages_or_modifiers.iter() {
142-
let webpages = if let Top::Limit(top_n) = stage_or_modifier.top_n() {
142+
let webpages = if let Top::Limit(top_n) = stage_or_modifier.top() {
143143
if query.offset() > top_n {
144144
continue;
145145
}

crates/core/src/ranking/pipeline/modifiers/mod.rs

+22-2
Original file line numberDiff line numberDiff line change
@@ -14,28 +14,48 @@
1414
// You should have received a copy of the GNU Affero General Public License
1515
// along with this program. If not, see <https://www.gnu.org/licenses/>.
1616

17+
//! Modifiers are used to modify the ranking of pages.
18+
//!
19+
//! Each page is ranked by a linear combination of its signals, scaled by a boost factor:
20+
//! `score = boost * (signal_1 * weight_1 + signal_2 * weight_2 + ...)`
21+
//!
22+
//! Modifiers can either modify the multiplicative boost factor for
23+
//! each page or override the ranking entirely (if we want to rank
24+
//! for something other than the score).
25+
1726
mod inbound_similarity;
1827

1928
use super::{RankableWebpage, Top};
2029
pub use inbound_similarity::InboundSimilarity;
2130

31+
/// A modifier that gives full control over the ranking.
2232
pub trait FullModifier: Send + Sync {
2333
type Webpage: RankableWebpage;
34+
/// Modify the boost factor for each page.
2435
fn update_boosts(&self, webpages: &mut [Self::Webpage]);
2536

37+
/// Override ranking of the pages.
2638
fn rank(&self, webpages: &mut [Self::Webpage]) {
2739
webpages.sort_by(|a, b| b.score().partial_cmp(&a.score()).unwrap());
2840
}
2941

30-
fn top_n(&self) -> Top {
42+
/// The number of pages to return from this part of the pipeline.
43+
fn top(&self) -> Top {
3144
Top::Unlimited
3245
}
3346
}
3447

48+
/// A modifier that modifies the multiplicative boost factor for each page.
49+
///
50+
/// This is the most common type of modifier.
3551
pub trait Modifier: Send + Sync {
3652
type Webpage: RankableWebpage;
53+
/// Modify the boost factor for a page.
54+
///
55+
/// The new boost factor will be multiplied with the page's current boost factor.
3756
fn boost(&self, webpage: &Self::Webpage) -> f64;
3857

58+
/// The number of pages to return from this part of the pipeline.
3959
fn top(&self) -> Top {
4060
Top::Unlimited
4161
}
@@ -54,7 +74,7 @@ where
5474
}
5575
}
5676

57-
fn top_n(&self) -> Top {
77+
fn top(&self) -> Top {
5878
Modifier::top(self)
5979
}
6080
}

crates/core/src/ranking/pipeline/scorers/lambdamart.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ impl RankingStage for Arc<models::LambdaMART> {
3636
)
3737
}
3838

39-
fn top_n(&self) -> Top {
39+
fn top(&self) -> Top {
4040
Top::Limit(20)
4141
}
4242
}
@@ -59,7 +59,7 @@ impl RankingStage for PrecisionLambda {
5959
)
6060
}
6161

62-
fn top_n(&self) -> Top {
62+
fn top(&self) -> Top {
6363
Top::Limit(20)
6464
}
6565
}

crates/core/src/ranking/pipeline/scorers/mod.rs

+22-4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
// You should have received a copy of the GNU Affero General Public License
1515
// along with this program. If not, see <https://www.gnu.org/licenses/>.
1616

17+
//! Scorers are used to compute the ranking signals in the ranking pipeline.
18+
//!
19+
//! Each scorer computes a single signal which is then used to rank the pages.
20+
1721
pub mod embedding;
1822
pub mod inbound_similarity;
1923
pub mod lambdamart;
@@ -26,14 +30,23 @@ use crate::ranking::{SignalCalculation, SignalCoefficients, SignalEnum};
2630

2731
use super::{RankableWebpage, Top};
2832

33+
/// A ranking stage that computes some signals for each page.
34+
///
35+
/// This trait is implemented for all scorers.
36+
/// Most of the time you will want to implement the [`RankingStage`] trait instead,
37+
/// but this trait gives you more control over the ranking pipeline.
2938
pub trait FullRankingStage: Send + Sync {
3039
type Webpage: RankableWebpage;
3140

41+
/// Compute the signal for each page.
3242
fn compute(&self, webpages: &mut [Self::Webpage]);
33-
fn top_n(&self) -> Top {
43+
44+
/// The number of pages to return from this part of the pipeline.
45+
fn top(&self) -> Top {
3446
Top::Unlimited
3547
}
3648

49+
/// Update the score for each page.
3750
fn update_scores(&self, webpages: &mut [Self::Webpage], coefficients: &SignalCoefficients) {
3851
for webpage in webpages.iter_mut() {
3952
webpage.set_raw_score(webpage.signals().iter().fold(0.0, |acc, (signal, calc)| {
@@ -42,16 +55,21 @@ pub trait FullRankingStage: Send + Sync {
4255
}
4356
}
4457

58+
/// Rank the pages by their score.
4559
fn rank(&self, webpages: &mut [Self::Webpage]) {
4660
webpages.sort_by(|a, b| b.score().partial_cmp(&a.score()).unwrap());
4761
}
4862
}
4963

64+
/// A ranking stage that computes a single signal for each page.
5065
pub trait RankingStage: Send + Sync {
5166
type Webpage: RankableWebpage;
5267

68+
/// Compute the signal for a single page.
5369
fn compute(&self, webpage: &Self::Webpage) -> (SignalEnum, SignalCalculation);
54-
fn top_n(&self) -> Top {
70+
71+
/// The number of pages to return from this part of the pipeline.
72+
fn top(&self) -> Top {
5573
Top::Unlimited
5674
}
5775
}
@@ -69,7 +87,7 @@ where
6987
}
7088
}
7189

72-
fn top_n(&self) -> Top {
73-
self.top_n()
90+
fn top(&self) -> Top {
91+
self.top()
7492
}
7593
}

crates/core/src/ranking/pipeline/scorers/reranker.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ impl<M: CrossEncoder> FullRankingStage for ReRanker<M> {
6868
self.crossencoder_score_webpages(webpages);
6969
}
7070

71-
fn top_n(&self) -> Top {
71+
fn top(&self) -> Top {
7272
Top::Limit(20)
7373
}
7474
}

crates/core/src/ranking/pipeline/stages/precision.rs

+5
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414
// You should have received a copy of the GNU Affero General Public License
1515
// along with this program. If not, see <https://www.gnu.org/licenses/>.
1616

17+
//! The precision stage of the ranking pipeline.
18+
//!
19+
//! This stage focuses on refining the first page of results
20+
//! from the recall stage.
21+
1722
use std::sync::Arc;
1823

1924
use crate::{

crates/core/src/ranking/pipeline/stages/recall.rs

+4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
// You should have received a copy of the GNU Affero General Public License
1515
// along with this program. If not, see <https://www.gnu.org/licenses/>.
1616

17+
//! The recall stage of the ranking pipeline.
18+
//!
19+
//! This stage focuses on getting the best pages into the precision stage.
20+
1721
use std::sync::Arc;
1822

1923
use crate::{

0 commit comments

Comments
 (0)