diff --git a/browser/src/GenePage/GeneFlags.spec.tsx b/browser/src/GenePage/GeneFlags.spec.tsx index 45d29ef28..b10c4400a 100644 --- a/browser/src/GenePage/GeneFlags.spec.tsx +++ b/browser/src/GenePage/GeneFlags.spec.tsx @@ -29,4 +29,12 @@ describe('GeneFlags', () => { expect(tree).toMatchSnapshot() }) + + test('renders VEP 115 warning for RNU4ATAC', () => { + const testGene = geneFactory.build({ symbol: 'RNU4ATAC', reference_genome: 'GRCh38' }) + + const tree = renderer.create() + + expect(tree).toMatchSnapshot() + }) }) diff --git a/browser/src/GenePage/GeneFlags.tsx b/browser/src/GenePage/GeneFlags.tsx index 3150800a8..dcf08e848 100644 --- a/browser/src/GenePage/GeneFlags.tsx +++ b/browser/src/GenePage/GeneFlags.tsx @@ -13,11 +13,15 @@ type Props = { } const allOfUsCMRGGenes = ['CBS', 'KCNE1', 'CRYAA'] +const vep115Genes = ['RNU4ATAC'] const GeneFlags = ({ gene }: Props) => { const shouldDisplayCMRGWarning = gene.reference_genome === 'GRCh38' && allOfUsCMRGGenes.includes(gene.symbol) + const shouldDisplayVEP115Warning = + gene.reference_genome === 'GRCh38' && vep115Genes.includes(gene.symbol) + return ( <> {shouldDisplayCMRGWarning && ( @@ -35,6 +39,14 @@ const GeneFlags = ({ gene }: Props) => { ) callset to remedy this issue in the future.

)} + {shouldDisplayVEP115Warning && ( +

+ Warning MANE Select and variant consequence information in + this gene were annotated using Ensembl VEP version 115 (GENCODE v49). For more + information, see our{' '} + help page. +

+ )} {gene.flags.includes('chip') && (

Note Analysis of allele balance and age data indicates that diff --git a/browser/src/GenePage/GeneInfo.tsx b/browser/src/GenePage/GeneInfo.tsx index ccf461f22..a3bbaa35e 100644 --- a/browser/src/GenePage/GeneInfo.tsx +++ b/browser/src/GenePage/GeneInfo.tsx @@ -8,39 +8,40 @@ import Link from '../Link' import GeneReferences from './GeneReferences' type ManeSelectTranscriptIdProps = { - gene: { - mane_select_transcript: { - ensembl_id: string - ensembl_version: string - refseq_id: string - refseq_version: string - } - transcripts: { - transcript_id: string - transcript_version: string - }[] + mane_select_transcript: { + ensembl_id: string + ensembl_version: string + refseq_id: string + refseq_version: string } + transcripts: { + transcript_id: string + transcript_version: string + }[] } -const ManeSelectTranscriptId = ({ gene }: ManeSelectTranscriptIdProps) => { - const gencodeVersionOfManeSelectTransript = gene.transcripts.find( - (transcript: any) => transcript.transcript_id === gene.mane_select_transcript.ensembl_id +const ManeSelectTranscriptId = ({ + mane_select_transcript, + transcripts, +}: ManeSelectTranscriptIdProps) => { + const gencodeVersionOfManeSelectTranscript = transcripts.find( + (transcript) => transcript.transcript_id === mane_select_transcript.ensembl_id ) const shouldLinkToTranscriptPage = - gencodeVersionOfManeSelectTransript && - gencodeVersionOfManeSelectTransript.transcript_version === - gene.mane_select_transcript.ensembl_version + gencodeVersionOfManeSelectTranscript && + gencodeVersionOfManeSelectTranscript.transcript_version === + mane_select_transcript.ensembl_version return ( {shouldLinkToTranscriptPage ? ( - - {gene.mane_select_transcript.ensembl_id}.{gene.mane_select_transcript.ensembl_version} + + {mane_select_transcript.ensembl_id}.{mane_select_transcript.ensembl_version} ) : ( - `${gene.mane_select_transcript.ensembl_id}.${gene.mane_select_transcript.ensembl_version}` + `${mane_select_transcript.ensembl_id}.${mane_select_transcript.ensembl_version}` )}{' '} - / {gene.mane_select_transcript.refseq_id}.{gene.mane_select_transcript.refseq_version} + / {mane_select_transcript.refseq_id}.{mane_select_transcript.refseq_version} ) } @@ -109,8 +110,14 @@ const GeneInfo = ({ gene }: GeneInfoProps) => { } > - {/* @ts-expect-error TS(2322) FIXME: Type '{ gene_id: string; gene_version: string; sym... Remove this comment to see the full error message */} - {gene.mane_select_transcript ? : 'Not available'} + {gene.mane_select_transcript ? ( + + ) : ( + 'Not available' + )} )} diff --git a/browser/src/GenePage/__snapshots__/GeneFlags.spec.tsx.snap b/browser/src/GenePage/__snapshots__/GeneFlags.spec.tsx.snap index 72aeb4bab..4d1b9affe 100644 --- a/browser/src/GenePage/__snapshots__/GeneFlags.spec.tsx.snap +++ b/browser/src/GenePage/__snapshots__/GeneFlags.spec.tsx.snap @@ -32,6 +32,27 @@ exports[`GeneFlags renders CMRG flag if one of 3 relevant genes 1`] = `

`; +exports[`GeneFlags renders VEP 115 warning for RNU4ATAC 1`] = ` +

+ + Warning + + MANE Select and variant consequence information in this gene were annotated using Ensembl VEP version 115 (GENCODE v49). For more information, see our + + + help page + + . +

+`; + exports[`GeneFlags renders chip flag if present on gene 1`] = `

0) + veps = annotate_transcript_consequences_in_table(veps, transcripts_data=transcripts_data) + + # We filter the data again here because annotate_transcript_consequences_in_table removes consequences with unimportant consequences terms + veps = veps.filter(veps.transcript_consequences.length() > 0) + veps = veps.annotate( + transcript_consequences=veps.transcript_consequences.map( + lambda tc: tc.annotate( + transcript_version="2", + gene_version="2", + is_mane_select=False, + is_mane_select_version=False, + refseq_id=hl.null(hl.tstr), + refseq_version=hl.null(hl.tstr), + ) + ) + ) + veps = veps.annotate( + transcript_consequences=veps.transcript_consequences.map( + lambda tc: tc.drop("polyphen_prediction", "sift_prediction") + ) + ) + + freqs = freqs.drop("transcript_consequences") + veps = veps.join(freqs) + + # Include just consequences and index fields + veps = veps.select(veps.variant_id, veps.rsids, veps.caid, veps.vrs, veps.transcript_consequences) + return veps diff --git a/data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py b/data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py index 1ec656b03..103d876ce 100644 --- a/data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py +++ b/data-pipeline/src/data_pipeline/data_types/variant/transcript_consequence/annotate_transcript_consequences.py @@ -3,13 +3,18 @@ from .hgvs import hgvsp_from_consequence_amino_acids from .vep import consequence_term_rank - OMIT_CONSEQUENCE_TERMS = hl.set(["upstream_gene_variant", "downstream_gene_variant"]) +# ruff doesn't like explicit comparisons to None, but we need them in here, so: +# ruff: noqa: E711 + -def annotate_transcript_consequences(variants_path, transcripts_path, mane_transcripts_path=None): +def annotate_transcript_consequences(variants_path, transcripts_path=None, mane_transcripts_path=None): ds = hl.read_table(variants_path) + return annotate_transcript_consequences_in_table(ds, transcripts_path, mane_transcripts_path) + +def annotate_transcript_consequences_in_table(ds, transcripts_path=None, mane_transcripts_path=None): most_severe_consequence = ds.vep.most_severe_consequence transcript_consequences = ds.vep.transcript_consequences @@ -62,26 +67,25 @@ def annotate_transcript_consequences(variants_path, transcripts_path, mane_trans transcript_consequences = transcript_consequences.map(lambda c: c.select(*consequences)) - transcripts = hl.read_table(transcripts_path) - - # TODO: This can potentially be improved by removing Table.collect - # See https://hail.zulipchat.com/#narrow/stream/123010-Hail-0.2E2.20support/topic/Optimize.20annotation.20with.20small.20dataset - # and https://github.com/Nealelab/ukb_common/blob/ad94d20f8c9f3b711e40a473425925775f0b1f30/utils/generic.py#L18 - transcript_info = hl.dict( - [ - (row.transcript_id, row.transcript_info) - for row in transcripts.select( - transcript_info=hl.struct( - transcript_version=transcripts.transcript_version, - gene_version=transcripts.gene.gene_version, - ) - ).collect() - ] - ) - - transcript_consequences = transcript_consequences.map( - lambda csq: csq.annotate(**transcript_info.get(csq.transcript_id)) - ) + if transcripts_path != None: + transcripts = hl.read_table(transcripts_path) + # TODO: This can potentially be improved by removing Table.collect + # See https://hail.zulipchat.com/#narrow/stream/123010-Hail-0.2E2.20support/topic/Optimize.20annotation.20with.20small.20dataset + # and https://github.com/Nealelab/ukb_common/blob/ad94d20f8c9f3b711e40a473425925775f0b1f30/utils/generic.py#L18 + transcript_info = hl.dict( + [ + (row.transcript_id, row.transcript_info) + for row in transcripts.select( + transcript_info=hl.struct( + transcript_version=transcripts.transcript_version, + gene_version=transcripts.gene.gene_version, + ) + ).collect() + ] + ) + transcript_consequences = transcript_consequences.map( + lambda csq: csq.annotate(**transcript_info.get(csq.transcript_id)) + ) if mane_transcripts_path: mane_transcripts = hl.read_table(mane_transcripts_path) diff --git a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py index bf074c45b..1446aa58e 100644 --- a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py +++ b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py @@ -41,6 +41,9 @@ from data_pipeline.pipelines.gnomad_v4_cnvs import pipeline as gnomad_v4_cnvs_pipeline from data_pipeline.pipelines.gnomad_v4_lof_curation_results import pipeline as gnomad_v4_lof_curation_results_pipeline +from data_pipeline.pipelines.gene_patches import pipeline as gnomad_v4_gene_patches +from data_pipeline.pipelines.transcript_patches import pipeline as gnomad_v4_transcript_patches + logger = logging.getLogger("gnomad_data_pipeline") @@ -88,6 +91,15 @@ def add_liftover_document_id(ds): "block_size": 200, }, }, + "gene_patches": { + "get_table": lambda: hl.read_table(gnomad_v4_gene_patches.get_output("gene_patches").get_output_path()), + "args": { + "index": "genes_grch38_patches", + "index_fields": ["gene_id", "symbol_upper_case", "search_terms", "xstart", "xstop"], + "id_field": "gene_id", + "block_size": 200, + }, + }, ############################################################################################################## # Transcripts ############################################################################################################## @@ -109,6 +121,17 @@ def add_liftover_document_id(ds): "block_size": 1_000, }, }, + "transcripts_grch38_patched": { + "get_table": lambda: hl.read_table( + gnomad_v4_transcript_patches.get_output("transcripts_grch38_patched").get_output_path() + ), + "args": { + "index": "transcripts_grch38_patched", + "index_fields": ["transcript_id"], + "id_field": "transcript_id", + "block_size": 1_000, + }, + }, ############################################################################################################## # gnomAD v4 ############################################################################################################## @@ -133,6 +156,30 @@ def add_liftover_document_id(ds): "block_size": 1_000, }, }, + "gnomad_v4_variant_patches": { + "get_table": lambda: subset_table( + add_variant_document_id( + hl.read_table( + "gs://gnomad-browser-data-pipeline/phil-scratch/output/gnomad_v4/gnomad_v4_variants_patched.ht" + ) + ) + ), + "args": { + "index": "gnomad_v4_variants_patches", + "index_fields": [ + "document_id", + "variant_id", + "rsids", + "caid", + "locus", + "transcript_consequences.gene_id", + "transcript_consequences.transcript_id", + "vrs.alt.allele_id", + ], + "id_field": "document_id", + "block_size": 1_000, + }, + }, "gnomad_v4_exome_coverage": { "get_table": lambda: subset_table( hl.read_table(gnomad_v4_coverage_pipeline.get_output("exome_coverage").get_output_path()) diff --git a/data-pipeline/src/data_pipeline/pipelines/gene_patches.py b/data-pipeline/src/data_pipeline/pipelines/gene_patches.py new file mode 100644 index 000000000..022b18613 --- /dev/null +++ b/data-pipeline/src/data_pipeline/pipelines/gene_patches.py @@ -0,0 +1,17 @@ +from data_pipeline.pipeline import Pipeline, run_pipeline + +from data_pipeline.data_types.gene import patch_rnu4atac + +pipeline = Pipeline() + +pipeline.add_task( + "patch_rnu4atac_grch38", + patch_rnu4atac, + "/genes/genes_grch38_patched.ht", + {"genes_path": "gs://gnomad-v4-data-pipeline/output/genes/genes_grch38_annotated_6.ht"}, +) + +pipeline.set_outputs({"gene_patches": "patch_rnu4atac_grch38"}) + +if __name__ == "__main__": + run_pipeline(pipeline) diff --git a/data-pipeline/src/data_pipeline/pipelines/transcript_patches.py b/data-pipeline/src/data_pipeline/pipelines/transcript_patches.py new file mode 100644 index 000000000..fe332d898 --- /dev/null +++ b/data-pipeline/src/data_pipeline/pipelines/transcript_patches.py @@ -0,0 +1,28 @@ +from data_pipeline.pipeline import Pipeline, run_pipeline + +from data_pipeline.data_types.transcript import extract_transcripts +from data_pipeline.helpers import annotate_table + +pipeline = Pipeline() + +pipeline.add_task( + "extract_patched_transcripts", + extract_transcripts, + "/transcripts/transcripts_grch38_patched_base.ht", + {"genes_path": "gs://gnomad-browser-data-pipeline/phil-scratch/output/genes/genes_grch38_patched.ht"}, +) + +pipeline.add_task( + "annotate_patched_transcripts", + annotate_table, + "/transcripts/transcripts_grch38_annotated_1.ht", + { + "table_path": pipeline.get_task("extract_patched_transcripts"), + "gnomad_constraint": "gs://gnomad-v4-data-pipeline/output/constraint/gnomad_v4_constraint.ht", + }, +) + +pipeline.set_outputs({"transcripts_grch38_patched": "annotate_patched_transcripts"}) + +if __name__ == "__main__": + run_pipeline(pipeline) diff --git a/data-pipeline/src/data_pipeline/pipelines/variant_patches.py b/data-pipeline/src/data_pipeline/pipelines/variant_patches.py new file mode 100644 index 000000000..0762e6063 --- /dev/null +++ b/data-pipeline/src/data_pipeline/pipelines/variant_patches.py @@ -0,0 +1,20 @@ +from data_pipeline.pipeline import Pipeline, run_pipeline + +from data_pipeline.data_types.variant.patch_rnu4atac_variants import patch_rnu4atac_variants + +pipeline = Pipeline() + +pipeline.add_task( + "patch_rnu4atac_variants", + patch_rnu4atac_variants, + "/gnomad_v4/gnomad_v4_variants_patched.ht", + { + "vepped_path": "gs://gnomad-v4-data-pipeline/inputs/secondary-analyses/gnomad_v4.1.RNU4ATAC.vep115.ht", + "freq_path": "gs://gnomad-v4-data-pipeline/output/gnomad_v4/gnomad_v4_variants_annotated_4.ht", + }, +) + +pipeline.set_outputs({"variant_patches": "patch_rnu4atac_variants"}) + +if __name__ == "__main__": + run_pipeline(pipeline) diff --git a/graphql-api/src/elasticsearch.ts b/graphql-api/src/elasticsearch.ts index 7638d66c3..6ebd46380 100644 --- a/graphql-api/src/elasticsearch.ts +++ b/graphql-api/src/elasticsearch.ts @@ -82,8 +82,8 @@ const scheduleElasticsearchRequest = (fn: any) => { const limitedElastic = { indices: elastic.indices, clearScroll: elastic.clearScroll.bind(elastic), - search: (...args: Parameters) => - scheduleElasticsearchRequest(() => elastic.search(...args)).then((response) => { + search: (args: elasticsearch.RequestParams.Search) => + scheduleElasticsearchRequest(() => elastic.search(args)).then((response) => { // @ts-expect-error TS(2571) FIXME: Object is of type 'unknown'. if (response.body.timed_out) { throw new Error('Elasticsearch search timed out') @@ -95,8 +95,8 @@ const limitedElastic = { } return response }), - scroll: (...args: Parameters) => - scheduleElasticsearchRequest(() => elastic.scroll(...args)).then((response) => { + scroll: (args: { scroll: string; scrollId?: string }) => + scheduleElasticsearchRequest(() => elastic.scroll(args)).then((response) => { // @ts-expect-error TS(2571) FIXME: Object is of type 'unknown'. if (response.body.timed_out) { throw new Error('Elasticsearch scroll timed out') @@ -117,10 +117,22 @@ const limitedElastic = { } return response }), - get: (...args: Parameters) => - scheduleElasticsearchRequest(() => elastic.get(...args)), + get: (args: { index: string; type: '_doc'; id: string }) => + scheduleElasticsearchRequest(() => elastic.get(args)), mget: (...args: Parameters) => scheduleElasticsearchRequest(() => elastic.mget(...args)), } +export type LimitedElasticClient = typeof limitedElastic + +export type GetResponse = { + body: { _source: { value: Record } } +} + +export type SearchHit = { _id: string; _source: any } + +export type SearchResponse = { + body: { hits: { total: { value: number }; hits: SearchHit[] }; _scroll_id?: string } +} + export { limitedElastic as client } diff --git a/graphql-api/src/queries/gene-queries.ts b/graphql-api/src/queries/gene-queries.ts index ac7858c21..cc8b85a96 100644 --- a/graphql-api/src/queries/gene-queries.ts +++ b/graphql-api/src/queries/gene-queries.ts @@ -1,43 +1,62 @@ +import elasticsearch from '@elastic/elasticsearch' import { withCache } from '../cache' -import { fetchAllSearchResults } from './helpers/elasticsearch-helpers' +import { + fetchAllSearchResultsFromMultipleIndices, + getFromMultipleIndices, +} from './helpers/elasticsearch-helpers' -const GENE_INDICES = { - GRCh37: 'genes_grch37', - GRCh38: 'genes_grch38', -} +import { ReferenceGenome } from '@gnomad/dataset-metadata/metadata' +import { LimitedElasticClient, GetResponse, SearchResponse, SearchHit } from '../elasticsearch' -const _fetchGeneById = async (esClient: any, geneId: any, referenceGenome: any) => { - try { - const response = await esClient.get({ - // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message - index: GENE_INDICES[referenceGenome], - type: '_doc', - id: geneId, - }) +type GeneIndex = 'genes_grch37' | 'genes_grch38' | 'genes_grch38_patches-2025-10-23--19-35' - return response.body._source.value - } catch (err) { - // meta will not be present if the request times out in the queue before reaching ES - // @ts-expect-error TS(2571) FIXME: Object is of type 'unknown'. - if (err.meta && err.meta.body && err.meta.body.found === false) { - return null - } - throw err - } +type GeneSearchRegion = { reference_genome: ReferenceGenome; xstart: number; xstop: number } + +const GENE_INDICES: Record = { + // Order matters here: later indices take precedence over earlier + GRCh37: ['genes_grch37'], + GRCh38: ['genes_grch38', 'genes_grch38_patches-2025-10-23--19-35'], +} + +const _fetchGeneById = async ( + esClient: LimitedElasticClient, + geneId: string, + referenceGenome: ReferenceGenome +) => { + const indices = GENE_INDICES[referenceGenome] + const requests = indices.map( + (index) => + esClient + .get({ + index, + type: '_doc', + id: geneId, + }) + .catch((err) => { + // meta will not be present if the request times out in the queue before reaching ES + if (err.meta && err.meta.body && err.meta.body.found === false) { + return null + } + throw err + }) as Promise + ) + return getFromMultipleIndices(requests) } export const fetchGeneById = withCache( _fetchGeneById, - (_: any, geneId: any, referenceGenome: any) => `gene:${geneId}:${referenceGenome}`, + (_: any, geneId: string, referenceGenome: ReferenceGenome) => `gene:${geneId}:${referenceGenome}`, { expiration: 86400 } ) -export const fetchGeneBySymbol = async (esClient: any, geneSymbol: any, referenceGenome: any) => { - const response = await esClient.search({ - // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message - index: GENE_INDICES[referenceGenome], - type: '_doc', +export const fetchGeneBySymbol = async ( + esClient: LimitedElasticClient, + geneSymbol: string, + referenceGenome: ReferenceGenome +) => { + const indices = GENE_INDICES[referenceGenome] + const responses = await searchMultipleIndices(esClient, indices, { body: { query: { bool: { @@ -48,20 +67,22 @@ export const fetchGeneBySymbol = async (esClient: any, geneSymbol: any, referenc size: 1, }) - if (response.body.hits.total.value === 0) { + const responsesWithValue = responses.filter((response) => response.body.hits.total.value > 0) + if (responsesWithValue.length === 0) { return null } - return response.body.hits.hits[0]._source.value + return responsesWithValue[responsesWithValue.length - 1].body.hits.hits[0]._source.value } -export const fetchGenesByRegion = async (esClient: any, region: any) => { - const { reference_genome: referenceGenome, xstart, xstop } = region +export const fetchGenesByRegion = async ( + esClient: LimitedElasticClient, + region: GeneSearchRegion +) => { + const { reference_genome, xstart, xstop } = region + const indices = GENE_INDICES[reference_genome] - const hits = await fetchAllSearchResults(esClient, { - // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message - index: GENE_INDICES[referenceGenome], - type: '_doc', + const hits = await fetchAllSearchResultsFromMultipleIndices(esClient, indices, { size: 200, _source: [ 'value.exons', @@ -98,28 +119,76 @@ export const fetchGenesByRegion = async (esClient: any, region: any) => { }, }) - return hits.map((hit: any) => hit._source.value) + const mergedHits = mergeHitsById(hits.flat()) + return mergedHits.map((hit) => hit._source.value) +} + +const searchMultipleIndices = async ( + esClient: LimitedElasticClient, + indices: string[], + searchParams: elasticsearch.RequestParams.Search +): Promise => { + const requests = indices.map( + (index) => + esClient.search({ + index, + type: '_doc', + ...searchParams, + }) as Promise + ) + + return Promise.all(requests) +} + +const mergeHitsById = (hits: SearchHit[]): SearchHit[] => { + const ids: string[] = [] + const idsToHits: Record = {} + hits.forEach((hit) => { + if (idsToHits[hit._id] === undefined) { + ids.push(hit._id) + } + idsToHits[hit._id] = hit + }) + return ids.map((id) => idsToHits[id]) +} + +const mergeResponsesById = (responses: SearchResponse[]) => { + const ids: string[] = [] + const idsToDocs: Record = {} + responses.forEach((response) => + response.body.hits.hits.forEach((hit) => { + if (idsToDocs[hit._id] === undefined) { + ids.push(hit._id) + } + idsToDocs[hit._id] = hit._source + }) + ) + + return ids.map((id) => idsToDocs[id]) } -export const fetchGenesMatchingText = async (esClient: any, query: any, referenceGenome: any) => { +export const fetchGenesMatchingText = async ( + esClient: LimitedElasticClient, + query: string, + referenceGenome: ReferenceGenome +) => { const upperCaseQuery = query.toUpperCase() // Ensembl ID if (/^ENSG\d{11}$/.test(upperCaseQuery)) { const gene = await _fetchGeneById(esClient, upperCaseQuery, referenceGenome) - return [ - { - ensembl_id: gene.gene_id, - symbol: gene.symbol, - }, - ] + return ( + gene && [ + { + ensembl_id: gene.gene_id, + symbol: gene.symbol, + }, + ] + ) } // Symbol - const response = await esClient.search({ - // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message - index: GENE_INDICES[referenceGenome], - type: '_doc', + const responses = await searchMultipleIndices(esClient, GENE_INDICES[referenceGenome], { _source: ['gene_id', 'value.gene_version', 'value.symbol'], body: { query: { @@ -134,15 +203,16 @@ export const fetchGenesMatchingText = async (esClient: any, query: any, referenc size: 5, }) - if (response.body.hits.total.value === 0) { + const responsesWithValue = responses.filter((response) => response.body.hits.total.value !== 0) + if (responsesWithValue.length === 0) { return [] } - return response.body.hits.hits - .map((hit: any) => hit._source) - .map((doc: any) => ({ - ensembl_id: doc.gene_id, - ensembl_version: doc.value.gene_version, - symbol: doc.value.symbol, - })) + const mergedDocs = mergeResponsesById(responsesWithValue) + + return mergedDocs.map((doc) => ({ + ensembl_id: doc.gene_id, + ensembl_version: doc.value.gene_version, + symbol: doc.value.symbol, + })) } diff --git a/graphql-api/src/queries/helpers/elasticsearch-helpers.ts b/graphql-api/src/queries/helpers/elasticsearch-helpers.ts index 5ec797ddb..ab946c26c 100644 --- a/graphql-api/src/queries/helpers/elasticsearch-helpers.ts +++ b/graphql-api/src/queries/helpers/elasticsearch-helpers.ts @@ -1,49 +1,64 @@ +import elasticsearch from '@elastic/elasticsearch' +import { LimitedElasticClient, SearchResponse, SearchHit, GetResponse } from '../../elasticsearch' + /** * Search and then scroll to retrieve all pages of search results. * - * @param {elasticsearch.Client} client Elasticsearch client - * @param {Object} searchParams Argument to elasticsearch.Client#search - * @return {Object[]} Combined list of hits from all responses */ -export const fetchAllSearchResults = async (client: any, searchParams: any) => { - const allResults: any = [] - const responseQueue = [] +export const fetchAllSearchResults = async (client: LimitedElasticClient, searchParams: any) => { + const allResults: SearchHit[] = [] + const responseQueue: SearchResponse[] = [] const size = searchParams.size || 1000 const scroll = searchParams.scroll || '30s' responseQueue.push( - await client.search({ + await (client.search({ ...searchParams, scroll, size, - }) + }) as Promise) ) while (responseQueue.length) { - const response = responseQueue.shift() + const response = responseQueue.shift()! allResults.push(...response.body.hits.hits) if (allResults.length === response.body.hits.total.value) { // eslint-disable-next-line no-await-in-loop await client.clearScroll({ - scrollId: response.body._scroll_id, // eslint-disable-line no-underscore-dangle + scroll_id: response.body._scroll_id, // eslint-disable-line no-underscore-dangle }) break } responseQueue.push( // eslint-disable-next-line no-await-in-loop - await client.scroll({ + await (client.scroll({ scroll, scrollId: response.body._scroll_id, // eslint-disable-line no-underscore-dangle - }) + }) as Promise) ) } return allResults } +export const fetchAllSearchResultsFromMultipleIndices = async ( + esClient: LimitedElasticClient, + indices: string[], + searchParams: elasticsearch.RequestParams.Search +) => { + const requests = indices.map((index) => + fetchAllSearchResults(esClient, { + index, + type: '_doc', + ...searchParams, + }) + ) + return Promise.all(requests) +} + // Retrieve index metadata set by data pipeline export const fetchIndexMetadata = async (esClient: any, index: any) => { const response = await esClient.indices.getMapping({ @@ -54,3 +69,16 @@ export const fetchIndexMetadata = async (esClient: any, index: any) => { // eslint-disable-next-line no-underscore-dangle return Object.values(response.body)[0].mappings._meta } + +export const getFromMultipleIndices = (requests: Promise[]) => + Promise.all(requests).then( + (responses) => { + const responsesWithValue = responses.filter((response) => response !== null) + return responsesWithValue.length > 0 + ? responsesWithValue[responsesWithValue.length - 1]!.body._source.value + : null + }, + (err) => { + throw err + } + ) diff --git a/graphql-api/src/queries/transcript-queries.ts b/graphql-api/src/queries/transcript-queries.ts index e36b22969..e60ffd726 100644 --- a/graphql-api/src/queries/transcript-queries.ts +++ b/graphql-api/src/queries/transcript-queries.ts @@ -1,24 +1,39 @@ -const TRANSCRIPT_INDICES = { - GRCh37: 'transcripts_grch37', - GRCh38: 'transcripts_grch38', +import { ReferenceGenome } from '@gnomad/dataset-metadata/metadata' +import { GetResponse, LimitedElasticClient } from '../elasticsearch' +import { getFromMultipleIndices } from './helpers/elasticsearch-helpers' + +type TranscriptIndex = + | 'transcripts_grch37' + | 'transcripts_grch38' + | 'transcripts_grch38_patched-2025-10-23--19-36' + +const TRANSCRIPT_INDICES: Record = { + GRCh37: ['transcripts_grch37'], + GRCh38: ['transcripts_grch38', 'transcripts_grch38_patched-2025-10-23--19-36'], } -export const fetchTranscriptById = async (es: any, transcriptId: any, referenceGenome: any) => { - try { - const response = await es.get({ - // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message - index: TRANSCRIPT_INDICES[referenceGenome], - type: '_doc', - id: transcriptId, - }) +export const fetchTranscriptById = async ( + esClient: LimitedElasticClient, + transcriptId: string, + referenceGenome: ReferenceGenome +) => { + const indices = TRANSCRIPT_INDICES[referenceGenome] + const requests = indices.map( + (index) => + esClient + .get({ + index, + type: '_doc', + id: transcriptId, + }) + .catch((err) => { + // meta will not be present if the request times out in the queue before reaching ES + if (err.meta && err.meta.body.found === false) { + return null + } + throw err + }) as Promise + ) - return response.body._source.value - } catch (err) { - // meta will not be present if the request times out in the queue before reaching ES - // @ts-expect-error TS(2571) FIXME: Object is of type 'unknown'. - if (err.meta && err.meta.body.found === false) { - return null - } - throw err - } + return getFromMultipleIndices(requests) } diff --git a/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts b/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts index 46955fb37..5d96a23a4 100644 --- a/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts +++ b/graphql-api/src/queries/variant-datasets/gnomad-v4-variant-queries.ts @@ -5,7 +5,10 @@ import { isRsId } from '@gnomad/identifiers' import { UserVisibleError } from '../../errors' import { fetchLocalAncestryPopulationsByVariant } from '../local-ancestry-queries' -import { fetchAllSearchResults } from '../helpers/elasticsearch-helpers' +import { + fetchAllSearchResults, + fetchAllSearchResultsFromMultipleIndices, +} from '../helpers/elasticsearch-helpers' import { mergeOverlappingRegions } from '../helpers/region-helpers' import { fetchLofCurationResultsByVariant, @@ -16,10 +19,67 @@ import { import { getFlagsForContext } from './shared/flags' import { getConsequenceForContext } from './shared/transcriptConsequence' import largeGenes from '../helpers/large-genes' +import { LimitedElasticClient, SearchResponse } from '../../elasticsearch' const GNOMAD_V4_VARIANT_INDEX = 'gnomad_v4_variants' +const GNOMAD_V4_VARIANT_INDEX_PATCHES = 'gnomad_v4_variants_patches-2025-10-14--20-02' type Subset = 'all' | 'non_ukb' +type ESTranscriptConsequence = { + biotype: string + consequence_terms: string[] + gene_id: string + gene_symbol: string + gene_version: string + is_canonical: boolean + major_consequence: string + transcript_id: string + transcript_version: string +} +type ESPatch = { + variant_id: string + transcript_consequences: ESTranscriptConsequence[] +} + +const mergeTranscriptConsequences = ( + transcriptConsequences: ESTranscriptConsequence[], + patchedTranscriptConsequences?: ESTranscriptConsequence[] | null +) => { + if (!patchedTranscriptConsequences) { + return transcriptConsequences + } + + const result: ESTranscriptConsequence[] = [] + transcriptConsequences.forEach((csq) => { + const patchedConsequence = patchedTranscriptConsequences!.find( + (patchedCsq) => patchedCsq.transcript_id === csq.transcript_id + ) + result.push(patchedConsequence || csq) + }) + return result +} + +const mergeTranscriptConsequencesInVariant = ( + variant: { variant_id: string; transcript_consequences: ESTranscriptConsequence[] }, + patches: ESPatch[] +) => { + const matchingPatch = patches.find((patch) => patch.variant_id === variant.variant_id) + if (matchingPatch === undefined) { + return variant + } + + return { + ...variant, + transcript_consequences: mergeTranscriptConsequences( + variant.transcript_consequences, + matchingPatch.transcript_consequences + ), + } +} + +const hasPositiveAC = (variant: any, subset: string) => + (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) || + variant.exome.freq[subset].ac_raw > 0 // ================================================================================================ // Count query @@ -69,30 +129,50 @@ const chooseIdField = (variantId: string) => { return 'variant_id' } -const fetchVariantById = async (esClient: any, variantId: any, subset: Subset) => { +const fetchVariantById = async ( + esClient: LimitedElasticClient, + variantId: string, + subset: Subset +) => { const idField = chooseIdField(variantId) - const response = await esClient.search({ + const query = { + bool: { + filter: { term: { [idField]: variantId } }, + }, + } + + const variantResponsePromise = esClient.search({ index: GNOMAD_V4_VARIANT_INDEX, body: { - query: { - bool: { - filter: { term: { [idField]: variantId } }, - }, - }, + query, }, size: 1, - }) + }) as Promise + const patchResponsePromise = esClient.search({ + index: GNOMAD_V4_VARIANT_INDEX_PATCHES, + body: { query }, + size: 1, + }) as Promise - if (response.body.hits.total.value === 0) { + const variantResponse = await variantResponsePromise + + if (variantResponse.body.hits.total.value === 0) { throw new UserVisibleError('Variant not found') } // An rsID may match multiple variants - if (response.body.hits.total.value > 1) { + if (variantResponse.body.hits.total.value > 1) { throw new UserVisibleError('Multiple variants found, query using variant ID to select one.') } - const variant = response.body.hits.hits[0]._source.value + const patchResponse = await patchResponsePromise + const patchedTranscriptConsequences = + patchResponse.body.hits.total.value > 0 + ? (patchResponse.body.hits.hits[0]._source.value + .transcript_consequences as ESTranscriptConsequence[]) + : null + + const variant = variantResponse.body.hits.hits[0]._source.value const subsetGenomeFreq = variant.genome.freq.all || {} const subsetJointFreq = variant.joint.freq[subset] || {} @@ -244,9 +324,10 @@ const fetchVariantById = async (esClient: any, variantId: any, subset: Subset) = flags: variantFlags, // TODO: Include RefSeq transcripts once the browser supports them. lof_curations: lofCurationResults, - transcript_consequences: (variant.transcript_consequences || []).filter((csq: any) => - csq.gene_id.startsWith('ENSG') - ), + transcript_consequences: mergeTranscriptConsequences( + variant.transcript_consequences, + patchedTranscriptConsequences + ).filter((csq: any) => csq.gene_id.startsWith('ENSG')), in_silico_predictors: inSilicoPredictorsList, } @@ -454,28 +535,30 @@ const fetchVariantsByGene = async (esClient: any, gene: any, subset: Subset) => }, })) - const hits = await fetchAllSearchResults(esClient, { - index: GNOMAD_V4_VARIANT_INDEX, - type: '_doc', - size: pageSize, - _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset), - body: { - query: { - bool: { - filter: [{ term: { gene_id: gene.gene_id } }, { bool: { should: rangeQueries } }], + const [hits, consequencePatchHits] = await fetchAllSearchResultsFromMultipleIndices( + esClient, + [GNOMAD_V4_VARIANT_INDEX, GNOMAD_V4_VARIANT_INDEX_PATCHES], + { + type: '_doc', + size: pageSize, + _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset), + body: { + query: { + bool: { + filter: [{ term: { gene_id: gene.gene_id } }, { bool: { should: rangeQueries } }], + }, }, + sort: [{ 'locus.position': { order: 'asc' } }], }, - sort: [{ 'locus.position': { order: 'asc' } }], - }, - }) + } + ) + + const consequencePatches: ESPatch[] = consequencePatchHits.map((hit) => hit._source.value) const shapedHits = hits .map((hit: any) => hit._source.value) - .filter( - (variant: any) => - (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) || - variant.exome.freq[subset].ac_raw > 0 - ) + .filter((variant) => hasPositiveAC(variant, subset)) + .map((variant) => mergeTranscriptConsequencesInVariant(variant, consequencePatches)) .map(shapeVariantSummary(subset, { type: 'gene', geneId: gene.gene_id })) const lofCurationResults = await fetchLofCurationResultsByGene(esClient, 'v4', gene) @@ -507,38 +590,40 @@ const fetchVariantsByRegion = async (esClient: any, region: any, subset: Subset) const genomeSubset = 'all' const jointSubset = 'all' - const hits = await fetchAllSearchResults(esClient, { - index: GNOMAD_V4_VARIANT_INDEX, - type: '_doc', - size: 10000, - _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset), - body: { - query: { - bool: { - filter: [ - { term: { 'locus.contig': `chr${region.chrom}` } }, - { - range: { - 'locus.position': { - gte: region.start, - lte: region.stop, + const [hits, consequencePatchHits] = await fetchAllSearchResultsFromMultipleIndices( + esClient, + [GNOMAD_V4_VARIANT_INDEX, GNOMAD_V4_VARIANT_INDEX_PATCHES], + { + type: '_doc', + size: 10000, + _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset), + body: { + query: { + bool: { + filter: [ + { term: { 'locus.contig': `chr${region.chrom}` } }, + { + range: { + 'locus.position': { + gte: region.start, + lte: region.stop, + }, }, }, - }, - ], + ], + }, }, + sort: [{ 'locus.position': { order: 'asc' } }], }, - sort: [{ 'locus.position': { order: 'asc' } }], - }, - }) + } + ) + + const consequencePatches: ESPatch[] = consequencePatchHits.map((hit) => hit._source.value) const variants = hits .map((hit: any) => hit._source.value) - .filter( - (variant: any) => - (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) || - variant.exome.freq[subset].ac_raw > 0 - ) + .filter((variant) => hasPositiveAC(variant, subset)) + .map((variant) => mergeTranscriptConsequencesInVariant(variant, consequencePatches)) .map(shapeVariantSummary(subset, { type: 'region' })) const lofCurationResults = await fetchLofCurationResultsByRegion(esClient, 'v4', region) @@ -599,31 +684,33 @@ const fetchVariantsByTranscript = async (esClient: any, transcript: any, subset: }, })) - const hits = await fetchAllSearchResults(esClient, { - index: GNOMAD_V4_VARIANT_INDEX, - type: '_doc', - size: 10000, - _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset), - body: { - query: { - bool: { - filter: [ - { term: { transcript_id: transcript.transcript_id } }, - { bool: { should: rangeQueries } }, - ], + const [hits, consequencePatchHits] = await fetchAllSearchResultsFromMultipleIndices( + esClient, + [GNOMAD_V4_VARIANT_INDEX, GNOMAD_V4_VARIANT_INDEX_PATCHES], + { + type: '_doc', + size: 10000, + _source: getMultiVariantSourceFields(exomeSubset, genomeSubset, jointSubset), + body: { + query: { + bool: { + filter: [ + { term: { transcript_id: transcript.transcript_id } }, + { bool: { should: rangeQueries } }, + ], + }, }, + sort: [{ 'locus.position': { order: 'asc' } }], }, - sort: [{ 'locus.position': { order: 'asc' } }], - }, - }) + } + ) + + const consequencePatches: ESPatch[] = consequencePatchHits.map((hit) => hit._source.value) return hits .map((hit: any) => hit._source.value) - .filter( - (variant: any) => - (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) || - variant.exome.freq[subset].ac_raw > 0 - ) + .filter((variant) => hasPositiveAC(variant, subset)) + .map((variant) => mergeTranscriptConsequencesInVariant(variant, consequencePatches)) .map( shapeVariantSummary(subset, { type: 'transcript', transcriptId: transcript.transcript_id }) ) @@ -665,11 +752,7 @@ const fetchMatchingVariants = async ( return hits .map((hit: any) => hit._source.value) - .filter( - (variant: any) => - (variant.genome.freq.all && variant.genome.freq.all.ac_raw > 0) || - variant.exome.freq[subset].ac_raw > 0 - ) + .filter((variant) => hasPositiveAC(variant, subset)) .map((variant: any) => ({ variant_id: variant.variant_id, }))