diff --git a/examples/input_examples/search_dna_demo.jsonl b/examples/input_examples/search_dna_demo.jsonl index f423e1c1..ad5ac101 100644 --- a/examples/input_examples/search_dna_demo.jsonl +++ b/examples/input_examples/search_dna_demo.jsonl @@ -1,4 +1,4 @@ -{"type": "text", "content": "NG_033923"} -{"type": "text", "content": "NG_056118"} -{"type": "text", "content": ">query\nACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} -{"type": "text", "content": "ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"type": "dna", "content": "NG_033923"} +{"type": "dna", "content": "NG_056118"} +{"type": "dna", "content": 
">query\nACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"type": "dna", "content": "ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} diff --git a/examples/input_examples/search_protein_demo.jsonl b/examples/input_examples/search_protein_demo.jsonl index e119cec8..f588c6d7 100644 --- a/examples/input_examples/search_protein_demo.jsonl +++ b/examples/input_examples/search_protein_demo.jsonl @@ -1,14 +1,14 @@ -{"type": "text", "content": "P01308"} -{"type": "text", "content": "P68871"} -{"type": "text", "content": "P02768"} -{"type": "text", "content": "P04637"} -{"type": "text", "content": "insulin"} -{"type": "text", "content": "hemoglobin"} -{"type": "text", "content": "p53"} -{"type": "text", "content": "BRCA1"} -{"type": "text", "content": "albumin"} -{"type": "text", "content": 
"MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} -{"type": "text", "content": "MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} -{"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} -{"type": "text", "content": "MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKTTVIALREIEEGLINNQILDVRERQEQQEQEAAELQAVTAIAEGRR"} -{"type": "text", "content": "GSHMLCAISGKVPRRPVLSPKSRTIFEKSLLEQYVKDTGNDPITNEPLSIEEIVEIVPSAQ"} +{"type": "protein", "content": "P01308"} +{"type": "protein", "content": "P68871"} +{"type": "protein", "content": "P02768"} +{"type": "protein", "content": "P04637"} +{"type": "protein", "content": "insulin"} +{"type": "protein", "content": "hemoglobin"} +{"type": "protein", "content": "p53"} +{"type": "protein", "content": "BRCA1"} +{"type": "protein", "content": "albumin"} +{"type": "protein", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +{"type": "protein", "content": 
"MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} +{"type": "protein", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} +{"type": "protein", "content": "MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKTTVIALREIEEGLINNQILDVRERQEQQEQEAAELQAVTAIAEGRR"} +{"type": "protein", "content": "GSHMLCAISGKVPRRPVLSPKSRTIFEKSLLEQYVKDTGNDPITNEPLSIEEIVEIVPSAQ"} diff --git a/examples/input_examples/search_rna_demo.jsonl b/examples/input_examples/search_rna_demo.jsonl index 896473e2..f27258e4 100644 --- a/examples/input_examples/search_rna_demo.jsonl +++ b/examples/input_examples/search_rna_demo.jsonl @@ -1,8 +1,8 @@ -{"type": "text", "content": "hsa-let-7a-1"} -{"type": "text", "content": "XIST regulator"} -{"type": "text", "content": "URS0000123456"} -{"type": "text", "content": "URS0000000001"} -{"type": "text", "content": "URS0000000787"} -{"type": "text", "content": "GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} -{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} -{"type": "text", "content": 
"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"type": "rna", "content": "hsa-let-7a-1"} +{"type": "rna", "content": "XIST regulator"} +{"type": "rna", "content": "URS0000123456"} +{"type": "rna", "content": "URS0000000001"} +{"type": "rna", "content": "URS0000000787"} +{"type": "rna", "content": "GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} +{"type": "rna", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"type": "rna", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} diff --git a/examples/input_examples/searched_dna_demo.jsonl b/examples/input_examples/searched_dna_demo.jsonl new file mode 100644 index 00000000..05778743 --- /dev/null +++ b/examples/input_examples/searched_dna_demo.jsonl @@ -0,0 +1,3 @@ +{"_doc_id":"doc-NG_011079","type":"dna","content":"Title: Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1\nSequence: 
ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_011079","gene_name":"RPL35AP6","gene_description":"ribosomal protein L35a pseudogene 6","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_011079","gene_synonyms":["RPL35A_3_191"],"gene_type":"other","chromosome":"1","genomic_location":"1-522","function":null,"title":"Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1","sequence":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","sequence_length":522,"gene_id":"100271312","molecule_type_detail":"genomic 
region","_search_query":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"_doc_id":"doc-NG_033923","type":"dna","content":"Title: Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 2\nSequence: GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_033923","gene_name":"IRGMP","gene_description":"immunity-related GTPase family, M, pseudogene","organism":"Callithrix jacchus","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_033923","gene_synonyms":null,"gene_type":"other","chromosome":"2","genomic_location":"1-830","function":null,"title":"Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 
2","sequence":"GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","sequence_length":830,"gene_id":"100409682","molecule_type_detail":"genomic region","_search_query":"NG_033923"} +{"_doc_id":"doc-NG_056118","type":"dna","content":"Title: Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6\nSequence: 
GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCA
ACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCCTGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTTTTATTTAT","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_056118","gene_name":"HLA-DRB8","gene_description":"major histocompatibility complex, class II, DR beta 8 (pseudogene)","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_056118","gene_synonyms":null,"gene_type":"other","chromosome":"6","genomic_location":"1-2737","function":null,"title":"Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 
6","sequence":"GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACAC
TCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCCTGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTTTTATTTAT","sequence_length":2737,"gene_id":"3130","molecule_type_detail":"genomic region","_search_query":"NG_056118"} diff --git a/examples/input_examples/searched_protein_demo.jsonl b/examples/input_examples/searched_protein_demo.jsonl new file mode 100644 index 00000000..47ab02ad --- /dev/null +++ b/examples/input_examples/searched_protein_demo.jsonl @@ -0,0 +1,8 @@ +{"_doc_id":"doc-P01308","type":"protein","content":"Function: ['Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver.']\nSequence: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P01308","entry_name":"INS_HUMAN","gene_names":[{"Name":"INS"}],"protein_name":"Insulin","organism":"Homo sapiens","sequence":"MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","function":["Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. 
It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver."],"url":"https:\/\/www.uniprot.org\/uniprot\/P01308","_search_query":"P01308"} +{"_doc_id":"doc-Q6UWZ7","type":"protein","content":"Function: [\"Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. {ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}.\"]\nSequence: MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q6UWZ7","entry_name":"ABRX1_HUMAN","gene_names":[{"Name":"ABRAXAS1 {ECO:0000312|HGNC:HGNC:25829}","Synonyms":["ABRA1 {ECO:0000312|HGNC:HGNC:25829}","CCDC98","FAM175A {ECO:0000312|HGNC:HGNC:25829}"],"ORFNames":["UNQ496\/PRO1013"]}],"protein_name":"BRCA1-A complex subunit Abraxas 1 {ECO:0000312|HGNC:HGNC:25829}","organism":"Homo 
sapiens","sequence":"MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","function":["Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. {ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q6UWZ7","_search_query":"BRCA1"} +{"_doc_id":"doc-P27355","type":"protein","content":"Function: ['Responsible for the initial oxygenation of methane to methanol in methanotrophs. 
It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds.']\nSequence: MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P27355","entry_name":"MEMG_METTR","gene_names":[{"Name":"mmoZ"}],"protein_name":"Methane monooxygenase component A gamma chain","organism":"Methylosinus trichosporium.","sequence":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","function":["Responsible for the initial oxygenation of methane to methanol in methanotrophs. It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds."],"url":"https:\/\/www.uniprot.org\/uniprot\/P27355","_search_query":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} +{"_doc_id":"doc-Q96GG9","type":"protein","content":"Function: ['Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). 
Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). {ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}.']\nSequence: MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q96GG9","entry_name":"DCNL1_HUMAN","gene_names":[{"Name":"DCUN1D1 {ECO:0000312|HGNC:HGNC:18184}","Synonyms":["DCN1 {ECO:0000303|PubMed:28581483}","DCUN1L1","RP42","SCCRO"]}],"protein_name":"DCN1-like protein 1 {ECO:0000305}","organism":"Homo sapiens","sequence":"MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","function":["Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). 
Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). {ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q96GG9","_search_query":"MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} +{"_doc_id":"doc-P68871","type":"protein","content":"Function: ['Involved in oxygen transport from the lung to the various peripheral tissues. {ECO:0000269|PubMed:28066926}.', 'LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.', '[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation.']\nSequence: MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P68871","entry_name":"HBB_HUMAN","gene_names":[{"Name":"HBB"}],"protein_name":"Hemoglobin subunit beta","organism":"Homo sapiens","sequence":"MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","function":["Involved in oxygen transport from the lung to the various peripheral tissues. 
{ECO:0000269|PubMed:28066926}.","LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.","[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation."],"url":"https:\/\/www.uniprot.org\/uniprot\/P68871","_search_query":"P68871"} +{"_doc_id":"doc-P22939","type":"protein","content":"Sequence: MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P22939","entry_name":"ISPA_ECOLI","gene_names":[{"Name":"ispA","OrderedLocusNames":["b0421","JW0411"]}],"protein_name":"Farnesyl diphosphate synthase","organism":"Escherichia coli","sequence":"MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","function":[],"url":"https:\/\/www.uniprot.org\/uniprot\/P22939","_search_query":"MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +{"_doc_id":"doc-Q8I8V0","type":"protein","content":"Function: ['Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in 
acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.', '[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.', '[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. 
{ECO:0000269|PubMed:30559249}.']\nSequence: MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q8I8V0","entry_name":"TAD2B_DROME","gene_names":[{"Name":"Ada2b {ECO:0000312|FlyBase:FBgn0037555}","Synonyms":["Ada2S {ECO:0000303|PubMed:12697829}"],"ORFNames":["CG9638 {ECO:0000312|FlyBase:FBgn0037555}"]}],"protein_name":"Transcriptional adapter 2b {ECO:0000312|FlyBase:FBgn0037555}","organism":"Drosophila melanogaster","sequence":"MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","function":["Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). 
Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.","[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.","[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q8I8V0","_search_query":"p53"} +{"_doc_id":"doc-P04637","type":"protein","content":"Function: ['Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, PubMed:9840937). Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). 
Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). 
{ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}.']\nSequence: MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P04637","entry_name":"P53_HUMAN","gene_names":[{"Name":"TP53","Synonyms":["P53"]}],"protein_name":"Cellular tumor antigen p53","organism":"Homo sapiens","sequence":"MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","function":["Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, 
PubMed:9840937). Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. 
Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). {ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}."],"url":"https:\/\/www.uniprot.org\/uniprot\/P04637","_search_query":"P04637"} diff --git a/examples/input_examples/searched_rna_demo.jsonl b/examples/input_examples/searched_rna_demo.jsonl new file mode 100644 index 00000000..9ad088c0 --- /dev/null +++ b/examples/input_examples/searched_rna_demo.jsonl @@ -0,0 +1,6 @@ +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 
species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000123456"} +{"_doc_id":"doc-URS00000088CC","type":"rna","content":"Description: lncRNA from 1 species\nSequence: GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS00000088CC","rnacentral_id":"URS00000088CC","sequence":"GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","sequence_length":210,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS00000088CC","organism":"Homo sapiens","related_genes":["ENSG00000265458.1","lnc-C17orf62-1","ENSG00000265458","NONHSAG023099","HSALNG0119438","NONHSAG023099.2","ENSG00000265458.4","RP13-20L14.6","NARF-AS2"],"gene_name":"ENSG00000265458, ENSG00000265458.1, ENSG00000265458.4, HSALNG0119438, NARF-AS2, NONHSAG023099, NONHSAG023099.2, RP13-20L14.6, lnc-C17orf62-1","so_term":"antisense, ncRNA","modifications":null,"_search_query":"GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} +{"_doc_id":"doc-URS000342178E","type":"rna","content":"Description: None misc RNA\nSequence: 
GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS000342178E","rnacentral_id":"URS000342178E","sequence":"GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","sequence_length":73,"rna_type":"misc_RNA","description":"None misc RNA","url":"https:\/\/rnacentral.org\/rna\/URS000342178E","organism":null,"related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"XIST regulator"} +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"_doc_id":"doc-URS0000000787","type":"rna","content":"Description: lncRNA from 1 species\nSequence: 
AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000787","rnacentral_id":"URS0000000787","sequence":"AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","sequence_length":576,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000787","organism":"Homo sapiens","related_genes":["KB-1183D5.13","lnc-GGT2-26","ENSG00000206142.10","ENSG00000206142.9","NONHSAG033362.2","FAM230H","NONHSAG033362","lnc-GGT2-4","ENSG00000206142","lnc-GGT2-2","HSALNG0134219"],"gene_name":"ENSG00000206142, ENSG00000206142.10, ENSG00000206142.9, FAM230H, HSALNG0134219, KB-1183D5.13, NONHSAG033362, NONHSAG033362.2, lnc-GGT2-2, lnc-GGT2-26, lnc-GGT2-4","so_term":"lincRNA, ncRNA","modifications":null,"_search_query":"URS0000000787"} +{"_doc_id":"doc-URS0000000001","type":"rna","content":"Description: rRNA from 1 
species\nSequence: AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000001","rnacentral_id":"URS0000000001","sequence":"AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","sequence_length":200,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000001","organism":"uncultured bacterium","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000000001"} diff --git a/examples/search/build_db/build_protein_blast_db.sh b/examples/search/build_db/build_protein_blast_db.sh deleted file mode 100755 index 9292875a..00000000 --- a/examples/search/build_db/build_protein_blast_db.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -set -e - -# Downloads the latest release of UniProt, putting it in a release-specific directory. -# Creates associated BLAST databases. 
-# We need makeblastdb on our PATH -# For Ubuntu/Debian: sudo apt install ncbi-blast+ -# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ -# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ - -# Better to use a stable DOWNLOAD_TMP name to support resuming downloads -DOWNLOAD_TMP=_downloading -mkdir -p ${DOWNLOAD_TMP} -cd ${DOWNLOAD_TMP} - -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink" - -# Extract the release name (like 2017_10 or 2017_1) -# Use sed for cross-platform compatibility (works on both macOS and Linux) -RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1) - -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/README" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/LICENSE" - -cd .. - -mkdir ${RELEASE} -mv ${DOWNLOAD_TMP}/* ${RELEASE} -rmdir ${DOWNLOAD_TMP} - -cd ${RELEASE} - -gunzip uniprot_sprot.fasta.gz -gunzip uniprot_trembl.fasta.gz - -cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta - -makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} -makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot -makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl - -cd .. - -echo "BLAST databases created successfully!" 
-echo "Database locations:" -echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" -echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" -echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" -echo "" -echo "To use these databases, set in your config:" -echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" - diff --git a/examples/search/build_db/build_rna_blast_db.sh b/examples/search/build_db/build_rna_blast_db.sh deleted file mode 100755 index 26e1cd33..00000000 --- a/examples/search/build_db/build_rna_blast_db.sh +++ /dev/null @@ -1,219 +0,0 @@ -#!/bin/bash - -set -e - -# Downloads RNAcentral sequences and creates BLAST databases. -# This script downloads the RNAcentral active database, which is the same -# data source used for online RNAcentral searches, ensuring consistency -# between local and online search results. -# -# RNAcentral is a comprehensive database of non-coding RNA sequences that -# integrates data from multiple expert databases including RefSeq, Rfam, etc. -# -# Usage: ./build_rna_blast_db.sh [all|list|database_name] -# all (default): Download complete active database (~8.4G compressed) -# list: List all available database subsets -# database_name: Download specific database subset (e.g., refseq, rfam, mirbase) -# -# Available database subsets (examples): -# - refseq.fasta (~98M): RefSeq RNA sequences -# - rfam.fasta (~1.5G): Rfam RNA families -# - mirbase.fasta (~10M): microRNA sequences -# - ensembl.fasta (~2.9G): Ensembl annotations -# - See "list" option for complete list -# -# The complete "active" database contains all sequences from all expert databases. -# Using a specific database subset provides a smaller, focused database. 
-# -# We need makeblastdb on our PATH -# For Ubuntu/Debian: sudo apt install ncbi-blast+ -# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ -# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ - -# RNAcentral HTTP base URL (using HTTPS for better reliability) -RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral" -RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release" -RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences" -RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database" - -# Parse command line argument -DB_SELECTION=${1:-all} - -# List available databases if requested -if [ "${DB_SELECTION}" = "list" ]; then - echo "Available RNAcentral database subsets:" - echo "" - echo "Fetching list from RNAcentral FTP..." - listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/") - echo "${listing}" | \ - grep -oE '' | \ - sed 's///' | \ - sort | \ - while read db; do - size=$(echo "${listing}" | grep -A 1 "${db}" | grep -oE '[0-9.]+[GMK]' | head -1 || echo "unknown") - echo " - ${db%.fasta}: ${size}" - done - echo "" - echo "Usage: $0 [database_name]" - echo " Example: $0 refseq # Download only RefSeq sequences (~98M)" - echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)" - echo " Example: $0 all # Download complete active database (~8.4G)" - exit 0 -fi - -# Better to use a stable DOWNLOAD_TMP name to support resuming downloads -DOWNLOAD_TMP=_downloading_rnacentral -mkdir -p ${DOWNLOAD_TMP} -cd ${DOWNLOAD_TMP} - -# Get RNAcentral release version from release notes -echo "Getting RNAcentral release information..." 
-RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt" -RELEASE_NOTES="release_notes.txt" -wget -q "${RELEASE_NOTES_URL}" 2>/dev/null || { - echo "Warning: Could not download release notes, using current date as release identifier" - RELEASE=$(date +%Y%m%d) -} - -if [ -f "${RELEASE_NOTES}" ]; then - # Try to extract version from release notes (first line usually contains version info) - RELEASE=$(head -1 "${RELEASE_NOTES}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.') -fi - -if [ -z "${RELEASE}" ]; then - RELEASE=$(date +%Y%m%d) - echo "Using date as release identifier: ${RELEASE}" -else - echo "RNAcentral release: ${RELEASE}" -fi - -# Download RNAcentral FASTA file -if [ "${DB_SELECTION}" = "all" ]; then - # Download complete active database - FASTA_FILE="rnacentral_active.fasta.gz" - DB_NAME="rnacentral" - echo "Downloading RNAcentral active sequences (~8.4G)..." - echo " Contains sequences currently present in at least one expert database" - echo " Uses standard URS IDs (e.g., URS000149A9AF)" - echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency" - FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}" - IS_COMPRESSED=true -else - # Download specific database subset - DB_NAME="${DB_SELECTION}" - FASTA_FILE="${DB_SELECTION}.fasta" - echo "Downloading RNAcentral database subset: ${DB_SELECTION}" - echo " This is a subset of the active database from a specific expert database" - echo " File: ${FASTA_FILE}" - FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}" - IS_COMPRESSED=false - - # Check if database exists - if ! curl -s -o /dev/null -w "%{http_code}" "${FASTA_URL}" | grep -q "200"; then - echo "Error: Database '${DB_SELECTION}' not found" - echo "Run '$0 list' to see available databases" - exit 1 - fi -fi - -echo "Downloading from: ${FASTA_URL}" -echo "This may take a while depending on your internet connection..." -if [ "${DB_SELECTION}" = "all" ]; then - echo "File size is approximately 8-9GB, please be patient..." 
-else - echo "Downloading database subset..." -fi -wget -c --progress=bar:force "${FASTA_URL}" 2>&1 || { - echo "Error: Failed to download RNAcentral FASTA file" - echo "Please check your internet connection and try again" - echo "You can also try downloading manually from: ${FASTA_URL}" - exit 1 -} - -if [ ! -f "${FASTA_FILE}" ]; then - echo "Error: Downloaded file not found" - exit 1 -fi - -cd .. - -# Create release directory -if [ "${DB_SELECTION}" = "all" ]; then - OUTPUT_DIR="rnacentral_${RELEASE}" -else - OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}" -fi -mkdir -p ${OUTPUT_DIR} -mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true -rmdir ${DOWNLOAD_TMP} 2>/dev/null || true - -cd ${OUTPUT_DIR} - -# Extract FASTA file if compressed -echo "Preparing RNAcentral sequences..." -if [ -f "${FASTA_FILE}" ]; then - if [ "${IS_COMPRESSED}" = "true" ]; then - echo "Decompressing ${FASTA_FILE}..." - OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" - gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || { - echo "Error: Failed to decompress FASTA file" - exit 1 - } - # Optionally remove the compressed file to save space - # rm "${FASTA_FILE}" - else - # File is not compressed, just copy/rename - OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" - cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || { - echo "Error: Failed to copy FASTA file" - exit 1 - } - fi -else - echo "Error: FASTA file not found" - exit 1 -fi - -# Check if we have sequences -if [ ! -s "${OUTPUT_FASTA}" ]; then - echo "Error: FASTA file is empty" - exit 1 -fi - -# Get file size for user information -FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1) -echo "FASTA file size: ${FILE_SIZE}" - -echo "Creating BLAST database..." -# Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) -# Note: RNAcentral uses RNAcentral IDs (URS...) 
as sequence identifiers, -# which matches the format expected by the RNACentralSearch class -DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" -makeblastdb -in "${OUTPUT_FASTA}" \ - -out "${DB_OUTPUT_NAME}" \ - -dbtype nucl \ - -parse_seqids \ - -title "RNAcentral_${DB_NAME}_${RELEASE}" - -echo "" -echo "BLAST database created successfully!" -echo "Database location: $(pwd)/${DB_OUTPUT_NAME}" -echo "" -echo "To use this database, set in your config (search_rna_config.yaml):" -echo " rnacentral_params:" -echo " use_local_blast: true" -echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}" -echo "" -echo "Note: The database files are:" -ls -lh ${DB_OUTPUT_NAME}.* | head -5 -echo "" -if [ "${DB_SELECTION}" = "all" ]; then - echo "This database uses RNAcentral IDs (URS...), which matches the online" - echo "RNAcentral search API, ensuring consistent results between local and online searches." -else - echo "This is a subset database from ${DB_SELECTION} expert database." - echo "For full coverage matching online API, use 'all' option." -fi - -cd .. - diff --git a/examples/search/search_dna.sh b/examples/search/search_dna.sh deleted file mode 100644 index d3c0d6ec..00000000 --- a/examples/search/search_dna.sh +++ /dev/null @@ -1,2 +0,0 @@ -python3 -m graphgen.run \ ---config_file graphgen/configs/search_dna_config.yaml diff --git a/examples/search/search_dna/README.md b/examples/search/search_dna/README.md new file mode 100644 index 00000000..0cc8ebd1 --- /dev/null +++ b/examples/search/search_dna/README.md @@ -0,0 +1,84 @@ +# Search DNA Sequences + +This example demonstrates how to search DNA sequences from NCBI RefSeq database using BLAST. + +## Overview + +The DNA search pipeline reads DNA sequence queries and searches against NCBI RefSeq database to find similar sequences and retrieve associated metadata. + +## Quick Start + +### 1. 
Build Local BLAST Database (Optional) + +If you want to use local BLAST for faster searches, first build the database: + +```bash +./build_db.sh [human_mouse_drosophila_yeast|representative|complete|all] +``` + +Options: +- `human_mouse_drosophila_yeast`: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest) +- `representative`: Download genomic sequences from major categories (recommended, smaller) +- `complete`: Download all complete genomic sequences from complete/ directory (very large) +- `all`: Download all genomic sequences from all categories (very large) + +The script will create a BLAST database in `refseq_${RELEASE}/` directory. + +### 2. Configure Search Parameters + +Edit `search_dna_config.yaml` to set: + +- **Input file path**: Set the path to your DNA sequence queries +- **NCBI parameters**: + - `email`: Your email address (required by NCBI) + - `tool`: Tool name for NCBI API + - `use_local_blast`: Set to `true` if you have a local BLAST database + - `local_blast_db`: Path to your local BLAST database (without .nhr extension) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_dna_demo.jsonl + +data_sources: [ncbi] +ncbi_params: + email: your_email@example.com # Required! + tool: GraphGen + use_local_blast: true + local_blast_db: refseq_release/refseq_release +``` + +### 3. Run the Search + +```bash +./search_dna.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_dna/search_dna_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with DNA sequence queries: + +```jsonl +{"type": "dna", "content": "BRCA1"} +{"type": "dna", "content": ">query\nATGCGATCG..."} +{"type": "dna", "content": "ATGCGATCG..."} +``` + +## Output + +The search results will be saved in the output directory with matched sequences and metadata from NCBI RefSeq. 
+ +## Notes + +- **NCBI requires an email address** - Make sure to set `email` in `ncbi_params` +- **Local BLAST** provides faster searches and doesn't require an internet connection during search +- The local BLAST database can be very large (several GB to TB depending on the download type) +- Adjust `max_concurrent` based on your system resources and API rate limits diff --git a/examples/search/build_db/build_dna_blast_db.sh b/examples/search/search_dna/build_db.sh similarity index 54% rename from examples/search/build_db/build_dna_blast_db.sh rename to examples/search/search_dna/build_db.sh index 1928d7d0..8c281b40 100755 --- a/examples/search/build_db/build_dna_blast_db.sh +++ b/examples/search/search_dna/build_db.sh @@ -24,8 +24,8 @@ set -e # - {category}.{number}.genomic.fna.gz (基因组序列) # - {category}.{number}.rna.fna.gz (RNA序列) # -# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all] -# human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest) +# Usage: ./build_db.sh [human_mouse_drosophila_yeast|representative|complete|all] +# human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest) # representative: Download genomic sequences from major categories (recommended, smaller) # Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi # complete: Download all complete genomic sequences from complete/ directory (very large) @@ -36,7 +36,7 @@ set -e # For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ # Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ -DOWNLOAD_TYPE=${1:-human_mouse} +DOWNLOAD_TYPE=${1:-human_mouse_drosophila_yeast} # Better to use a stable DOWNLOAD_TMP name to support resuming downloads DOWNLOAD_TMP=_downloading_dna @@ -58,17 +58,49 @@ else echo "Using date as release identifier: ${RELEASE}" fi -# Function to check if a file contains target
species +# Function to check if a file is already downloaded (compressed or decompressed) +check_file_downloaded() { + local filename=$1 + local decompressed_file="${filename%.gz}" + # Check if compressed or decompressed version exists + [ -f "${filename}" ] || [ -f "${decompressed_file}" ] +} + +# Function to check if a file contains target species sequences check_file_for_species() { local url=$1 local filename=$2 local temp_file="/tmp/check_${filename//\//_}" + # First check if file is already downloaded locally + if check_file_downloaded "${filename}"; then + # File already exists, check if it contains target species + # Check both compressed and decompressed versions + local decompressed_file="${filename%.gz}" + if [ -f "${filename}" ]; then + # Compressed file exists + if gunzip -c "${filename}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + return 0 # Contains target species + else + return 1 # Does not contain target species + fi + elif [ -f "${decompressed_file}" ]; then + # Decompressed file exists + if head -2000 "${decompressed_file}" 2>/dev/null | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + return 0 # Contains target species + else + return 1 # Does not contain target species + fi + fi + fi + + # File not downloaded yet, download first 500KB to check # Download first 500KB (enough to get many sequence headers) # This should be sufficient to identify the species in most cases if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then # Try to decompress and check for species names - if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then + # Check for: Homo sapiens (人), Mus musculus (小鼠), Drosophila melanogaster (果蝇), Saccharomyces cerevisiae (酵母) + if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus 
musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then rm -f "${temp_file}" return 0 # Contains target species else @@ -84,39 +116,57 @@ check_file_for_species() { # Download based on type case ${DOWNLOAD_TYPE} in - human_mouse) - echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..." - echo "This will check each file to see if it contains human or mouse sequences..." - category="vertebrate_mammalian" - echo "Checking files in ${category} category..." + human_mouse_drosophila_yeast) + echo "Downloading RefSeq sequences for Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal size)..." + echo "This will check each file to see if it contains target species sequences..." - # Get list of files and save to temp file to avoid subshell issues - curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ - grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt - - file_count=0 - download_count=0 + # Check multiple categories: vertebrate_mammalian (人、小鼠), invertebrate (果蝇), fungi (酵母) + categories="vertebrate_mammalian invertebrate fungi" + total_file_count=0 + total_download_count=0 - while read filename; do - file_count=$((file_count + 1)) - url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" - echo -n "[${file_count}] Checking ${filename}... " + for category in ${categories}; do + echo "Checking files in ${category} category..." - if check_file_for_species "${url}" "${filename}"; then - echo "✓ contains target species, downloading..." 
- download_count=$((download_count + 1)) - wget -c -q --show-progress "${url}" || { - echo "Warning: Failed to download ${filename}" - } - else - echo "✗ skipping (no human/mouse data)" - fi - done < /tmp/refseq_files.txt + # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + file_count=0 + download_count=0 + + while read filename; do + file_count=$((file_count + 1)) + total_file_count=$((total_file_count + 1)) + url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" + echo -n "[${total_file_count}] Checking ${category}/${filename}... " + + if check_file_for_species "${url}" "${filename}"; then + # Check if file is already downloaded + if check_file_downloaded "${filename}"; then + echo "✓ already downloaded (contains target species)" + download_count=$((download_count + 1)) + total_download_count=$((total_download_count + 1)) + else + echo "✓ contains target species, downloading..." + download_count=$((download_count + 1)) + total_download_count=$((total_download_count + 1)) + wget -c -q --show-progress "${url}" || { + echo "Warning: Failed to download ${filename}" + } + fi + else + echo "✗ skipping (no target species data)" + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt + echo " ${category}: Checked ${file_count} files, downloaded ${download_count} files." + done - rm -f /tmp/refseq_files.txt echo "" - echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences." + echo "Summary: Checked ${total_file_count} files total, downloaded ${total_download_count} files containing target species (human, mouse, fruit fly, yeast)." ;; representative) echo "Downloading RefSeq representative sequences (recommended, smaller size)..." 
@@ -124,52 +174,76 @@ case ${DOWNLOAD_TYPE} in # Note: You can modify this list based on your specific requirements for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do echo "Downloading ${category} sequences..." + # Get list of files and save to temp file to avoid subshell issues curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else echo " Downloading ${filename}..." wget -c -q --show-progress \ "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { echo "Warning: Failed to download ${filename}" } - done + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt done ;; complete) echo "Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..." + # Get list of files and save to temp file to avoid subshell issues curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_complete.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else echo " Downloading ${filename}..." wget -c -q --show-progress \ "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { echo "Warning: Failed to download ${filename}" } - done + fi + done < /tmp/refseq_files_complete.txt + + rm -f /tmp/refseq_files_complete.txt ;; all) echo "Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..." 
# Download genomic sequences from all categories for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do echo "Downloading ${category} genomic sequences..." + # Get list of files and save to temp file to avoid subshell issues curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else echo " Downloading ${filename}..." wget -c -q --show-progress \ "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { echo "Warning: Failed to download ${filename}" } - done + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt done ;; *) echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" - echo "Usage: $0 [human_mouse|representative|complete|all]" - echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)" + echo "Usage: $0 [human_mouse_drosophila_yeast|representative|complete|all]" + echo " human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal)" echo " representative: Download major categories (recommended)" echo " complete: Download all complete genomic sequences (very large)" echo " all: Download all genomic sequences (extremely large)" diff --git a/examples/search/search_dna/search_dna.sh b/examples/search/search_dna/search_dna.sh new file mode 100644 index 00000000..ef51281d --- /dev/null +++ b/examples/search/search_dna/search_dna.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_dna/search_dna_config.yaml + diff --git a/examples/search/search_dna/search_dna_config.yaml 
b/examples/search/search_dna/search_dna_config.yaml new file mode 100644 index 00000000..db87b16e --- /dev/null +++ b/examples/search/search_dna/search_dna_config.yaml @@ -0,0 +1,31 @@ +global_params: + working_dir: cache + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + graph_backend: kuzu # graph database backend, support: kuzu, networkx + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + save_output: true + params: + data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + ncbi_params: + email: test@example.com # NCBI requires an email address + tool: GraphGen # tool name for NCBI API + use_local_blast: true # whether to use local blast for DNA search + local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension) + threshold: 0.01 # E-value threshold for BLAST search diff --git a/examples/search/search_dna_config.yaml b/examples/search/search_dna_config.yaml deleted file mode 100644 index f53a5eb8..00000000 --- a/examples/search/search_dna_config.yaml +++ /dev/null @@ -1,17 +0,0 @@ -pipeline: - - name: read_step - op_key: read - params: - input_file: resources/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See resources/input_examples for examples - - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step - params: - data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral - ncbi_params: - email: test@example.com # NCBI requires an email address - tool: GraphGen # tool name for NCBI API - use_local_blast: true # whether to use local blast for DNA search - local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension) - diff --git a/examples/search/search_protein/README.md b/examples/search/search_protein/README.md new file mode 100644 index 00000000..e470a936 --- /dev/null +++ b/examples/search/search_protein/README.md @@ -0,0 +1,80 @@ +# Search Protein Sequences + +This example demonstrates how to search protein sequences from UniProt database using BLAST. + +## Overview + +The protein search pipeline reads protein sequence queries and searches against UniProt database to find similar sequences and retrieve associated metadata. + +## Quick Start + +### 1. Build Local BLAST Database (Optional) + +If you want to use local BLAST for faster searches, first build the database: + +```bash +./build_db.sh +``` + +The script will download UniProt Swiss-Prot database and create a BLAST database. You can configure the download mode: +- `sprot` (default): Download only Swiss-Prot (high quality, curated) +- `full`: Download both Swiss-Prot and TrEMBL (complete database) + +The script will create a BLAST database in `${RELEASE}/` directory. + +### 2. 
Configure Search Parameters + +Edit `search_protein_config.yaml` to set: + +- **Input file path**: Set the path to your protein sequence queries +- **UniProt parameters**: + - `use_local_blast`: Set to `true` if you have a local BLAST database + - `local_blast_db`: Path to your local BLAST database (format: `/path/to/${RELEASE}/uniprot_sprot`) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_protein_demo.jsonl + +data_sources: [uniprot] +uniprot_params: + use_local_blast: true + local_blast_db: /your_path/2024_01/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) +``` + +### 3. Run the Search + +```bash +./search_uniprot.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_protein/search_protein_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with protein sequence queries: + +```jsonl +{"type": "protein", "content": "P01308"} +{"type": "protein", "content": "insulin"} +{"type": "protein", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +``` + +## Output + +The search results will be saved in the output directory with matched sequences and metadata from UniProt. 
+ +## Notes + +- **Local BLAST** provides faster searches and doesn't require internet connection during search +- **Swiss-Prot** is recommended for high-quality, curated protein sequences +- **TrEMBL** contains automatically annotated sequences (larger database) +- The merged database (`uniprot_${RELEASE}`) contains both Swiss-Prot and TrEMBL +- Adjust `max_concurrent` based on your system resources and API rate limits diff --git a/examples/search/search_protein/build_db.sh b/examples/search/search_protein/build_db.sh new file mode 100755 index 00000000..da4c2b4b --- /dev/null +++ b/examples/search/search_protein/build_db.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +set -e + +# Downloads the latest release of UniProt, putting it in a release-specific directory. +# Creates associated BLAST databases. +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +echo "Downloading RELEASE.metalink..." +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/RELEASE.metalink" + +# Extract the release name (like 2017_10 or 2017_1) +# Use sed for cross-platform compatibility (works on both macOS and Linux) +RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1) + +echo "UniProt release: ${RELEASE}" +echo "" + +# Download Swiss-Prot (always needed) +echo "Downloading uniprot_sprot.fasta.gz..." +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" + +# Download TrEMBL only if full mode +if [ "${DOWNLOAD_MODE}" = "full" ]; then + echo "Downloading uniprot_trembl.fasta.gz..." + wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" +fi + +# Download metadata files +echo "Downloading metadata files..." 
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/reldate.txt" +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/README" +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/LICENSE" + +cd .. + +mkdir -p ${RELEASE} +mv ${DOWNLOAD_TMP}/* ${RELEASE} +rmdir ${DOWNLOAD_TMP} + +cd ${RELEASE} + +echo "" +echo "Extracting files..." +gunzip uniprot_sprot.fasta.gz + +if [ "${DOWNLOAD_MODE}" = "full" ]; then + gunzip uniprot_trembl.fasta.gz + echo "Merging Swiss-Prot and TrEMBL..." + cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta +fi + +echo "" +echo "Building BLAST databases..." + +# Always build Swiss-Prot database +makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot + +# Build full release database only if in full mode +if [ "${DOWNLOAD_MODE}" = "full" ]; then + makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} + makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl +fi + +cd .. + +echo "" +echo "BLAST databases created successfully!" 
+echo "Database locations:" +if [ "${DOWNLOAD_MODE}" = "sprot" ]; then + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo "" + echo "To use this database, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot" +else + echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" + echo "" + echo "To use these databases, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" +fi + diff --git a/examples/search/search_protein/search_protein_config.yaml b/examples/search/search_protein/search_protein_config.yaml new file mode 100644 index 00000000..6e6f085c --- /dev/null +++ b/examples/search/search_protein/search_protein_config.yaml @@ -0,0 +1,30 @@ +global_params: + working_dir: cache + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + graph_backend: kuzu # graph database backend, support: kuzu, networkx + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + save_output: true + params: + data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot + uniprot_params: + use_local_blast: true # whether to use local blast for uniprot search + local_blast_db: /path/to/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) + threshold: 0.01 # E-value threshold for BLAST search diff --git a/examples/search/search_protein/search_uniprot.sh b/examples/search/search_protein/search_uniprot.sh new file mode 100644 index 00000000..627735a0 --- /dev/null +++ b/examples/search/search_protein/search_uniprot.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_protein/search_protein_config.yaml diff --git a/examples/search/search_protein_config.yaml b/examples/search/search_protein_config.yaml deleted file mode 100644 index bfbf84eb..00000000 --- a/examples/search/search_protein_config.yaml +++ /dev/null @@ -1,15 +0,0 @@ -pipeline: - - name: read_step - op_key: read - params: - input_file: resources/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See resources/input_examples for examples - - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step - params: - data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot - uniprot_params: - use_local_blast: true # whether to use local blast for uniprot search - local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot - # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) diff --git a/examples/search/search_rna.sh b/examples/search/search_rna.sh deleted file mode 100644 index 9243d6be..00000000 --- a/examples/search/search_rna.sh +++ /dev/null @@ -1,2 +0,0 @@ -python3 -m graphgen.run \ ---config_file graphgen/configs/search_rna_config.yaml diff --git a/examples/search/search_rna/README.md b/examples/search/search_rna/README.md new file mode 100644 index 00000000..205c46b4 --- /dev/null +++ b/examples/search/search_rna/README.md @@ -0,0 +1,80 @@ +# Search RNA Sequences + +This example demonstrates how to search RNA sequences from RNAcentral database using BLAST. + +## Overview + +The RNA search pipeline reads RNA sequence queries and searches against RNAcentral database to find similar sequences and retrieve associated metadata. + +## Quick Start + +### 1. Build Local BLAST Database (Optional) + +If you want to use local BLAST for faster searches, first build the database: + +```bash +./build_db.sh [all|list|selected|database_name...] +``` + +Options: +- `all`: Download complete active database (~8.4G compressed) +- `list`: List all available database subsets +- `selected`: Download predefined database subsets (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase, rfam) +- `database_name`: Download specific database subset (e.g., refseq, rfam, mirbase) + +The script will create a BLAST database in `rnacentral_${RELEASE}/` or `rnacentral_${DB_NAME}_${RELEASE}/` directory. + +### 2. 
Configure Search Parameters + +Edit `search_rna_config.yaml` to set: + +- **Input file path**: Set the path to your RNA sequence queries +- **RNAcentral parameters**: + - `use_local_blast`: Set to `true` if you have a local BLAST database + - `local_blast_db`: Path to your local BLAST database (without .nhr extension) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_rna_demo.jsonl + +data_sources: [rnacentral] +rnacentral_params: + use_local_blast: true + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD +``` + +### 3. Run the Search + +```bash +./search_rna.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_rna/search_rna_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with RNA sequence queries: + +```jsonl +{"type": "rna", "content": "miR-21"} +{"type": "rna", "content": ">query\nAUGCAUGC..."} +{"type": "rna", "content": "AUGCAUGC..."} +``` + +## Output + +The search results will be saved in the output directory with matched sequences and metadata from RNAcentral. + +## Notes + +- **Local BLAST** provides faster searches and doesn't require internet connection during search +- The complete RNAcentral database is large (~8.4G compressed), consider using specific database subsets for smaller downloads +- RNAcentral uses URS IDs (e.g., URS000149A9AF) which match the online RNAcentral API database +- Adjust `max_concurrent` based on your system resources and API rate limits diff --git a/examples/search/search_rna/build_db.sh b/examples/search/search_rna/build_db.sh new file mode 100755 index 00000000..af688ac1 --- /dev/null +++ b/examples/search/search_rna/build_db.sh @@ -0,0 +1,433 @@ +#!/bin/bash + +set -e + +# Downloads RNAcentral sequences and creates BLAST databases. 
+# This script downloads the RNAcentral active database, which is the same +# data source used for online RNAcentral searches, ensuring consistency +# between local and online search results. +# +# RNAcentral is a comprehensive database of non-coding RNA sequences that +# integrates data from multiple expert databases including RefSeq, Rfam, etc. +# +# Usage: ./build_db.sh [all|list|selected|database_name...] +# all: Download complete active database (~8.4G compressed) +# list: List all available database subsets +# selected (default): Download predefined database subsets (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase, rfam) +# database_name: Download specific database subset (e.g., refseq, rfam, mirbase) +# database_name1 database_name2 ...: Download multiple database subsets +# +# Available database subsets (examples): +# - refseq.fasta (~98M): RefSeq RNA sequences +# - rfam.fasta (~1.5G): Rfam RNA families +# - mirbase.fasta (~10M): microRNA sequences +# - ensembl_gencode.fasta (~337M): Ensembl/GENCODE annotations (human) +# - gtrnadb.fasta (~38M): tRNA sequences +# - lncbase.fasta (~106K): Human lncRNA database +# - See "list" option for complete list +# +# The complete "active" database contains all sequences from all expert databases. +# Using a specific database subset provides a smaller, focused database.
+# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +# RNAcentral base URL (using EBI HTTPS) +# NOTE: RNAcentral only has one official mirror at EBI +RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral" +RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release" +RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences" +RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database" + +# Parse command line arguments +DB_SELECTION=${1:-selected} + +# Predefined database list for "selected" option +SELECTED_DATABASES=("ensembl_gencode" "mirbase" "gtrnadb" "refseq" "lncbase" "rfam") + +# List available databases if requested +if [ "${DB_SELECTION}" = "list" ]; then + echo "Available RNAcentral database subsets:" + echo "" + echo "Fetching list from RNAcentral..." + listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/") + echo "${listing}" | \ + grep -oE '' | \ + sed 's///' | \ + sort | \ + while read db; do + size=$(echo "${listing}" | grep -A 1 "${db}" | grep -oE '[0-9.]+[GMK]' | head -1 || echo "unknown") + echo " - ${db%.fasta}: ${size}" + done + echo "" + echo "Usage: $0 [all|list|selected|database_name...]" + echo " Example: $0 refseq # Download only RefSeq sequences (~98M)" + echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)" + echo " Example: $0 selected # Download predefined databases (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase, rfam)" + echo " Example: $0 refseq mirbase # Download multiple databases" + echo " Example: $0 all # Download complete active database (~8.4G)" + exit 0 +fi + +# Determine which databases to download +if [ "${DB_SELECTION}" = "selected" ]; then + # Use predefined database list + DATABASES=("${SELECTED_DATABASES[@]}") + echo "Downloading selected databases: ${DATABASES[*]}" +elif [ "${DB_SELECTION}" = "all" ]; then + # Single 
database mode (all) + DATABASES=("all") +else + # Multiple databases provided as arguments + DATABASES=("$@") +fi + +# Get RNAcentral release version from release notes (once for all databases) +echo "Getting RNAcentral release information..." +RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt" +RELEASE_NOTES_TMP=$(mktemp) +wget -q "${RELEASE_NOTES_URL}" -O "${RELEASE_NOTES_TMP}" 2>/dev/null || { + echo "Warning: Could not download release notes, using current date as release identifier" + RELEASE=$(date +%Y%m%d) +} + +if [ -f "${RELEASE_NOTES_TMP}" ] && [ -s "${RELEASE_NOTES_TMP}" ]; then + # Try to extract version from release notes (first line usually contains version info) + RELEASE=$(head -1 "${RELEASE_NOTES_TMP}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.') + rm -f "${RELEASE_NOTES_TMP}" +fi + +if [ -z "${RELEASE}" ]; then + RELEASE=$(date +%Y%m%d) + echo "Using date as release identifier: ${RELEASE}" +else + echo "RNAcentral release: ${RELEASE}" +fi + +# Process each database +DB_COUNT=${#DATABASES[@]} +DB_INDEX=0 + +for DB_SELECTION in "${DATABASES[@]}"; do + DB_INDEX=$((DB_INDEX + 1)) + echo "" + echo "==========================================" + echo "Processing database ${DB_INDEX}/${DB_COUNT}: ${DB_SELECTION}" + echo "==========================================" + echo "" + + # Check if database already exists and is complete + # First check with current release version + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + DB_NAME="rnacentral" + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + else + OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}" + DB_NAME="${DB_SELECTION}" + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + fi + + # Check if BLAST database already exists with current release + if [ -d "${OUTPUT_DIR}" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nhr" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nin" ]; then + echo "✓ Database ${DB_SELECTION} already exists and appears complete: ${OUTPUT_DIR}/" + echo " BLAST 
database: ${OUTPUT_DIR}/${DB_OUTPUT_NAME}" + echo " Skipping download and database creation..." + continue + fi + + # Also check for any existing version of this database (e.g., different release dates) + EXISTING_DIR=$(ls -d rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + if [ -n "${EXISTING_DIR}" ] && [ "${DB_SELECTION}" != "all" ]; then + EXISTING_DB_NAME=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + if [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nhr" ] && [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nin" ]; then + echo "✓ Database ${DB_SELECTION} already exists (version ${EXISTING_DB_NAME}): ${EXISTING_DIR}/" + echo " BLAST database: ${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}" + echo " Skipping download and database creation..." + echo " Note: Using existing version ${EXISTING_DB_NAME} instead of ${RELEASE}" + continue + fi + fi + + # Better to use a stable DOWNLOAD_TMP name to support resuming downloads + DOWNLOAD_TMP="_downloading_rnacentral_${DB_SELECTION}" + mkdir -p ${DOWNLOAD_TMP} + cd ${DOWNLOAD_TMP} + + # Download RNAcentral FASTA file + if [ "${DB_SELECTION}" = "all" ]; then + # Download complete active database + FASTA_FILE="rnacentral_active.fasta.gz" + DB_NAME="rnacentral" + echo "Downloading RNAcentral active sequences (~8.4G)..." 
+ echo " Contains sequences currently present in at least one expert database" + echo " Uses standard URS IDs (e.g., URS000149A9AF)" + echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency" + FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}" + IS_COMPRESSED=true + else + # Download specific database subset + DB_NAME="${DB_SELECTION}" + FASTA_FILE="${DB_SELECTION}.fasta" + echo "Downloading RNAcentral database subset: ${DB_SELECTION}" + echo " This is a subset of the active database from a specific expert database" + echo " File: ${FASTA_FILE}" + FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}" + IS_COMPRESSED=false + + # Check if database exists (use HTTP status code check for HTTPS) + HTTP_CODE=$(curl -s --max-time 10 -o /dev/null -w "%{http_code}" "${FASTA_URL}" 2>/dev/null | tail -1 || echo "000") + if ! echo "${HTTP_CODE}" | grep -q "^200$"; then + echo "Error: Database '${DB_SELECTION}' not found (HTTP code: ${HTTP_CODE})" + echo "Run '$0 list' to see available databases" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + fi + fi + + echo "Downloading from: ${FASTA_URL}" + echo "This may take a while depending on your internet connection..." + if [ "${DB_SELECTION}" = "all" ]; then + echo "File size is approximately 8-9GB, please be patient..." + else + echo "Downloading database subset..." + fi + + wget -c "${FASTA_URL}" || { + echo "Error: Failed to download RNAcentral FASTA file" + echo "Please check your internet connection and try again" + echo "URL: ${FASTA_URL}" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + } + + if [ ! -f "${FASTA_FILE}" ]; then + echo "Error: Downloaded file not found" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + fi + + cd .. 
+ + # Create release directory + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + else + OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}" + fi + mkdir -p ${OUTPUT_DIR} + mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true + rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + + cd ${OUTPUT_DIR} + + # Extract FASTA file if compressed + echo "Preparing RNAcentral sequences..." + if [ -f "${FASTA_FILE}" ]; then + if [ "${IS_COMPRESSED}" = "true" ]; then + echo "Decompressing ${FASTA_FILE}..." + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || { + echo "Error: Failed to decompress FASTA file" + cd .. + exit 1 + } + # Optionally remove the compressed file to save space + # rm "${FASTA_FILE}" + else + # File is not compressed, just copy/rename + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || { + echo "Error: Failed to copy FASTA file" + cd .. + exit 1 + } + fi + else + echo "Error: FASTA file not found" + cd .. + exit 1 + fi + + # Check if we have sequences + if [ ! -s "${OUTPUT_FASTA}" ]; then + echo "Error: FASTA file is empty" + cd .. + exit 1 + fi + + # Get file size for user information + FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1) + echo "FASTA file size: ${FILE_SIZE}" + + echo "Creating BLAST database..." + # Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) + # Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers, + # which matches the format expected by the RNACentralSearch class + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + makeblastdb -in "${OUTPUT_FASTA}" \ + -out "${DB_OUTPUT_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_${DB_NAME}_${RELEASE}" + + echo "" + echo "BLAST database created successfully!" 
+ echo "Database location: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "To use this database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "Note: The database files are:" + ls -lh ${DB_OUTPUT_NAME}.* | head -5 + echo "" + if [ "${DB_SELECTION}" = "all" ]; then + echo "This database uses RNAcentral IDs (URS...), which matches the online" + echo "RNAcentral search API, ensuring consistent results between local and online searches." + else + echo "This is a subset database from ${DB_SELECTION} expert database." + echo "For full coverage matching online API, use 'all' option." + fi + + cd .. +done + +echo "" +echo "==========================================" +echo "All databases processed successfully!" +echo "==========================================" +echo "" + +# If multiple databases were downloaded, offer to merge them +if [ ${#DATABASES[@]} -gt 1 ] && [ "${DATABASES[0]}" != "all" ]; then + echo "Multiple databases downloaded. Creating merged database for unified search..." + MERGED_DIR="rnacentral_merged_${RELEASE}" + mkdir -p ${MERGED_DIR} + cd ${MERGED_DIR} + + MERGED_FASTA="rnacentral_merged_${RELEASE}.fasta" + MERGED_FASTA_TMP="${MERGED_FASTA}.tmp" + echo "Combining FASTA files from all databases..." + echo " Note: Duplicate sequence IDs will be removed (keeping first occurrence)..." + + # Combine all FASTA files into a temporary file + # Find actual database directories (may have different release versions) + FOUND_ANY=false + for DB_SELECTION in "${DATABASES[@]}"; do + [ "${DB_SELECTION}" = "all" ] && continue + + # Try current release version first, then search for any existing version + OUTPUT_FASTA="../rnacentral_${DB_SELECTION}_${RELEASE}/${DB_SELECTION}_${RELEASE}.fasta" + [ ! 
-f "${OUTPUT_FASTA}" ] && { + EXISTING_DIR=$(ls -d ../rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + [ -n "${EXISTING_DIR}" ] && { + EXISTING_VERSION=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + OUTPUT_FASTA="${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_VERSION}.fasta" + } + } + + if [ -f "${OUTPUT_FASTA}" ]; then + echo " Adding ${DB_SELECTION} sequences..." + cat "${OUTPUT_FASTA}" >> "${MERGED_FASTA_TMP}" + FOUND_ANY=true + else + echo " Warning: Could not find FASTA file for ${DB_SELECTION}" + fi + done + + # Validate that we have files to merge + if [ "${FOUND_ANY}" = "false" ] || [ ! -s "${MERGED_FASTA_TMP}" ]; then + echo "Error: No FASTA files found to merge" + cd .. + rm -rf ${MERGED_DIR} + exit 1 + fi + + # Remove duplicates based on sequence ID (keeping first occurrence) + echo " Removing duplicate sequence IDs..." + awk ' + /^>/ { + # Process previous sequence if we have one + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq + seen[current_id] = 1 + } + # Start new sequence + current_header = $0 + current_id = substr($0, 2) + sub(/[ \t].*/, "", current_id) # Extract ID up to first space/tab + current_seq = "" + next + } + { + # Accumulate sequence data by concatenating lines + current_seq = current_seq $0 + } + END { + # Process last sequence + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq + } + } + ' "${MERGED_FASTA_TMP}" > "${MERGED_FASTA}" + rm -f "${MERGED_FASTA_TMP}" + + # Check if merged file was created and has content + if [ ! -s "${MERGED_FASTA}" ]; then + echo "Warning: Merged FASTA file is empty or not created" + cd .. + rm -rf ${MERGED_DIR} + else + FILE_SIZE=$(du -h "${MERGED_FASTA}" | cut -f1) + echo "Merged FASTA file size: ${FILE_SIZE}" + + echo "Creating merged BLAST database..." 
+ MERGED_DB_NAME="rnacentral_merged_${RELEASE}" + makeblastdb -in "${MERGED_FASTA}" \ + -out "${MERGED_DB_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_Merged_${RELEASE}" + + echo "" + echo "✓ Merged BLAST database created successfully!" + echo "Database location: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "To use the merged database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "Note: The merged database includes: ${DATABASES[*]}" + cd .. + fi +fi + +echo "" +echo "Summary of downloaded databases:" +for DB_SELECTION in "${DATABASES[@]}"; do + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + DB_NAME="rnacentral" + else + OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}" + DB_NAME="${DB_SELECTION}" + fi + if [ -d "${OUTPUT_DIR}" ]; then + echo " - ${DB_NAME}: ${OUTPUT_DIR}/" + fi +done + +if [ -d "rnacentral_merged_${RELEASE}" ]; then + echo " - merged (all databases): rnacentral_merged_${RELEASE}/" + echo "" + echo "💡 Recommendation: Use the merged database for searching across all databases." 
+fi + diff --git a/examples/search/search_rna/search_rna.sh b/examples/search/search_rna/search_rna.sh new file mode 100644 index 00000000..04206c17 --- /dev/null +++ b/examples/search/search_rna/search_rna.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_rna/search_rna_config.yaml + diff --git a/examples/search/search_rna/search_rna_config.yaml b/examples/search/search_rna/search_rna_config.yaml new file mode 100644 index 00000000..c19793e8 --- /dev/null +++ b/examples/search/search_rna/search_rna_config.yaml @@ -0,0 +1,29 @@ +global_params: + working_dir: cache + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + graph_backend: kuzu # graph database backend, support: kuzu, networkx + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + save_output: true + params: + data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + rnacentral_params: + use_local_blast: true # whether to use local blast for RNA search + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension) + threshold: 0.01 # E-value threshold for BLAST search diff --git a/examples/search/search_rna_config.yaml b/examples/search/search_rna_config.yaml deleted file mode 100644 index 10422988..00000000 --- a/examples/search/search_rna_config.yaml +++ /dev/null @@ -1,14 +0,0 @@ -pipeline: - - name: read_step - op_key: read - params: - input_file: resources/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See resources/input_examples for examples - - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step - params: - data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral - rnacentral_params: - use_local_blast: true # whether to use local blast for RNA search - local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension) diff --git a/examples/search/search_uniprot.sh b/examples/search/search_uniprot.sh deleted file mode 100644 index 8cb666c0..00000000 --- a/examples/search/search_uniprot.sh +++ /dev/null @@ -1,2 +0,0 @@ -python3 -m graphgen.run \ ---config_file graphgen/configs/search_protein_config.yaml diff --git a/graphgen/bases/base_operator.py b/graphgen/bases/base_operator.py index 300d3178..be4c737e 100644 --- a/graphgen/bases/base_operator.py +++ b/graphgen/bases/base_operator.py @@ -6,11 +6,12 @@ import pandas as pd import ray -from graphgen.utils import CURRENT_LOGGER_VAR, set_logger - class BaseOperator(ABC): def __init__(self, working_dir: str = "cache", op_name: str = None): + # lazy import to avoid circular import + from graphgen.utils import set_logger + log_dir = os.path.join(working_dir, "logs") self.op_name = op_name or self.__class__.__name__ @@ -39,6 +40,9 @@ def __init__(self, working_dir: str = "cache", op_name: str = None): def __call__( self, batch: pd.DataFrame ) -> Union[pd.DataFrame, Iterable[pd.DataFrame]]: + # lazy import to avoid circular import + from graphgen.utils import CURRENT_LOGGER_VAR + logger_token = CURRENT_LOGGER_VAR.set(self.logger) try: result = self.process(batch) diff --git a/graphgen/bases/base_reader.py b/graphgen/bases/base_reader.py index 5d2af735..ba72f410 100644 --- a/graphgen/bases/base_reader.py +++ b/graphgen/bases/base_reader.py @@ -39,6 +39,8 @@ def _should_keep_item(self, item: Dict[str, Any]) -> bool: "table", "equation", "protein", + "dna", + "rna", 
], f"Unsupported item type: {item_type}" if item_type == "text": content = item.get(self.text_column, "").strip() diff --git a/graphgen/bases/base_searcher.py b/graphgen/bases/base_searcher.py index f680ab04..61845e32 100644 --- a/graphgen/bases/base_searcher.py +++ b/graphgen/bases/base_searcher.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, List +from typing import Any, Dict, Optional class BaseSearcher(ABC): @@ -8,11 +8,11 @@ class BaseSearcher(ABC): """ @abstractmethod - async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]: + def search(self, query: str, **kwargs) -> Optional[Dict[str, Any]]: """ Search for data based on the given query. :param query: The searcher query. :param kwargs: Additional keyword arguments for the searcher. - :return: List of dictionaries containing the searcher results. + :return: Dictionary containing the searcher result, or None if not found. """ diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index f453c700..5fb6ffb1 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -1,10 +1,7 @@ -import asyncio import os import re import subprocess import tempfile -from concurrent.futures import ThreadPoolExecutor -from functools import lru_cache from http.client import IncompleteRead from typing import Dict, Optional @@ -22,15 +19,6 @@ from graphgen.utils import logger -@lru_cache(maxsize=None) -def _get_pool(): - return ThreadPoolExecutor(max_workers=10) - - -# ensure only one NCBI request at a time -_ncbi_lock = asyncio.Lock() - - class NCBISearch(BaseSearcher): """ NCBI Search client to search DNA/GenBank/Entrez databases. @@ -49,6 +37,8 @@ def __init__( email: str = "email@example.com", api_key: str = "", tool: str = "GraphGen", + blast_num_threads: int = 4, + threshold: float = 0.01, ): """ Initialize the NCBI Search client. 
@@ -59,8 +49,8 @@ def __init__( email (str): Email address for NCBI API requests. api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. tool (str): Tool name for NCBI API requests. + blast_num_threads (int): Number of threads for BLAST search. """ - super().__init__() Entrez.timeout = 60 # 60 seconds timeout Entrez.email = email Entrez.tool = tool @@ -70,9 +60,23 @@ def __init__( Entrez.sleep_between_tries = 5 self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db - if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): - logger.error("Local BLAST database files not found. Please check the path.") - self.use_local_blast = False + self.blast_num_threads = blast_num_threads + self.threshold = threshold + if self.use_local_blast: + # Check for single-file database (.nhr) or multi-file database (.00.nhr) + db_exists = os.path.isfile(f"{self.local_blast_db}.nhr") or os.path.isfile( + f"{self.local_blast_db}.00.nhr" + ) + if not db_exists: + logger.error( + "Local BLAST database files not found. Please check the path." 
+ ) + logger.error( + "Expected: %s.nhr or %s.00.nhr", + self.local_blast_db, + self.local_blast_db, + ) + self.use_local_blast = False @staticmethod def _nested_get(data: dict, *keys, default=None): @@ -84,17 +88,21 @@ def _nested_get(data: dict, *keys, default=None): return data @staticmethod - def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]: + def _infer_molecule_type_detail( + accession: Optional[str], gene_type: Optional[int] = None + ) -> Optional[str]: """Infer molecule_type_detail from accession prefix or gene type.""" if accession: - if accession.startswith(("NM_", "XM_")): - return "mRNA" - if accession.startswith(("NC_", "NT_")): - return "genomic DNA" - if accession.startswith(("NR_", "XR_")): - return "RNA" - if accession.startswith("NG_"): - return "genomic region" + # Map accession prefixes to molecule types + prefix_map = { + ("NM_", "XM_"): "mRNA", + ("NC_", "NT_"): "genomic DNA", + ("NR_", "XR_"): "RNA", + ("NG_",): "genomic region", + } + for prefixes, mol_type in prefix_map.items(): + if accession.startswith(prefixes): + return mol_type # Fallback: infer from gene type if available if gene_type is not None: gene_type_map = { @@ -126,20 +134,25 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: gene_synonyms = [] if isinstance(synonyms_raw, list): for syn in synonyms_raw: - gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn)) + gene_synonyms.append( + syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn) + ) elif synonyms_raw: gene_synonyms.append(str(synonyms_raw)) # Extract location info label = locus.get("Gene-commentary_label", "") - chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None + chromosome_match = ( + re.search(r"Chromosome\s+(\S+)", str(label)) if label else None + ) seq_interval = self._nested_get( locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={} ) 
genomic_location = ( f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}" - if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to') + if seq_interval.get("Seq-interval_from") + and seq_interval.get("Seq-interval_to") else None ) @@ -153,7 +166,6 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: None, ) # Fallback: if no type 3 accession, try any available accession - # This is needed for genes that don't have mRNA transcripts but have other sequence records if not representative_accession: representative_accession = next( ( @@ -170,7 +182,8 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: comment.get("Gene-commentary_comment") for comment in data.get("Entrezgene_comments", []) if isinstance(comment, dict) - and "function" in str(comment.get("Gene-commentary_heading", "")).lower() + and "function" + in str(comment.get("Gene-commentary_heading", "")).lower() ), None, ) @@ -194,7 +207,9 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: "5": "snRNA", "6": "ncRNA", "7": "other", - }.get(str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}"), + }.get( + str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}" + ), "chromosome": chromosome_match.group(1) if chromosome_match else None, "genomic_location": genomic_location, "function": function, @@ -209,25 +224,33 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: "_representative_accession": representative_accession, } - def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + def get_by_gene_id( + self, gene_id: str, preferred_accession: Optional[str] = None + ) -> Optional[dict]: """Get gene information by Gene ID.""" + def 
_extract_metadata_from_genbank(result: dict, accession: str): """Extract metadata from GenBank format (title, features, organism, etc.).""" - with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle: + with Entrez.efetch( + db="nuccore", id=accession, rettype="gb", retmode="text" + ) as handle: record = SeqIO.read(handle, "genbank") result["title"] = record.description result["molecule_type_detail"] = ( - "mRNA" if accession.startswith(("NM_", "XM_")) else - "genomic DNA" if accession.startswith(("NC_", "NT_")) else - "RNA" if accession.startswith(("NR_", "XR_")) else - "genomic region" if accession.startswith("NG_") else "N/A" + self._infer_molecule_type_detail(accession) or "N/A" ) for feature in record.features: if feature.type == "source": - if 'chromosome' in feature.qualifiers: - result["chromosome"] = feature.qualifiers['chromosome'][0] + if "chromosome" in feature.qualifiers: + result["chromosome"] = feature.qualifiers["chromosome"][0] if feature.location: start = int(feature.location.start) + 1 @@ -236,48 +259,91 @@ def _extract_metadata_from_genbank(result: dict, accession: str): break - if not result.get("organism") and 'organism' in record.annotations: - result["organism"] = record.annotations['organism'] + if not result.get("organism") and "organism" in record.annotations: + result["organism"] = record.annotations["organism"] return result def _extract_sequence_from_fasta(result: dict, accession: str): """Extract sequence from FASTA format (more reliable than GenBank for CON-type records).""" try: - with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle: + with Entrez.efetch( + db="nuccore", id=accession, rettype="fasta", retmode="text" + ) as fasta_handle: fasta_record = SeqIO.read(fasta_handle, "fasta") result["sequence"] = str(fasta_record.seq) result["sequence_length"] = len(fasta_record.seq) except Exception as fasta_exc: logger.warning( "Failed to extract sequence from 
accession %s using FASTA format: %s", - accession, fasta_exc + accession, + fasta_exc, ) result["sequence"] = None result["sequence_length"] = None return result + def _extract_sequence(result: dict, accession: str): + """ + Extract sequence using the appropriate method based on configuration. + If use_local_blast=True, use local database. Otherwise, use NCBI API. + Always fetches sequence (no option to skip). + """ + # If using local BLAST, use local database + if self.use_local_blast: + sequence = self._extract_sequence_from_local_db(accession) + + if sequence: + result["sequence"] = sequence + result["sequence_length"] = len(sequence) + else: + # Failed to extract from local DB, set to None (no fallback to API) + result["sequence"] = None + result["sequence_length"] = None + logger.warning( + "Failed to extract sequence from local DB for accession %s. " + "Not falling back to NCBI API as use_local_blast=True.", + accession, + ) + else: + # Use NCBI API to fetch sequence + result = _extract_sequence_from_fasta(result, accession) + + return result + try: with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle: gene_record = Entrez.read(handle) - if not gene_record: - return None - result = self._gene_record_to_dict(gene_record, gene_id) - if accession := (preferred_accession or result.get("_representative_accession")): - result = _extract_metadata_from_genbank(result, accession) - result = _extract_sequence_from_fasta(result, accession) + if not gene_record: + return None + + result = self._gene_record_to_dict(gene_record, gene_id) + + if accession := ( + preferred_accession or result.get("_representative_accession") + ): + result = _extract_metadata_from_genbank(result, accession) + # Extract sequence using appropriate method + result = _extract_sequence(result, accession) - result.pop("_representative_accession", None) - return result + result.pop("_representative_accession", None) + return result except (RequestException, IncompleteRead): raise 
except Exception as exc: logger.error("Gene ID %s not found: %s", gene_id, exc) return None + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) def get_by_accession(self, accession: str) -> Optional[dict]: """Get sequence information by accession number.""" + def _extract_gene_id(link_handle): """Extract GeneID from elink results.""" links = Entrez.read(link_handle) @@ -301,9 +367,11 @@ def _extract_gene_id(link_handle): return None result = self.get_by_gene_id(gene_id, preferred_accession=accession) + if result: result["id"] = accession result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + return result except (RequestException, IncompleteRead): raise @@ -311,6 +379,12 @@ def _extract_gene_id(link_handle): logger.error("Accession %s not found: %s", accession, exc) return None + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) def get_best_hit(self, keyword: str) -> Optional[dict]: """Search NCBI Gene database with a keyword and return the best hit.""" if not keyword.strip(): @@ -318,33 +392,113 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: try: for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]: - with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle: + with Entrez.esearch( + db="gene", term=search_term, retmax=1, sort="relevance" + ) as search_handle: search_results = Entrez.read(search_handle) - if len(gene_id := search_results.get("IdList", [])) > 0: - return self.get_by_gene_id(gene_id) + + if len(gene_id := search_results.get("IdList", [])) > 0: + result = self.get_by_gene_id(gene_id[0]) + return result except (RequestException, IncompleteRead): raise except Exception as e: logger.error("Keyword %s not found: 
%s", keyword, e) return None + def _extract_sequence_from_local_db(self, accession: str) -> Optional[str]: + """Extract sequence from local BLAST database using blastdbcmd.""" + try: + cmd = [ + "blastdbcmd", + "-db", + self.local_blast_db, + "-entry", + accession, + "-outfmt", + "%s", # Only sequence, no header + ] + sequence = subprocess.check_output( + cmd, + text=True, + timeout=10, # 10 second timeout for local extraction + stderr=subprocess.DEVNULL, + ).strip() + return sequence if sequence else None + except subprocess.TimeoutExpired: + logger.warning( + "Timeout extracting sequence from local DB for accession %s", accession + ) + return None + except Exception as exc: + logger.warning( + "Failed to extract sequence from local DB for accession %s: %s", + accession, + exc, + ) + return None + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """Perform local BLAST search using local BLAST database.""" + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. 
+ """ try: - with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".fa", delete=False + ) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance cmd = [ - "blastn", "-db", self.local_blast_db, "-query", tmp_name, - "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" + "blastn", + "-db", + self.local_blast_db, + "-query", + tmp_name, + "-evalue", + str(threshold), + "-max_target_seqs", + "1", + "-num_threads", + str(self.blast_num_threads), + "-outfmt", + "6 sacc", # Only accession, tab-separated ] - logger.debug("Running local blastn: %s", " ".join(cmd)) - out = subprocess.check_output(cmd, text=True).strip() + logger.debug( + "Running local blastn (threads=%d): %s", + self.blast_num_threads, + " ".join(cmd), + ) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL, # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + os.remove(tmp_name) return out.split("\n", maxsplit=1)[0] if out else None except Exception as exc: logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if "tmp_name" in locals(): + os.remove(tmp_name) + except Exception: + pass return None def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: @@ -358,8 +512,9 @@ def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: seq = sequence.strip().replace(" ", "").replace("\n", "") return seq if 
re.fullmatch(r"[ATCGN]+", seq, re.I) else None - - def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]: + def _process_network_blast_result( + blast_record, seq: str, threshold: float + ) -> Optional[dict]: """Process network BLAST result and return dictionary or None.""" if not blast_record.alignments: logger.info("No BLAST hits found for the given sequence.") @@ -383,7 +538,9 @@ def _process_network_blast_result(blast_record, seq: str, threshold: float) -> O "title": best_alignment.title, "sequence_length": len(seq), "e_value": best_hsp.expect, - "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "identity": best_hsp.identities / best_hsp.align_length + if best_hsp.align_length > 0 + else 0, "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", } @@ -393,15 +550,31 @@ def _process_network_blast_result(blast_record, seq: str, threshold: float) -> O return None # Try local BLAST first if enabled - if self.use_local_blast and (accession := self._local_blast(seq, threshold)): - logger.debug("Local BLAST found accession: %s", accession) - return self.get_by_accession(accession) + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + + if accession: + logger.debug("Local BLAST found accession: %s", accession) + # When using local BLAST, skip sequence fetching by default (faster, fewer API calls) + # Sequence is already known from the query, so we only need metadata + result = self.get_by_accession(accession) + return result + + logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+ ) + return None - # Fall back to network BLAST + # Fall back to network BLAST only if local BLAST is not enabled logger.debug("Falling back to NCBIWWW.qblast") - - with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle: - return _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + with NCBIWWW.qblast( + "blastn", "nr", seq, hitlist_size=1, expect=threshold + ) as result_handle: + result = _process_network_blast_result( + NCBIXML.read(result_handle), seq, threshold + ) + return result except (RequestException, IncompleteRead): raise except Exception as e: @@ -414,8 +587,9 @@ def _process_network_blast_result(blast_record, seq: str, threshold: float) -> O retry=retry_if_exception_type((RequestException, IncompleteRead)), reraise=True, ) - async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]: + def search(self, query: str, threshold: float = None, **kwargs) -> Optional[Dict]: """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.""" + threshold = threshold or self.threshold if not query or not isinstance(query, str): logger.error("Empty or non-string input.") return None @@ -423,19 +597,21 @@ async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optiona query = query.strip() logger.debug("NCBI search query: %s", query) - loop = asyncio.get_running_loop() - - # limit concurrent requests (NCBI rate limit: max 3 requests per second) - async with _ncbi_lock: - # Auto-detect query type and execute in thread pool - if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) - elif re.fullmatch(r"^\d+$", query): - result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) - elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) - else: - 
result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + # Auto-detect query type and execute + # All methods call NCBI API (rate limit: max 3 requests per second) + # Even if get_by_fasta uses local BLAST, it still calls get_by_accession which needs API + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + # FASTA sequence + result = self.get_by_fasta(query, threshold) + elif re.fullmatch(r"^\d+$", query): + # Gene ID + result = self.get_by_gene_id(query) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + # Accession + result = self.get_by_accession(query) + else: + # Keyword + result = self.get_best_hit(query) if result: result["_search_query"] = query diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 58c5e86e..6b7f77a2 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -1,15 +1,11 @@ -import asyncio +import hashlib import os import re import subprocess -from concurrent.futures import ThreadPoolExecutor -from functools import lru_cache import tempfile -from typing import Dict, Optional, List, Any, Set +from typing import Any, Dict, List, Optional, Set -import hashlib import requests -import aiohttp from tenacity import ( retry, retry_if_exception_type, @@ -21,10 +17,6 @@ from graphgen.utils import logger -@lru_cache(maxsize=None) -def _get_pool(): - return ThreadPoolExecutor(max_workers=10) - class RNACentralSearch(BaseSearcher): """ RNAcentral Search client to search RNA databases. 
@@ -35,12 +27,22 @@ class RNACentralSearch(BaseSearcher): API Documentation: https://rnacentral.org/api/v1 """ - def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db"): - super().__init__() + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "rna_db", + api_timeout: int = 30, + blast_num_threads: int = 4, + threshold: float = 0.01, + ): self.base_url = "https://rnacentral.org/api/v1" self.headers = {"Accept": "application/json"} self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db + self.api_timeout = api_timeout + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + self.threshold = threshold # E-value threshold for BLAST search + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): logger.error("Local BLAST database files not found. Please check the path.") self.use_local_blast = False @@ -49,7 +51,7 @@ def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db" def _rna_data_to_dict( rna_id: str, rna_data: Dict[str, Any], - xrefs_data: Optional[List[Dict[str, Any]]] = None + xrefs_data: Optional[List[Dict[str, Any]]] = None, ) -> Dict[str, Any]: organisms, gene_names, so_terms = set(), set(), set() modifications: List[Any] = [] @@ -58,7 +60,8 @@ def _rna_data_to_dict( acc = xref.get("accession", {}) if s := acc.get("species"): organisms.add(s) - if g := acc.get("gene", "").strip(): + gene_value = acc.get("gene") + if isinstance(gene_value, str) and (g := gene_value.strip()): gene_names.add(g) if m := xref.get("modifications"): modifications.extend(m) @@ -137,7 +140,9 @@ def _calculate_md5(sequence: str) -> str: # Normalize sequence normalized_seq = sequence.replace("U", "T").replace("u", "t").upper() if not re.fullmatch(r"[ATCGN]+", normalized_seq): - raise ValueError(f"Invalid sequence characters after normalization: {normalized_seq[:50]}...") + raise ValueError( + f"Invalid sequence characters after 
normalization: {normalized_seq[:50]}..." + ) return hashlib.md5(normalized_seq.encode("ascii")).hexdigest() @@ -151,12 +156,21 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]: url = f"{self.base_url}/rna/{rna_id}" url += "?flat=true" - resp = requests.get(url, headers=self.headers, timeout=30) + resp = requests.get(url, headers=self.headers, timeout=self.api_timeout) resp.raise_for_status() rna_data = resp.json() xrefs_data = rna_data.get("xrefs", []) - return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + result = self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + return result + except requests.Timeout as e: + logger.warning( + "Timeout getting RNA ID %s (timeout=%ds): %s", + rna_id, + self.api_timeout, + e, + ) + return None except requests.RequestException as e: logger.error("Network error getting RNA ID %s: %s", rna_id, e) return None @@ -164,6 +178,12 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]: logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) return None + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((requests.Timeout, requests.RequestException)), + reraise=False, + ) def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. @@ -178,7 +198,9 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: try: url = f"{self.base_url}/rna" params = {"search": keyword, "format": "json"} - resp = requests.get(url, params=params, headers=self.headers, timeout=30) + resp = requests.get( + url, params=params, headers=self.headers, timeout=self.api_timeout + ) resp.raise_for_status() data = resp.json() @@ -206,76 +228,146 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: return None def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """Perform local BLAST search using local BLAST database.""" + """ + Perform local BLAST search using local BLAST database. 
+ Optimized with multi-threading and faster output format. + """ try: - with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: + # Use temporary file for query sequence + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".fa", delete=False + ) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance cmd = [ - "blastn", "-db", self.local_blast_db, "-query", tmp_name, - "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" + "blastn", + "-db", + self.local_blast_db, + "-query", + tmp_name, + "-evalue", + str(threshold), + "-max_target_seqs", + "1", + "-num_threads", + str(self.blast_num_threads), + "-outfmt", + "6 sacc", # Only accession, tab-separated ] - logger.debug("Running local blastn for RNA: %s", " ".join(cmd)) - out = subprocess.check_output(cmd, text=True).strip() + logger.debug( + "Running local blastn for RNA (threads=%d): %s", + self.blast_num_threads, + " ".join(cmd), + ) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL, # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + os.remove(tmp_name) return out.split("\n", maxsplit=1)[0] if out else None except Exception as exc: logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if "tmp_name" in locals(): + os.remove(tmp_name) + except Exception: + pass return None - def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: - """ - Search RNAcentral with an RNA sequence. 
- Tries local BLAST first if enabled, falls back to RNAcentral API. - Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. - :param sequence: RNA sequence (FASTA format or raw sequence). - :param threshold: E-value threshold for BLAST search. - :return: A dictionary containing complete RNA information or None if not found. - """ - def _extract_sequence(sequence: str) -> Optional[str]: - """Extract and normalize RNA sequence from input.""" - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + @staticmethod + def _extract_rna_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + # Accept both U (original RNA) and T + return seq if seq and re.fullmatch(r"[AUCGTN\s]+", seq, re.I) else None + + def _search_with_local_blast(self, seq: str, threshold: float) -> Optional[dict]: + """Search using local BLAST database.""" + accession = self._local_blast(seq, threshold) + if not accession: + logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+ ) + return None + + logger.debug("Local BLAST found accession: %s", accession) + detailed = self.get_by_rna_id(accession) + if detailed: + return detailed + logger.info( + "Local BLAST found accession %s but could not retrieve metadata from API.", + accession, + ) + return None + + def _search_with_api(self, seq: str) -> Optional[dict]: + """Search using RNAcentral API with MD5 hash.""" + logger.debug("Falling back to RNAcentral API.") + md5_hash = self._calculate_md5(seq) + search_url = f"{self.base_url}/rna" + params = {"md5": md5_hash, "format": "json"} + + resp = requests.get( + search_url, params=params, headers=self.headers, timeout=60 + ) + resp.raise_for_status() + + search_results = resp.json() + results = search_results.get("results", []) + + if not results: + logger.info("No exact match found in RNAcentral for sequence") + return None + rna_id = results[0].get("rnacentral_id") + if not rna_id: + logger.error("No RNAcentral ID found in search results.") + return None + + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + # Fallback: use search result data if get_by_rna_id returns None + logger.debug( + "Using search result data for %s (get_by_rna_id returned None)", rna_id + ) + return self._rna_data_to_dict(rna_id, results[0]) + + def get_by_fasta( + self, sequence: str, threshold: float = 0.01 + ) -> Optional[dict]: + """Search RNAcentral with an RNA sequence.""" try: - seq = _extract_sequence(sequence) + seq = self._extract_rna_sequence(sequence) if not seq: logger.error("Empty or invalid RNA sequence provided.") return None - # Try local BLAST first if enabled if self.use_local_blast: - accession = self._local_blast(seq, threshold) - if accession: - logger.debug("Local BLAST found accession: %s", accession) - return self.get_by_rna_id(accession) - - # Fall back to RNAcentral API if local BLAST didn't find result - logger.debug("Falling back to RNAcentral API.") - - md5_hash = self._calculate_md5(seq) - search_url = 
f"{self.base_url}/rna" - params = {"md5": md5_hash, "format": "json"} - - resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) - resp.raise_for_status() - - search_results = resp.json() - results = search_results.get("results", []) - - if not results: - logger.info("No exact match found in RNAcentral for sequence") - return None - rna_id = results[0].get("rnacentral_id") - if not rna_id: - logger.error("No RNAcentral ID found in search results.") - return None - return self.get_by_rna_id(rna_id) + return self._search_with_local_blast(seq, threshold) + return self._search_with_api(seq) except Exception as e: logger.error("Sequence search failed: %s", e) return None @@ -283,11 +375,12 @@ def _extract_sequence(sequence: str) -> Optional[str]: @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), - retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + retry=retry_if_exception_type((requests.Timeout, requests.RequestException)), reraise=True, ) - async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]: + def search(self, query: str, threshold: float = None, **kwargs) -> Optional[Dict]: """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence.""" + threshold = threshold or self.threshold if not query or not isinstance(query, str): logger.error("Empty or non-string input.") return None @@ -295,19 +388,20 @@ async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional query = query.strip() logger.debug("RNAcentral search query: %s", query) - loop = asyncio.get_running_loop() - - # check if RNA sequence (AUCG characters, contains U) - if query.startswith(">") or ( - re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() - ): - result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) + # check if RNA sequence (AUCG or ATCG characters, contains U or T) + # Note: Sequences with T are 
also RNA sequences + is_rna_sequence = query.startswith(">") or ( + re.fullmatch(r"[AUCGTN\s]+", query, re.I) + and ("U" in query.upper() or "T" in query.upper()) + ) + if is_rna_sequence: + result = self.get_by_fasta(query, threshold) # check if RNAcentral ID (typically starts with URS) elif re.fullmatch(r"URS\d+", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_rna_id, query) + result = self.get_by_rna_id(query) else: # otherwise treat as keyword - result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + result = self.get_best_hit(query) if result: result["_search_query"] = query diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py index f5542f8c..012addeb 100644 --- a/graphgen/models/searcher/db/uniprot_searcher.py +++ b/graphgen/models/searcher/db/uniprot_searcher.py @@ -1,10 +1,7 @@ -import asyncio import os import re import subprocess import tempfile -from concurrent.futures import ThreadPoolExecutor -from functools import lru_cache from io import StringIO from typing import Dict, Optional @@ -22,15 +19,6 @@ from graphgen.utils import logger -@lru_cache(maxsize=None) -def _get_pool(): - return ThreadPoolExecutor(max_workers=10) - - -# ensure only one BLAST searcher at a time -_blast_lock = asyncio.Lock() - - class UniProtSearch(BaseSearcher): """ UniProt Search client to searcher with UniProt. @@ -39,10 +27,18 @@ class UniProtSearch(BaseSearcher): 3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async. 
""" - def __init__(self, use_local_blast: bool = False, local_blast_db: str = "sp_db"): - super().__init__() + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "sp_db", + blast_num_threads: int = 4, + threshold: float = 0.01, + ): self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + self.threshold = threshold + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"): logger.error("Local BLAST database files not found. Please check the path.") self.use_local_blast = False @@ -61,7 +57,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: @staticmethod def _swissprot_to_dict(record: SwissProt.Record) -> dict: - """error + """ Convert a SwissProt.Record to a dictionary. """ functions = [] @@ -104,75 +100,88 @@ def get_best_hit(self, keyword: str) -> Optional[Dict]: logger.error("Keyword %s not found: %s", keyword, e) return None - def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: + + def _parse_fasta_sequence(self, fasta_sequence: str) -> Optional[str]: """ - Search UniProt with a FASTA sequence and return the best hit. + Parse and extract sequence from FASTA format. :param fasta_sequence: The FASTA sequence. - :param threshold: E-value threshold for BLAST searcher. - :return: A dictionary containing the best hit information or None if not found. + :return: Extracted sequence string or None if invalid. 
""" try: if fasta_sequence.startswith(">"): seq = str(list(SeqIO.parse(StringIO(fasta_sequence), "fasta"))[0].seq) else: seq = fasta_sequence.strip() + return seq if seq else None except Exception as e: # pylint: disable=broad-except logger.error("Invalid FASTA sequence: %s", e) return None - if not seq: - logger.error("Empty FASTA sequence provided.") + def _search_with_local_blast(self, seq: str, threshold: float) -> Optional[Dict]: + """Search using local BLAST database.""" + accession = self._local_blast(seq, threshold) + if not accession: + logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." + ) return None + logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) - accession = None - if self.use_local_blast: - accession = self._local_blast(seq, threshold) - if accession: - logger.debug("Local BLAST found accession: %s", accession) + def _search_with_network_blast(self, seq: str, threshold: float) -> Optional[Dict]: + """Search using network BLAST (NCBIWWW).""" + logger.debug("Falling back to NCBIWWW.qblast.") + try: + logger.debug("Performing BLAST searcher for the given sequence: %s", seq) + result_handle = NCBIWWW.qblast( + program="blastp", + database="swissprot", + sequence=seq, + hitlist_size=1, + expect=threshold, + ) + blast_record = NCBIXML.read(result_handle) + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + logger.error("BLAST searcher failed: %s", e) + return None - if not accession: - logger.debug("Falling back to NCBIWWW.qblast.") + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None - # UniProtKB/Swiss-Prot BLAST API - try: - logger.debug( - "Performing BLAST searcher for the given sequence: %s", seq - ) - result_handle = NCBIWWW.qblast( - program="blastp", - database="swissprot", - sequence=seq, - hitlist_size=1, - expect=threshold, - ) - 
blast_record = NCBIXML.read(result_handle) - except RequestException: - raise - except Exception as e: # pylint: disable=broad-except - logger.error("BLAST searcher failed: %s", e) - return None + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None - if not blast_record.alignments: - logger.info("No BLAST hits found for the given sequence.") - return None + # like sp|P01308.1|INS_HUMAN + hit_id = best_alignment.hit_id + accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id + return self.get_by_accession(accession) - best_alignment = blast_record.alignments[0] - best_hsp = best_alignment.hsps[0] - if best_hsp.expect > threshold: - logger.info("No BLAST hits below the threshold E-value.") - return None - hit_id = best_alignment.hit_id + def get_by_fasta( + self, fasta_sequence: str, threshold: float + ) -> Optional[Dict]: + """Search UniProt with a FASTA sequence and return the best hit.""" + seq = self._parse_fasta_sequence(fasta_sequence) + if not seq: + logger.error("Empty FASTA sequence provided.") + return None - # like sp|P01308.1|INS_HUMAN - accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id - return self.get_by_accession(accession) + search_method = ( + self._search_with_local_blast if self.use_local_blast + else self._search_with_network_blast + ) + return search_method(seq, threshold) def _local_blast(self, seq: str, threshold: float) -> Optional[str]: """ Perform local BLAST search using local BLAST database. - :param seq: The protein sequence. - :param threshold: E-value threshold for BLAST searcher. - :return: The accession number of the best hit or None if not found. + Optimized with multi-threading and faster output format. 
""" try: with tempfile.NamedTemporaryFile( @@ -181,6 +190,11 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance cmd = [ "blastp", "-db", @@ -191,11 +205,30 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: str(threshold), "-max_target_seqs", "1", + "-num_threads", + str(self.blast_num_threads), "-outfmt", - "6 sacc", # only return accession + "6 sacc", # Only accession, tab-separated ] - logger.debug("Running local blastp: %s", " ".join(cmd)) - out = subprocess.check_output(cmd, text=True).strip() + logger.debug( + "Running local blastp (threads=%d): %s", + self.blast_num_threads, + " ".join(cmd), + ) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL, # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + os.remove(tmp_name) if out: return out.split("\n", maxsplit=1)[0] @@ -210,16 +243,14 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: retry=retry_if_exception_type(RequestException), reraise=True, ) - async def search( - self, query: str, threshold: float = 0.7, **kwargs - ) -> Optional[Dict]: + def search(self, query: str, threshold: float = None, **kwargs) -> Optional[Dict]: """ Search UniProt with either an accession number, keyword, or FASTA sequence. :param query: The searcher query (accession number, keyword, or FASTA sequence). :param threshold: E-value threshold for BLAST searcher. 
:return: A dictionary containing the best hit information or None if not found. """ - + threshold = threshold or self.threshold # auto detect query type if not query or not isinstance(query, str): logger.error("Empty or non-string input.") @@ -228,26 +259,21 @@ async def search( logger.debug("UniProt searcher query: %s", query) - loop = asyncio.get_running_loop() - # check if fasta sequence if query.startswith(">") or re.fullmatch( r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I ): - async with _blast_lock: - result = await loop.run_in_executor( - _get_pool(), self.get_by_fasta, query, threshold - ) + result = self.get_by_fasta(query, threshold) # check if accession number - elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I): - result = await loop.run_in_executor( - _get_pool(), self.get_by_accession, query - ) + # UniProt accession IDs: 6-10 characters, must start with a letter + # Format: [A-Z][A-Z0-9]{5,9} (6-10 chars total: 1 letter + 5-9 alphanumeric) + elif re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", query, re.I): + result = self.get_by_accession(query) else: # otherwise treat as keyword - result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + result = self.get_best_hit(query) if result: result["_search_query"] = query diff --git a/graphgen/models/searcher/web/bing_search.py b/graphgen/models/searcher/web/bing_search.py index 77ae2110..36ac08bb 100644 --- a/graphgen/models/searcher/web/bing_search.py +++ b/graphgen/models/searcher/web/bing_search.py @@ -1,3 +1,9 @@ +""" +To use Bing Web Search API, +follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) +and obtain your Bing subscription key. 
+""" + import requests from fastapi import HTTPException diff --git a/graphgen/models/searcher/web/google_search.py b/graphgen/models/searcher/web/google_search.py index 0d598f3a..683b4191 100644 --- a/graphgen/models/searcher/web/google_search.py +++ b/graphgen/models/searcher/web/google_search.py @@ -1,3 +1,9 @@ +""" +To use Google Web Search API, +follow the instructions [here](https://developers.google.com/custom-search/v1/overview) +to get your Google searcher api key. +""" + import requests from fastapi import HTTPException diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index 64c78af5..5bb1261a 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -6,7 +6,7 @@ from .partition import PartitionService from .quiz import QuizService from .read import read -from .search import search_all +from .search import SearchService operators = { "read": read, @@ -15,7 +15,7 @@ "quiz": QuizService, "judge": JudgeService, "extract": ExtractService, - "search": search_all, + "search": SearchService, "partition": PartitionService, "generate": GenerateService, } diff --git a/graphgen/operators/search/__init__.py b/graphgen/operators/search/__init__.py index 3d90f12a..47144c77 100644 --- a/graphgen/operators/search/__init__.py +++ b/graphgen/operators/search/__init__.py @@ -1 +1 @@ -from .search_all import search_all +from .search_service import SearchService diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py deleted file mode 100644 index 6017cfee..00000000 --- a/graphgen/operators/search/search_all.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -To use Google Web Search API, -follow the instructions [here](https://developers.google.com/custom-search/v1/overview) -to get your Google searcher api key. - -To use Bing Web Search API, -follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) -and obtain your Bing subscription key. 
-""" - - -from graphgen.utils import logger, run_concurrent - - -async def search_all( - seed_data: dict, - search_config: dict, -) -> dict: - """ - Perform searches across multiple search types and aggregate the results. - :param seed_data: A dictionary containing seed data with entity names. - :param search_config: A dictionary specifying which data sources to use for searching. - :return: A dictionary with - """ - - results = {} - data_sources = search_config.get("data_sources", []) - - for data_source in data_sources: - data = list(seed_data.values()) - data = [d["content"] for d in data if "content" in d] - data = list(set(data)) # Remove duplicates - - if data_source == "uniprot": - from graphgen.models import UniProtSearch - - uniprot_search_client = UniProtSearch( - **search_config.get("uniprot_params", {}) - ) - - uniprot_results = await run_concurrent( - uniprot_search_client.search, - data, - desc="Searching UniProt database", - unit="keyword", - ) - results[data_source] = uniprot_results - - elif data_source == "ncbi": - from graphgen.models import NCBISearch - - ncbi_search_client = NCBISearch( - **search_config.get("ncbi_params", {}) - ) - - ncbi_results = await run_concurrent( - ncbi_search_client.search, - data, - desc="Searching NCBI database", - unit="keyword", - ) - results[data_source] = ncbi_results - - elif data_source == "rnacentral": - from graphgen.models import RNACentralSearch - - rnacentral_search_client = RNACentralSearch( - **search_config.get("rnacentral_params", {}) - ) - - rnacentral_results = await run_concurrent( - rnacentral_search_client.search, - data, - desc="Searching RNAcentral database", - unit="keyword", - ) - results[data_source] = rnacentral_results - - else: - logger.error("Data source %s not supported.", data_source) - continue - - return results diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py new file mode 100644 index 00000000..85c6e967 --- /dev/null +++ 
"""
Search operator: looks up seed documents in external biological databases
(UniProt, NCBI, RNAcentral) and caches the hits in a key-value store.
"""

from functools import partial
from typing import Any, Optional

import pandas as pd

from graphgen.bases import BaseOperator
from graphgen.common import init_storage
from graphgen.utils import compute_content_hash, logger, run_concurrent


class SearchService(BaseOperator):
    """
    Service class for performing searches across multiple data sources.
    Provides search functionality for UniProt, NCBI, and RNAcentral databases.

    Results are stored in the "search" namespace of the configured key-value
    backend, keyed by a hash of ``data_source + query``, so a query that was
    already answered for a given data source is served from storage.
    """

    def __init__(
        self,
        working_dir: str = "cache",
        kv_backend: str = "rocksdb",
        data_sources: Optional[list] = None,
        **kwargs,
    ):
        """
        :param working_dir: Directory used by the operator and its storage.
        :param kv_backend: Storage backend name passed to ``init_storage``.
        :param data_sources: Names of the data sources to query
            ("uniprot", "ncbi", "rnacentral"). Defaults to no sources.
        :param kwargs: Per-source constructor arguments, read from the keys
            ``uniprot_params``, ``ncbi_params`` and ``rnacentral_params``.
        """
        super().__init__(working_dir=working_dir, op_name="search_service")
        self.working_dir = working_dir
        self.data_sources = data_sources or []
        self.kwargs = kwargs
        self.search_storage = init_storage(
            backend=kv_backend, working_dir=working_dir, namespace="search"
        )
        self.searchers = {}

    @staticmethod
    def _is_valid_query(query: Any) -> bool:
        """Return True only for non-blank strings.

        Rows coming from ``DataFrame.to_dict(orient="records")`` carry every
        column, so a missing ``content`` shows up as NaN/None rather than as
        an absent key; those values must not reach string concatenation.
        """
        return isinstance(query, str) and bool(query.strip())

    @staticmethod
    def _make_doc_id(data_source: str, query: str) -> str:
        """Build the storage key for a (data source, query) pair."""
        return compute_content_hash(str(data_source) + query, "doc-")

    def _init_searchers(self):
        """
        Initialize all searchers (deferred import to avoid circular imports).

        Already-initialized sources are left untouched, so this is safe to
        call once per batch. Unknown source names are logged and skipped.
        """
        for datasource in self.data_sources:
            if datasource in self.searchers:
                continue
            if datasource == "uniprot":
                from graphgen.models import UniProtSearch

                # `or {}` guards against a config that sets the key to null.
                params = self.kwargs.get("uniprot_params") or {}
                self.searchers[datasource] = UniProtSearch(**params)
            elif datasource == "ncbi":
                from graphgen.models import NCBISearch

                params = self.kwargs.get("ncbi_params") or {}
                self.searchers[datasource] = NCBISearch(**params)
            elif datasource == "rnacentral":
                from graphgen.models import RNACentralSearch

                params = self.kwargs.get("rnacentral_params") or {}
                self.searchers[datasource] = RNACentralSearch(**params)
            else:
                logger.error("Unknown data source: %s, skipping", datasource)

    @staticmethod
    async def _perform_search(
        seed: dict, searcher_obj, data_source: str
    ) -> Optional[dict]:
        """
        Perform search for a single seed using the specified searcher.

        :param seed: The seed document with 'content' field
        :param searcher_obj: The searcher instance
        :param data_source: The data source name
        :return: Search result with metadata, or None when the seed has no
            usable query or the searcher returned nothing.
        """
        query = seed.get("content", "")

        if not SearchService._is_valid_query(query):
            logger.warning("Empty query for seed: %s", seed)
            return None

        # NOTE(review): `search` is invoked synchronously inside this
        # coroutine — confirm the searchers' `search` is a plain function.
        result = searcher_obj.search(query)
        if not result:
            # Normalize every falsy result (None, {}) to None so callers can
            # rely on "not None" meaning "has metadata fields".
            return None

        result["_doc_id"] = SearchService._make_doc_id(data_source, query)
        result["data_source"] = data_source
        result["type"] = seed.get("type", "text")
        return result

    def _process_single_source(
        self, data_source: str, seed_data: list[dict]
    ) -> list[dict]:
        """
        Process a single data source: check cache, search missing, update cache.

        :param data_source: Name of an initialized data source.
        :param seed_data: Seed documents; entries without a usable string
            'content' are ignored, and repeated queries are searched once.
        :return: Cached results followed by newly fetched results.
        """
        searcher = self.searchers[data_source]

        # Keep the first seed for each distinct query; storage is keyed by
        # doc id, so repeating a query would only repeat the remote call.
        seeds_with_ids = []
        seen_doc_ids = set()
        for seed in seed_data:
            query = seed.get("content", "")
            if not self._is_valid_query(query):
                continue
            doc_id = self._make_doc_id(data_source, query)
            if doc_id in seen_doc_ids:
                continue
            seen_doc_ids.add(doc_id)
            seeds_with_ids.append((doc_id, seed))

        if not seeds_with_ids:
            return []

        doc_ids = [doc_id for doc_id, _ in seeds_with_ids]
        # NOTE(review): assumes get_by_ids returns one entry per requested id,
        # in request order, with None for misses — confirm against the backend.
        cached_results = self.search_storage.get_by_ids(doc_ids)

        to_search_seeds = []
        final_results = []

        for (doc_id, seed), cached in zip(seeds_with_ids, cached_results):
            if cached is not None:
                if "_doc_id" not in cached:
                    cached["_doc_id"] = doc_id
                final_results.append(cached)
            else:
                to_search_seeds.append(seed)

        if to_search_seeds:
            # NOTE(review): run_concurrent is used without `await` — assumes
            # it drives the coroutines itself and returns a list; confirm.
            new_results = run_concurrent(
                partial(
                    self._perform_search, searcher_obj=searcher, data_source=data_source
                ),
                to_search_seeds,
                desc=f"Searching {data_source} database",
                unit="keyword",
            )
            new_results = [res for res in new_results if res is not None]

            if new_results:
                upsert_data = {res["_doc_id"]: res for res in new_results}
                self.search_storage.upsert(upsert_data)
                logger.info(
                    "Saved %d new results to %s cache", len(upsert_data), data_source
                )

            final_results.extend(new_results)

        return final_results

    def process(self, batch: pd.DataFrame) -> pd.DataFrame:
        """
        Search every configured data source for each seed in the batch.

        :param batch: Rows with a 'content' column holding the query and an
            optional 'type' column.
        :return: One row per search hit across all data sources; an empty
            DataFrame when the batch has no usable seeds or nothing was found.
        """
        docs = batch.to_dict(orient="records")

        self._init_searchers()

        # Records always carry the 'content' key when the column exists, so
        # check the value itself (NaN/None/blank rows are not seeds).
        seed_data = [
            doc for doc in docs if doc and self._is_valid_query(doc.get("content"))
        ]

        if not seed_data:
            logger.warning("No valid seeds in batch")
            return pd.DataFrame([])

        all_results = []

        for data_source in self.data_sources:
            if data_source not in self.searchers:
                logger.error("Data source %s not initialized, skipping", data_source)
                continue

            source_results = self._process_single_source(data_source, seed_data)
            all_results.extend(source_results)

        if not all_results:
            logger.warning("No search results generated for this batch")

        return pd.DataFrame(all_results)