-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
initial MVP structured RAG implementation (#569)
This package begins the MVP structured RAG implementation. It reads and writes a single JSON file for the memory. With many contributions from Umesh, who has been team coding with me.
- Loading branch information
Showing
9 changed files
with
693 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) Microsoft Corporation. | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# knowpro | ||
|
||
Working toward a shared understanding of the MVP for structured RAG. | ||
|
||
**Sample code** and early ideas that explore how to: | ||
|
||
- Use TypeChat to extract **_structured knowledge_** from conversations, transcripts and documents. | ||
- Implement a **Structured RAG**. The exploration includes ideas on how to: | ||
- Index the extracted knowledge and source text for retrieval. | ||
- Use TypeChat to generate **structured queries** for structured knowledge. | |
- Use structured objects to **_answer_** questions. | ||
- Use structured indices to improve precision. | ||
|
||
## Trademarks | ||
|
||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft | ||
trademarks or logos is subject to and must follow | ||
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). | ||
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. | ||
Any use of third-party trademarks or logos are subject to those third-party's policies. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
{ | ||
"name": "knowpro", | ||
"version": "0.0.1", | ||
"description": "Basic structured RAG implementation", | ||
"homepage": "https://github.com/microsoft/TypeAgent#readme", | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/microsoft/TypeAgent.git", | ||
"directory": "ts/packages/knowPro" | ||
}, | ||
"license": "MIT", | ||
"author": "Microsoft", | ||
"type": "module", | ||
"exports": { | ||
".": "./dist/index.js" | ||
}, | ||
"types": "./dist/index.d.ts", | ||
"scripts": { | ||
"build": "npm run tsc", | ||
"postbuild": "copyfiles -u 1 \"src/**/*Schema*.ts\" dist", | ||
"clean": "rimraf --glob dist *.tsbuildinfo *.done.build.log", | ||
"prettier": "prettier --check . --ignore-path ../../.prettierignore", | ||
"prettier:fix": "prettier --write . --ignore-path ../../.prettierignore", | ||
"tsc": "tsc -b" | ||
}, | ||
"dependencies": { | ||
"aiclient": "workspace:*", | ||
"debug": "^4.3.4", | ||
"knowledge-processor": "workspace:*", | ||
"typeagent": "workspace:*", | ||
"typechat": "^0.1.1" | ||
}, | ||
"devDependencies": { | ||
"@types/debug": "^4.1.10", | ||
"@types/jest": "^29.5.7", | ||
"@types/node": "^18.18.7", | ||
"copyfiles": "^2.4.1", | ||
"dotenv": "^16.3.1", | ||
"jest": "^29.7.0", | ||
"prettier": "^3.2.5", | ||
"rimraf": "^5.0.5", | ||
"typescript": "^5.4.2" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,224 @@ | ||
// Copyright (c) Microsoft Corporation and Henry Lucco. | ||
// Licensed under the MIT License. | ||
|
||
import { | ||
ITermToSemanticRefIndex, | ||
ITermToSemanticRefIndexData, | ||
ITermToSemanticRefIndexItem, | ||
ScoredSemanticRef, | ||
IConversation, | ||
IKnowledgeSource, | ||
SemanticRef, | ||
ITopic, | ||
TextRange, | ||
TextLocation, | ||
} from "./dataFormat.js"; | ||
import { conversation } from "knowledge-processor"; | ||
import { openai } from "aiclient"; | ||
|
||
/**
 * Registers a facet against a semantic ref.
 *
 * The facet name is always added as a term; the facet value is added as a
 * second term only when it is a string. A missing facet is a no-op.
 *
 * @param facet - The facet to index, or `undefined` to do nothing.
 * @param refIndex - Ordinal of the semantic ref the terms point at.
 * @param semanticRefIndex - Term index that receives the terms.
 */
function addFacet(
    facet: conversation.Facet | undefined,
    refIndex: number,
    semanticRefIndex: ITermToSemanticRefIndex,
) {
    if (facet === undefined) {
        return;
    }
    semanticRefIndex.addTerm(facet.name, refIndex);
    const facetValue = facet.value;
    // Non-string values are not indexed as terms.
    if (typeof facetValue === "string") {
        semanticRefIndex.addTerm(facetValue, refIndex);
    }
}
|
||
/**
 * Packs a message index and a chunk index into a TextLocation.
 *
 * @param messageIndex - Index of the message.
 * @param chunkIndex - Index of the chunk within the message; defaults to 0.
 * @returns An object holding both indices.
 */
function textLocationFromLocation(
    messageIndex: number,
    chunkIndex = 0,
): TextLocation {
    const location: TextLocation = {
        messageIndex: messageIndex,
        chunkIndex: chunkIndex,
    };
    return location;
}
|
||
/**
 * Builds a TextRange that starts at the given message/chunk position.
 * The `end` field is present but explicitly set to `undefined`.
 *
 * @param messageIndex - Index of the message where the range starts.
 * @param chunkIndex - Index of the chunk within the message; defaults to 0.
 */
function textRangeFromLocation(
    messageIndex: number,
    chunkIndex = 0,
): TextRange {
    const start = textLocationFromLocation(messageIndex, chunkIndex);
    const range: TextRange = { start, end: undefined };
    return range;
}
|
||
/**
 * Appends an entity to the semantic ref list and indexes its terms.
 *
 * Terms added, all pointing at the new ref: the entity name, each entry of
 * `entity.type`, and (via `addFacet`) each facet's name and string value.
 *
 * @param entity - The extracted entity.
 * @param semanticRefs - List the new semantic ref is appended to.
 * @param semanticRefIndex - Term index that receives the terms.
 * @param messageIndex - Index of the message the entity came from.
 * @param chunkIndex - Index of the chunk within the message; defaults to 0.
 */
export function addEntityToIndex(
    entity: conversation.ConcreteEntity,
    semanticRefs: SemanticRef[],
    semanticRefIndex: ITermToSemanticRefIndex,
    messageIndex: number,
    chunkIndex = 0,
) {
    const entityRef: SemanticRef = {
        range: textRangeFromLocation(messageIndex, chunkIndex),
        knowledgeType: "entity",
        knowledge: entity,
    };
    // push() returns the new length, so the new ref sits at length - 1.
    const refIndex = semanticRefs.push(entityRef) - 1;

    semanticRefIndex.addTerm(entity.name, refIndex);
    // Each type is its own term.
    for (const entityType of entity.type) {
        semanticRefIndex.addTerm(entityType, refIndex);
    }
    // Facets are optional on an entity.
    if (!entity.facets) {
        return;
    }
    for (const entityFacet of entity.facets) {
        addFacet(entityFacet, refIndex, semanticRefIndex);
    }
}
|
||
/**
 * Appends a topic to the semantic ref list and indexes its text as a term.
 *
 * @param topic - The topic to record.
 * @param semanticRefs - List the new semantic ref is appended to.
 * @param semanticRefIndex - Term index that receives the topic text.
 * @param messageIndex - Index of the message the topic came from.
 * @param chunkIndex - Index of the chunk within the message; defaults to 0.
 */
export function addTopicToIndex(
    topic: ITopic,
    semanticRefs: SemanticRef[],
    semanticRefIndex: ITermToSemanticRefIndex,
    messageIndex: number,
    chunkIndex = 0,
) {
    const topicRef: SemanticRef = {
        range: textRangeFromLocation(messageIndex, chunkIndex),
        knowledgeType: "topic",
        knowledge: topic,
    };
    // push() returns the new length, so the new ref sits at length - 1.
    const refIndex = semanticRefs.push(topicRef) - 1;
    semanticRefIndex.addTerm(topic.text, refIndex);
}
|
||
/**
 * Appends an action to the semantic ref list and indexes its terms.
 *
 * Terms added, all pointing at the new ref:
 * - the verbs joined with a single space;
 * - the subject, object and indirect-object entity names, skipping any that
 *   equal the literal "none";
 * - each param: the string itself, or the param's name plus its value when
 *   the value is a string;
 * - the subject entity facet (via `addFacet`).
 *
 * @param action - The extracted action.
 * @param semanticRefs - List the new semantic ref is appended to.
 * @param semanticRefIndex - Term index that receives the terms.
 * @param messageIndex - Index of the message the action came from.
 * @param chunkIndex - Index of the chunk within the message; defaults to 0.
 */
export function addActionToIndex(
    action: conversation.Action,
    semanticRefs: SemanticRef[],
    semanticRefIndex: ITermToSemanticRefIndex,
    messageIndex: number,
    chunkIndex = 0,
) {
    const actionRef: SemanticRef = {
        range: textRangeFromLocation(messageIndex, chunkIndex),
        knowledgeType: "action",
        knowledge: action,
    };
    // push() returns the new length, so the new ref sits at length - 1.
    const refIndex = semanticRefs.push(actionRef) - 1;

    semanticRefIndex.addTerm(action.verbs.join(" "), refIndex);

    // "none" is the placeholder for an absent participant; it is not indexed.
    const participantNames = [
        action.subjectEntityName,
        action.objectEntityName,
        action.indirectObjectEntityName,
    ];
    for (const participantName of participantNames) {
        if (participantName !== "none") {
            semanticRefIndex.addTerm(participantName, refIndex);
        }
    }

    if (action.params) {
        for (const actionParam of action.params) {
            if (typeof actionParam === "string") {
                semanticRefIndex.addTerm(actionParam, refIndex);
                continue;
            }
            semanticRefIndex.addTerm(actionParam.name, refIndex);
            if (typeof actionParam.value === "string") {
                semanticRefIndex.addTerm(actionParam.value, refIndex);
            }
        }
    }

    addFacet(action.subjectEntityFacet, refIndex, semanticRefIndex);
}
|
||
/**
 * Builds a fresh term index for a conversation by running knowledge
 * extraction over every message.
 *
 * A new ConversationIndex is installed on `convo.semanticRefIndex`, and
 * `convo.semanticRefs` is created if it is undefined. For each message only
 * the first text chunk is sent to the extractor. Extracted entities, actions,
 * inverse actions and topics are appended to `convo.semanticRefs` and indexed.
 * If extraction fails for a message, the error is logged to the console and
 * that message is skipped.
 *
 * @param convo - Conversation to index; mutated in place.
 */
export async function buildConversationIndex<TMeta extends IKnowledgeSource>(
    convo: IConversation<TMeta>,
) {
    const termIndex = new ConversationIndex();
    convo.semanticRefIndex = termIndex;
    if (convo.semanticRefs === undefined) {
        convo.semanticRefs = [];
    }
    const refs = convo.semanticRefs;

    const settings = openai.apiSettingsFromEnv(
        openai.ModelType.Chat,
        undefined,
        "GPT_4_O",
    );
    settings.retryPauseMs = 10000;
    const extractor = conversation.createKnowledgeExtractor(
        openai.createJsonChatModel(settings, ["chatExtractor"]),
        {
            maxContextLength: 4096,
            mergeActionKnowledge: false,
        },
    );

    for (
        let messageIndex = 0;
        messageIndex < convo.messages.length;
        ++messageIndex
    ) {
        const message = convo.messages[messageIndex];
        // only one chunk per message for now
        const firstChunk = message.textChunks[0];
        const knowledge = await extractor.extract(firstChunk).catch((err) => {
            console.log(`Error extracting knowledge: ${err}`);
            return undefined;
        });
        if (!knowledge) {
            continue;
        }
        for (const entity of knowledge.entities) {
            addEntityToIndex(entity, refs, termIndex, messageIndex);
        }
        for (const action of knowledge.actions) {
            addActionToIndex(action, refs, termIndex, messageIndex);
        }
        for (const inverseAction of knowledge.inverseActions) {
            addActionToIndex(inverseAction, refs, termIndex, messageIndex);
        }
        for (const topicText of knowledge.topics) {
            const topicObj: ITopic = { text: topicText };
            addTopicToIndex(topicObj, refs, termIndex, messageIndex);
        }
    }
}
|
||
/**
 * In-memory term index: maps a term to the scored semantic refs recorded
 * for it. Terms are matched by exact string equality.
 */
export class ConversationIndex implements ITermToSemanticRefIndex {
    /** Term -> scored semantic refs, in insertion order. */
    map: Map<string, ScoredSemanticRef[]> = new Map<
        string,
        ScoredSemanticRef[]
    >();

    /**
     * Records a semantic ref under a term.
     *
     * @param term - The term to index under.
     * @param semanticRefResult - Either a ready-made scored ref, or a bare
     * semantic ref ordinal, which is wrapped with a score of 1.
     */
    addTerm(term: string, semanticRefResult: number | ScoredSemanticRef): void {
        const scoredRef: ScoredSemanticRef =
            typeof semanticRefResult === "number"
                ? { semanticRefIndex: semanticRefResult, score: 1 }
                : semanticRefResult;
        const existing = this.map.get(term);
        if (existing !== undefined) {
            existing.push(scoredRef);
        } else {
            this.map.set(term, [scoredRef]);
        }
    }

    /**
     * Returns the scored refs recorded for a term, or an empty array when
     * the term is unknown.
     *
     * @param term - The term to look up (exact match).
     * @param fuzzy - Accepted for interface compatibility; currently ignored.
     */
    lookupTerm(term: string, fuzzy = false): ScoredSemanticRef[] {
        return this.map.get(term) ?? [];
    }

    /**
     * Removes the entries for one semantic ref from a term.
     *
     * Only entries whose `semanticRefIndex` matches are removed; other refs
     * recorded under the same term are kept. When no entries remain, the
     * term itself is dropped from the map. Unknown terms are a no-op.
     *
     * @param term - The term to remove the ref from.
     * @param semanticRefIndex - Ordinal of the semantic ref to remove.
     */
    removeTerm(term: string, semanticRefIndex: number): void {
        const current = this.map.get(term);
        if (current === undefined) {
            return;
        }
        const remaining = current.filter(
            (entry) => entry.semanticRefIndex !== semanticRefIndex,
        );
        if (remaining.length === 0) {
            this.map.delete(term);
        } else {
            this.map.set(term, remaining);
        }
    }

    /**
     * Drops a term from the map if it is present with an empty ref list.
     *
     * @param term - The term to check.
     */
    removeTermIfEmpty(term: string): void {
        if (this.map.get(term)?.length === 0) {
            this.map.delete(term);
        }
    }

    /**
     * Flattens the map into a list of `{ term, semanticRefIndices }` items.
     * The ref arrays in the result are the live arrays held by the map,
     * not copies.
     */
    serialize(): ITermToSemanticRefIndexData {
        const items: ITermToSemanticRefIndexItem[] = [];
        for (const [term, semanticRefIndices] of this.map) {
            items.push({ term, semanticRefIndices });
        }
        return { items };
    }
}
Oops, something went wrong.