Skip to content

Commit

Permalink
initial MVP structured RAG implementation (#569)
Browse files Browse the repository at this point in the history
this package begins the MVP structured RAG implementation. It reads and
writes a single json file for the memory. With many contributions from
Umesh, who has been team coding with me
  • Loading branch information
steveluc authored Jan 17, 2025
1 parent c4d7013 commit 11a600f
Show file tree
Hide file tree
Showing 9 changed files with 693 additions and 0 deletions.
21 changes: 21 additions & 0 deletions ts/packages/knowPro/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Microsoft Corporation.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
20 changes: 20 additions & 0 deletions ts/packages/knowPro/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# knowpro

Working toward a shared understanding of the MVP for structured RAG.

**Sample code** and early ideas that explore how to:

- Use TypeChat to extract **_structured knowledge_** from conversations, transcripts and documents.
- Implement a **Structured RAG**. The exploration includes ideas on how to:
- Index the extracted knowledge and source text for retrieval.
- Use TypeChat to generate **structured queries** for structured knowledge.
- Use structured objects to **_answer_** questions.
- Use structured indices to improve precision.

## Trademarks

This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos are subject to those third-party's policies.
44 changes: 44 additions & 0 deletions ts/packages/knowPro/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"name": "knowpro",
"version": "0.0.1",
"description": "Basic structured RAG implementation",
"homepage": "https://github.com/microsoft/TypeAgent#readme",
"repository": {
"type": "git",
"url": "https://github.com/microsoft/TypeAgent.git",
"directory": "ts/packages/knowPro"
},
"license": "MIT",
"author": "Microsoft",
"type": "module",
"exports": {
".": "./dist/index.js"
},
"types": "./dist/index.d.ts",
"scripts": {
"build": "npm run tsc",
"postbuild": "copyfiles -u 1 \"src/**/*Schema*.ts\" dist",
"clean": "rimraf --glob dist *.tsbuildinfo *.done.build.log",
"prettier": "prettier --check . --ignore-path ../../.prettierignore",
"prettier:fix": "prettier --write . --ignore-path ../../.prettierignore",
"tsc": "tsc -b"
},
"dependencies": {
"aiclient": "workspace:*",
"debug": "^4.3.4",
"knowledge-processor": "workspace:*",
"typeagent": "workspace:*",
"typechat": "^0.1.1"
},
"devDependencies": {
"@types/debug": "^4.1.10",
"@types/jest": "^29.5.7",
"@types/node": "^18.18.7",
"copyfiles": "^2.4.1",
"dotenv": "^16.3.1",
"jest": "^29.7.0",
"prettier": "^3.2.5",
"rimraf": "^5.0.5",
"typescript": "^5.4.2"
}
}
224 changes: 224 additions & 0 deletions ts/packages/knowPro/src/conversationIndex.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
// Copyright (c) Microsoft Corporation and Henry Lucco.
// Licensed under the MIT License.

import {
ITermToSemanticRefIndex,
ITermToSemanticRefIndexData,
ITermToSemanticRefIndexItem,
ScoredSemanticRef,
IConversation,
IKnowledgeSource,
SemanticRef,
ITopic,
TextRange,
TextLocation,
} from "./dataFormat.js";
import { conversation } from "knowledge-processor";
import { openai } from "aiclient";

/**
 * Registers a facet's searchable terms against a semantic ref.
 *
 * The facet name is always indexed. The facet value is indexed only when it
 * is a string; values of any other type are skipped. Passing `undefined`
 * for the facet is a no-op.
 *
 * @param facet - The facet to index, or `undefined` when there is none.
 * @param refIndex - Ordinal of the semantic ref the terms should point at.
 * @param semanticRefIndex - Term index that receives the new entries.
 */
function addFacet(
    facet: conversation.Facet | undefined,
    refIndex: number,
    semanticRefIndex: ITermToSemanticRefIndex,
) {
    if (facet === undefined) {
        return;
    }
    semanticRefIndex.addTerm(facet.name, refIndex);
    const facetValue = facet.value;
    if (typeof facetValue === "string") {
        semanticRefIndex.addTerm(facetValue, refIndex);
    }
}

/**
 * Builds a `TextLocation` from a message ordinal and a chunk ordinal.
 *
 * @param messageIndex - Ordinal of the message.
 * @param chunkIndex - Ordinal of the chunk within the message; defaults to 0.
 * @returns An object carrying exactly `messageIndex` and `chunkIndex`.
 */
function textLocationFromLocation(
    messageIndex: number,
    chunkIndex = 0,
): TextLocation {
    const location: TextLocation = {
        messageIndex: messageIndex,
        chunkIndex: chunkIndex,
    };
    return location;
}

/**
 * Builds a `TextRange` that begins at the given message/chunk position.
 *
 * The range's `end` is explicitly set to `undefined`; only `start` is
 * populated.
 *
 * @param messageIndex - Ordinal of the message where the range starts.
 * @param chunkIndex - Ordinal of the chunk within the message; defaults to 0.
 * @returns A range with `start` set and `end` undefined.
 */
function textRangeFromLocation(
    messageIndex: number,
    chunkIndex = 0,
): TextRange {
    const start = textLocationFromLocation(messageIndex, chunkIndex);
    const range: TextRange = { start, end: undefined };
    return range;
}

/**
 * Appends an entity to `semanticRefs` and indexes its searchable terms.
 *
 * Terms registered for the new semantic ref, in order:
 *  1. the entity name,
 *  2. every entry of `entity.type`, each as its own term,
 *  3. for every facet, its name and (when it is a string) its value.
 *
 * @param entity - The extracted entity to record.
 * @param semanticRefs - Array the new semantic ref is appended to (mutated).
 * @param semanticRefIndex - Term index that receives the new entries (mutated).
 * @param messageIndex - Ordinal of the message the entity came from.
 * @param chunkIndex - Ordinal of the chunk within the message; defaults to 0.
 */
export function addEntityToIndex(
    entity: conversation.ConcreteEntity,
    semanticRefs: SemanticRef[],
    semanticRefIndex: ITermToSemanticRefIndex,
    messageIndex: number,
    chunkIndex = 0,
) {
    // push() returns the new length, so the entry just added sits at length - 1.
    const refIndex =
        semanticRefs.push({
            range: textRangeFromLocation(messageIndex, chunkIndex),
            knowledgeType: "entity",
            knowledge: entity,
        }) - 1;

    semanticRefIndex.addTerm(entity.name, refIndex);

    // Each type label becomes an independent term.
    for (const typeName of entity.type) {
        semanticRefIndex.addTerm(typeName, refIndex);
    }

    if (!entity.facets) {
        return;
    }
    // Facets contribute their name and, when it is a string, their value.
    for (const entityFacet of entity.facets) {
        addFacet(entityFacet, refIndex, semanticRefIndex);
    }
}

/**
 * Appends a topic to `semanticRefs` and indexes its text as a single term.
 *
 * @param topic - The topic to record; `topic.text` is the indexed term.
 * @param semanticRefs - Array the new semantic ref is appended to (mutated).
 * @param semanticRefIndex - Term index that receives the new entry (mutated).
 * @param messageIndex - Ordinal of the message the topic came from.
 * @param chunkIndex - Ordinal of the chunk within the message; defaults to 0.
 */
export function addTopicToIndex(
    topic: ITopic,
    semanticRefs: SemanticRef[],
    semanticRefIndex: ITermToSemanticRefIndex,
    messageIndex: number,
    chunkIndex = 0,
) {
    // push() returns the new length, so the entry just added sits at length - 1.
    const refIndex =
        semanticRefs.push({
            range: textRangeFromLocation(messageIndex, chunkIndex),
            knowledgeType: "topic",
            knowledge: topic,
        }) - 1;
    semanticRefIndex.addTerm(topic.text, refIndex);
}

/**
 * Appends an action to `semanticRefs` and indexes its searchable terms.
 *
 * Terms registered for the new semantic ref, in order:
 *  1. all verbs joined with a single space, as one term,
 *  2. the subject, object and indirect-object entity names, skipping any
 *     whose value is the literal string "none",
 *  3. each param: a string param is indexed as-is; an object param
 *     contributes its name and (when it is a string) its value,
 *  4. the subject entity facet's name and (when it is a string) its value.
 *
 * @param action - The extracted action to record.
 * @param semanticRefs - Array the new semantic ref is appended to (mutated).
 * @param semanticRefIndex - Term index that receives the new entries (mutated).
 * @param messageIndex - Ordinal of the message the action came from.
 * @param chunkIndex - Ordinal of the chunk within the message; defaults to 0.
 */
export function addActionToIndex(
    action: conversation.Action,
    semanticRefs: SemanticRef[],
    semanticRefIndex: ITermToSemanticRefIndex,
    messageIndex: number,
    chunkIndex = 0,
) {
    // push() returns the new length, so the entry just added sits at length - 1.
    const refIndex =
        semanticRefs.push({
            range: textRangeFromLocation(messageIndex, chunkIndex),
            knowledgeType: "action",
            knowledge: action,
        }) - 1;

    semanticRefIndex.addTerm(action.verbs.join(" "), refIndex);

    // "none" is the placeholder for an absent participant; never index it.
    const participantNames = [
        action.subjectEntityName,
        action.objectEntityName,
        action.indirectObjectEntityName,
    ];
    for (const participant of participantNames) {
        if (participant !== "none") {
            semanticRefIndex.addTerm(participant, refIndex);
        }
    }

    if (action.params) {
        for (const actionParam of action.params) {
            if (typeof actionParam === "string") {
                semanticRefIndex.addTerm(actionParam, refIndex);
                continue;
            }
            semanticRefIndex.addTerm(actionParam.name, refIndex);
            if (typeof actionParam.value === "string") {
                semanticRefIndex.addTerm(actionParam.value, refIndex);
            }
        }
    }

    addFacet(action.subjectEntityFacet, refIndex, semanticRefIndex);
}

/**
 * Extracts knowledge from every message of a conversation and records it in
 * the conversation's semantic refs and term index.
 *
 * Effects on `convo`:
 *  - `convo.semanticRefIndex` is replaced with a fresh `ConversationIndex`.
 *  - `convo.semanticRefs` is created if it is `undefined`; otherwise new
 *    entries are appended to the existing array.
 *
 * Messages are processed one at a time, in order, and only the first text
 * chunk of each message is sent to the extractor. If extraction rejects for
 * a message, the error is written with `console.log` and that message
 * contributes nothing to the index; processing continues with the next one.
 *
 * @param convo - The conversation to index (mutated in place).
 */
export async function buildConversationIndex<TMeta extends IKnowledgeSource>(
    convo: IConversation<TMeta>,
) {
    const termIndex = new ConversationIndex();
    convo.semanticRefIndex = termIndex;
    if (convo.semanticRefs === undefined) {
        convo.semanticRefs = [];
    }
    const refs = convo.semanticRefs;

    // Chat model settings are read from the environment.
    const modelSettings = openai.apiSettingsFromEnv(
        openai.ModelType.Chat,
        undefined,
        "GPT_4_O",
    );
    modelSettings.retryPauseMs = 10000;
    const jsonModel = openai.createJsonChatModel(modelSettings, [
        "chatExtractor",
    ]);
    const knowledgeExtractor = conversation.createKnowledgeExtractor(
        jsonModel,
        {
            maxContextLength: 4096,
            mergeActionKnowledge: false,
        },
    );

    // A failed extraction is logged and mapped to `undefined` so the loop can
    // move on to the next message.
    const onExtractError = (err: unknown) => {
        console.log(`Error extracting knowledge: ${err}`);
        return undefined;
    };

    for (
        let messageIndex = 0;
        messageIndex < convo.messages.length;
        messageIndex++
    ) {
        const message = convo.messages[messageIndex];
        // only one chunk per message for now
        const firstChunk = message.textChunks[0];
        const extracted = await knowledgeExtractor
            .extract(firstChunk)
            .catch(onExtractError);
        if (!extracted) {
            continue;
        }
        for (const entity of extracted.entities) {
            addEntityToIndex(entity, refs, termIndex, messageIndex);
        }
        for (const action of extracted.actions) {
            addActionToIndex(action, refs, termIndex, messageIndex);
        }
        for (const inverseAction of extracted.inverseActions) {
            addActionToIndex(inverseAction, refs, termIndex, messageIndex);
        }
        for (const topicText of extracted.topics) {
            const topicObj: ITopic = { text: topicText };
            addTopicToIndex(topicObj, refs, termIndex, messageIndex);
        }
    }
}

/**
 * In-memory term index: maps each term to the list of scored semantic-ref
 * postings recorded for it.
 *
 * Terms are used as map keys exactly as given (no normalization is applied
 * here), so lookups are exact-match and case-sensitive.
 */
export class ConversationIndex implements ITermToSemanticRefIndex {
    /** Term -> postings, in the order the postings were added. */
    map: Map<string, ScoredSemanticRef[]> = new Map<
        string,
        ScoredSemanticRef[]
    >();

    /**
     * Records a posting for `term`.
     *
     * @param term - The term to index.
     * @param semanticRefResult - Either a semantic-ref ordinal (stored with a
     *   score of 1) or a ready-made scored posting.
     */
    addTerm(term: string, semanticRefResult: number | ScoredSemanticRef): void {
        const posting: ScoredSemanticRef =
            typeof semanticRefResult === "number"
                ? { semanticRefIndex: semanticRefResult, score: 1 }
                : semanticRefResult;
        // Single lookup instead of has() followed by get().
        const postings = this.map.get(term);
        if (postings !== undefined) {
            postings.push(posting);
        } else {
            this.map.set(term, [posting]);
        }
    }

    /**
     * Returns the postings recorded for `term`, or an empty array when the
     * term is unknown.
     *
     * @param term - The term to look up (exact match).
     * @param fuzzy - Accepted for interface compatibility; currently unused.
     */
    lookupTerm(term: string, fuzzy = false): ScoredSemanticRef[] {
        return this.map.get(term) ?? [];
    }

    /**
     * Removes the postings for `term` that point at `semanticRefIndex`.
     *
     * Postings for other semantic refs under the same term are kept. When no
     * postings remain, the term itself is removed from the index. Unknown
     * terms are ignored.
     *
     * @param term - The term whose postings should be pruned.
     * @param semanticRefIndex - Ordinal of the semantic ref to detach.
     */
    removeTerm(term: string, semanticRefIndex: number): void {
        const postings = this.map.get(term);
        if (postings === undefined) {
            return;
        }
        // Build a new array rather than splicing in place so arrays already
        // handed out by lookupTerm() are not mutated underneath their holders.
        const remaining = postings.filter(
            (posting) => posting.semanticRefIndex !== semanticRefIndex,
        );
        if (remaining.length === 0) {
            this.map.delete(term);
        } else {
            this.map.set(term, remaining);
        }
    }

    /**
     * Removes `term` from the index if it is present with an empty postings
     * list; otherwise does nothing.
     *
     * @param term - The term to check.
     */
    removeTermIfEmpty(term: string): void {
        if (this.map.get(term)?.length === 0) {
            this.map.delete(term);
        }
    }

    /**
     * Converts the index to its serializable form: one item per term, in map
     * iteration order. The postings arrays are referenced, not copied.
     */
    serialize(): ITermToSemanticRefIndexData {
        const items: ITermToSemanticRefIndexItem[] = [];
        for (const [term, semanticRefIndices] of this.map) {
            items.push({ term, semanticRefIndices });
        }
        return { items };
    }
}
Loading

0 comments on commit 11a600f

Please sign in to comment.