From 45f03fc43754cb7ec64d6786dc5b832f5f7cd78a Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Fri, 13 Dec 2024 12:08:07 -0800 Subject: [PATCH] Podcasts and Threads (#493) **Threads** * Experimental support for Thread Index * Define a Thread by Time Range and Description * Thread descriptions are indexed and matched during search processing to find time ranges * Time range used to filter matches * E.g. For multiple episodes of a podcast in single index **Relevance Improvements** **Bug fixes** **Unit tests** --- ts/examples/chat/src/memory/chatMemory.ts | 4 + .../chat/src/memory/chatMemoryPrinter.ts | 22 +++- ts/examples/chat/src/memory/common.ts | 4 +- ts/examples/chat/src/memory/emailMemory.ts | 6 +- ts/examples/chat/src/memory/podcastMemory.ts | 84 +++++++++++++- .../src/conversation/conversation.ts | 21 ++++ .../src/conversation/entities.ts | 23 ++-- .../src/conversation/index.ts | 1 + .../src/conversation/knowledgeActions.ts | 19 ++- .../src/conversation/searchProcessor.ts | 30 +++++ .../src/conversation/threads.ts | 108 ++++++++++++++++++ .../src/conversation/topics.ts | 4 +- .../src/conversation/transcript.ts | 92 +++++++++++++++ .../knowledgeProcessor/src/email/email.ts | 73 +++++++----- .../knowledgeProcessor/src/temporal.ts | 34 +++++- .../test/transcript.spec.ts | 24 ++++ 16 files changed, 485 insertions(+), 64 deletions(-) create mode 100644 ts/packages/knowledgeProcessor/src/conversation/threads.ts create mode 100644 ts/packages/knowledgeProcessor/test/transcript.spec.ts diff --git a/ts/examples/chat/src/memory/chatMemory.ts b/ts/examples/chat/src/memory/chatMemory.ts index 96b1c5b44..fe0d71e17 100644 --- a/ts/examples/chat/src/memory/chatMemory.ts +++ b/ts/examples/chat/src/memory/chatMemory.ts @@ -1158,6 +1158,7 @@ export async function runChatMemory(): Promise { def.options.skipEntities = argBool("Skip entity matching", false); def.options.skipActions = argBool("Skip action matching", false); def.options.skipTopics = argBool("Skip topics matching", false); + def.options.threads = argBool("Use most likely thread", false); return def; } commands.search.metadata = searchDef(); @@ -1333,6 +1334,9 @@ export async function runChatMemory(): Promise { if (namedArgs.fallback) { searchOptions.fallbackSearch = { maxMatches: 10 }; } + if (namedArgs.threads) { + searchOptions.threadSearch = { maxMatches: 1, minScore: 0.8 }; + } if (!namedArgs.eval) { // just translate user query into structured query without eval const translationContext = await context.searcher.buildContext( diff --git a/ts/examples/chat/src/memory/chatMemoryPrinter.ts b/ts/examples/chat/src/memory/chatMemoryPrinter.ts index 739f5b29f..3b268e403 100644 --- a/ts/examples/chat/src/memory/chatMemoryPrinter.ts +++ b/ts/examples/chat/src/memory/chatMemoryPrinter.ts @@ -297,17 +297,29 @@ export class ChatMemoryPrinter extends ChatPrinter { } } + public writeSearchQuestion( + result: + | conversation.SearchTermsActionResponse + | conversation.SearchTermsActionResponseV2 + | undefined, + debug: boolean = false, + ) { + if (result) { + const question = getSearchQuestion(result); + if (question) { + this.writeInColor(chalk.cyanBright, `Question: ${question}`); + this.writeLine(); + } + } + } + public writeSearchTermsResult( result: | conversation.SearchTermsActionResponse | conversation.SearchTermsActionResponseV2, debug: boolean = false, ) { - const question = getSearchQuestion(result); - if (question) { - this.writeInColor(chalk.cyanBright, `Question: ${question}`); - this.writeLine(); - } + this.writeSearchQuestion(result); if (result.response && result.response.answer) { this.writeResultStats(result.response); if (result.response.answer.answer) { diff --git a/ts/examples/chat/src/memory/common.ts b/ts/examples/chat/src/memory/common.ts index 45f9896d5..8b65d455d 100644 --- a/ts/examples/chat/src/memory/common.ts +++ b/ts/examples/chat/src/memory/common.ts @@ -135,10 +135,10 @@ export function argClean(defaultValue = false): ArgDef { }; } -export function argPause(): ArgDef { +export function argPause(defaultValue = 0): ArgDef { return { type: "number", - defaultValue: 0, + defaultValue, description: "Pause for given milliseconds after each iteration", }; } diff --git a/ts/examples/chat/src/memory/emailMemory.ts b/ts/examples/chat/src/memory/emailMemory.ts index 01c679083..0a9cbc10b 100644 --- a/ts/examples/chat/src/memory/emailMemory.ts +++ b/ts/examples/chat/src/memory/emailMemory.ts @@ -43,7 +43,7 @@ import { import chalk from "chalk"; import { convertMsgFiles } from "./importer.js"; import fs from "fs"; -import { Result, success } from "typechat"; +import { error, Result, success } from "typechat"; export async function createEmailMemory( models: Models, @@ -341,7 +341,11 @@ export function createEmailCommands( options, previousUserInputs, ); + if (!searchResults) { + return error("No search results"); + } context.printer.writeLine(); + context.printer.writeSearchQuestion(searchResults); context.printer.writeResultStats(searchResults?.response); context.printer.writeLine(); return success(searchResults); diff --git a/ts/examples/chat/src/memory/podcastMemory.ts b/ts/examples/chat/src/memory/podcastMemory.ts index 0a990c013..d6f76f0b4 100644 --- a/ts/examples/chat/src/memory/podcastMemory.ts +++ b/ts/examples/chat/src/memory/podcastMemory.ts @@ -25,6 +25,7 @@ import { } from "./common.js"; import path from "path"; import { + asyncArray, createWorkQueueFolder, ensureDir, getFileName, @@ -74,7 +75,10 @@ export function createPodcastCommands( ): void { commands.importPodcast = importPodcast; commands.podcastConvert = podcastConvert; - commands.prodcastIndex = podcastIndex; + commands.podcastIndex = podcastIndex; + commands.podcastAddThread = podcastAddThread; + commands.podcastListThreads = podcastListThreads; + //----------- // COMMANDS //--------- @@ -83,16 +87,19 @@ export function createPodcastCommands( description: "Import a podcast transcript.", args: { sourcePath: argSourceFileOrFolder(), + name: arg("Thread name"), + description: arg("Thread description"), }, options: { startAt: arg("Start date and time"), length: argNum("Length of the podcast in minutes", 60), clean: argClean(), maxTurns: argNum("Max turns"), - pauseMs: argPause(), + pauseMs: argPause(1000), }, }; } + commands.importPodcast.metadata = importPodcastDef(); async function importPodcast(args: string[]): Promise { const namedArgs = parseNamedArguments(args, importPodcastDef()); let sourcePath: string = namedArgs.sourcePath; @@ -102,6 +109,7 @@ export function createPodcastCommands( } await podcastConvert(namedArgs); + await podcastAddThread(namedArgs); const turnsFilePath = getTurnsFolderPath(sourcePath); namedArgs.sourcePath = turnsFilePath; await podcastIndex(namedArgs); @@ -139,11 +147,11 @@ export function createPodcastCommands( options: { clean: argClean(), maxTurns: argNum("Max turns"), - pauseMs: argPause(), + pauseMs: argPause(1000), }, }; } - commands.importPodcast.metadata = podcastIndexDef(); + commands.podcastIndex.metadata = podcastIndexDef(); async function podcastIndex(args: string[] | NamedArgs) { const namedArgs = parseNamedArguments(args, podcastIndexDef()); let sourcePath: string = namedArgs.sourcePath; @@ -159,6 +167,66 @@ export function createPodcastCommands( context.printer.writeError(`${sourcePath} is not a directory`); } } + + function podcastAddThreadDef(): CommandMetadata { + return { + description: "Add a sub-thread to the podcast index", + args: { + sourcePath: argSourceFileOrFolder(), + name: arg("Thread name"), + description: arg("Thread description"), + }, + options: { + startAt: arg("Start date and time"), + length: argNum("Length of the podcast in minutes", 60), + }, + }; + } + commands.podcastAddThread.metadata = podcastAddThreadDef(); + async function podcastAddThread(args: string[] | NamedArgs): Promise { + const namedArgs = parseNamedArguments(args, podcastConvertDef()); + const sourcePath = namedArgs.sourcePath; + const timeRange = conversation.parseTranscriptDuration( + namedArgs.startAt, + namedArgs.length, + ); + if (!timeRange) { + context.printer.writeError("Time range required"); + return; + } + const turns = + await conversation.loadTurnsFromTranscriptFile(sourcePath); + const metadata: conversation.TranscriptMetadata = { + sourcePath, + name: namedArgs.name, + description: namedArgs.description, + startAt: namedArgs.startAt, + lengthMinutes: namedArgs.length, + }; + const overview = conversation.createTranscriptOverview(metadata, turns); + const threadDef: conversation.ThreadTimeRange = { + type: "temporal", + description: overview, + timeRange, + }; + const threads = + await context.podcastMemory.conversation.getThreadIndex(); + await threads.add(threadDef); + writeThread(threadDef); + } + commands.podcastListThreads.metadata = "List all registered threads"; + async function podcastListThreads(args: string[]) { + const threads = + await context.podcastMemory.conversation.getThreadIndex(); + const allThreads: conversation.ConversationThread[] = + await asyncArray.toArray(threads.entries()); + for (let i = 0; i < allThreads.length; ++i) { + const t = allThreads[i]; + context.printer.writeLine(`[${i}]`); + writeThread(t); + } + } + return; //--- @@ -249,4 +317,12 @@ export function createPodcastCommands( `${context.podcastMemory.conversationName}_stats.json`, ); } + + function writeThread(t: conversation.ConversationThread) { + context.printer.writeLine(t.description); + const range = conversation.toDateRange(t.timeRange); + context.printer.writeLine(range.startDate.toISOString()); + context.printer.writeLine(range.stopDate!.toISOString()); + context.printer.writeLine(); + } } diff --git a/ts/packages/knowledgeProcessor/src/conversation/conversation.ts b/ts/packages/knowledgeProcessor/src/conversation/conversation.ts index 468589182..7372d906f 100644 --- a/ts/packages/knowledgeProcessor/src/conversation/conversation.ts +++ b/ts/packages/knowledgeProcessor/src/conversation/conversation.ts @@ -57,6 +57,7 @@ import { StorageProvider, } from "../storageProvider.js"; import { RecentItems, createRecentItemsWindow } from "../temporal.js"; +import { createThreadIndexOnStorage, ThreadIndex } from "./threads.js"; export interface ConversationSettings { indexSettings: TextIndexSettings; @@ -147,6 +148,7 @@ export interface Conversation< * Returns the index of */ getActionIndex(): Promise>; + getThreadIndex(): Promise>; /** * * @param removeMessages If you want the original messages also removed. Set to false if you just want to rebuild the indexes @@ -279,6 +281,8 @@ export async function createConversation( let entityIndex: EntityIndex | undefined; const actionPath = path.join(rootPath, "actions"); let actionIndex: ActionIndex | undefined; + const threadsPath = path.join(rootPath, "threads"); + let threadIndex: ThreadIndex | undefined; const thisConversation: Conversation = { settings, @@ -288,6 +292,7 @@ export async function createConversation( getEntityIndex, getTopicsIndex, getActionIndex, + getThreadIndex, clear, addMessage, addKnowledgeForMessage, @@ -344,6 +349,22 @@ export async function createConversation( return actionIndex; } + async function getThreadIndex(): Promise { + if (!threadIndex) { + // Using file provider until stable + const provider = createFileSystemStorageProvider( + rootPath, + folderSettings, + fSys, + ); + threadIndex = await createThreadIndexOnStorage( + threadsPath, + provider, + ); + } + return threadIndex; + } + async function getTopicsIndex(level?: number): Promise { const name = topicsName(level); let topicIndex = topics.get(name); diff --git a/ts/packages/knowledgeProcessor/src/conversation/entities.ts b/ts/packages/knowledgeProcessor/src/conversation/entities.ts index f6c154636..de98c4119 100644 --- a/ts/packages/knowledgeProcessor/src/conversation/entities.ts +++ b/ts/packages/knowledgeProcessor/src/conversation/entities.ts @@ -380,12 +380,12 @@ export async function createEntityIndexOnStorage( terms = terms.filter((t) => !noiseTerms.has(t)); if (terms && terms.length > 0) { - const hitCounter = createHitTable(); + const entityIdHitTable = createHitTable(); const scoreBoost = terms.length; await Promise.all([ nameIndex.getNearestHitsMultiple( terms, - hitCounter, + entityIdHitTable, options.nameSearchOptions?.maxMatches ?? options.maxMatches, options.nameSearchOptions?.minScore ?? options.minScore, scoreBoost, @@ -393,23 +393,26 @@ export async function createEntityIndexOnStorage( ), typeIndex.getNearestHitsMultiple( terms, - hitCounter, + entityIdHitTable, options.maxMatches, options.minScore, scoreBoost, ), facetIndex.getNearestHitsMultiple( terms, - hitCounter, + entityIdHitTable, options.facetSearchOptions?.maxMatches, options.facetSearchOptions?.minScore ?? options.minScore, ), ]); - let entityHits = hitCounter.getTopK(determineTopK(options)).sort(); + entityIdHitTable.roundScores(2); + let entityIdHits = entityIdHitTable + .getTopK(determineTopK(options)) + .sort(); results.entityIds = [ ...intersectMultiple( - entityHits, + entityIdHits, itemsFromTemporalSequence(results.temporalSequence), ), ]; @@ -472,13 +475,7 @@ export async function createEntityIndexOnStorage( } function determineTopK(options: EntitySearchOptions): number { - const topK = - options.topK ?? - Math.max( - options.maxMatches, - options.nameSearchOptions?.maxMatches ?? 0, - //options.facetSearchOptions?.maxMatches ?? 0, - ); + const topK = options.topK; return topK === undefined || topK < 3 ? 3 : topK; } } diff --git a/ts/packages/knowledgeProcessor/src/conversation/index.ts b/ts/packages/knowledgeProcessor/src/conversation/index.ts index 3faf73131..57ffb2aee 100644 --- a/ts/packages/knowledgeProcessor/src/conversation/index.ts +++ b/ts/packages/knowledgeProcessor/src/conversation/index.ts @@ -18,6 +18,7 @@ export * from "./topics.js"; export * from "./topicSchema.js"; export * from "./transcript.js"; export * from "./actions.js"; +export * from "./threads.js"; export * from "./searchResponse.js"; export * from "./searchProcessor.js"; diff --git a/ts/packages/knowledgeProcessor/src/conversation/knowledgeActions.ts b/ts/packages/knowledgeProcessor/src/conversation/knowledgeActions.ts index 703cdc4c2..c06943c55 100644 --- a/ts/packages/knowledgeProcessor/src/conversation/knowledgeActions.ts +++ b/ts/packages/knowledgeProcessor/src/conversation/knowledgeActions.ts @@ -11,7 +11,7 @@ import { import { createTypeScriptJsonValidator } from "typechat/ts"; import { SearchAction } from "./knowledgeSearchSchema.js"; import { dateTime, loadSchema } from "typeagent"; -import { DateTime, DateTimeRange } from "./dateTimeSchema.js"; +import { DateTime, DateTimeRange, DateVal, TimeVal } from "./dateTimeSchema.js"; import { SearchTermsAction } from "./knowledgeTermSearchSchema.js"; import { SearchTermsActionV2 } from "./knowledgeTermSearchSchema2.js"; @@ -196,3 +196,20 @@ export function dateTimeToDate(dateTime: DateTime): Date { } return dt; } + +export function dateToDateTime(dt: Date): DateTime { + const date: DateVal = { + day: dt.getDate(), + month: dt.getMonth() + 1, + year: dt.getFullYear(), + }; + const time: TimeVal = { + hour: dt.getHours(), + minute: dt.getMinutes(), + seconds: dt.getSeconds(), + }; + return { + date, + time, + }; +} diff --git a/ts/packages/knowledgeProcessor/src/conversation/searchProcessor.ts b/ts/packages/knowledgeProcessor/src/conversation/searchProcessor.ts index 4c10bd59c..67de873e5 100644 --- a/ts/packages/knowledgeProcessor/src/conversation/searchProcessor.ts +++ b/ts/packages/knowledgeProcessor/src/conversation/searchProcessor.ts @@ -55,6 +55,7 @@ export type SearchProcessingOptions = { maxMessages: number; entitySearch?: EntitySearchOptions | undefined; fallbackSearch?: SearchOptions | undefined; + threadSearch?: SearchOptions | undefined; skipAnswerGeneration?: boolean; skipEntitySearch?: boolean; skipTopicSearch?: boolean; @@ -373,6 +374,13 @@ export function createSearchProcessor( options, false, ); + if (options.threadSearch) { + await applyThreadFilters( + action.parameters.question, + action.parameters.filters, + options.threadSearch, + ); + } const response = await conversation.searchTermsV2( action.parameters.filters, searchOptions, @@ -615,6 +623,28 @@ export function createSearchProcessor( } } + async function applyThreadFilters( + query: string, + filters: TermFilterV2[], + options: SearchOptions, + ) { + const threadIndex = await conversation.getThreadIndex(); + const threads = await threadIndex.getNearest( + query, + options.maxMatches, + options.minScore, + ); + if (threads.length === 0) { + return; + } + const thread = threads[0]; + for (const filter of filters) { + if (!filter.timeRange) { + filter.timeRange = thread.timeRange; + } + } + } + function createSearchOptions( topLevelTopicSummary: boolean, options: SearchProcessingOptions, diff --git a/ts/packages/knowledgeProcessor/src/conversation/threads.ts b/ts/packages/knowledgeProcessor/src/conversation/threads.ts new file mode 100644 index 000000000..6491cb52f --- /dev/null +++ b/ts/packages/knowledgeProcessor/src/conversation/threads.ts @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { asyncArray } from "typeagent"; +import { StorageProvider } from "../storageProvider.js"; +import { removeUndefined } from "../setOperations.js"; +import { DateTimeRange } from "./dateTimeSchema.js"; + +export interface ThreadDefinition { + description: string; + type: string; +} + +export interface ThreadTimeRange extends ThreadDefinition { + type: "temporal"; + timeRange: DateTimeRange; +} + +export type ConversationThread = ThreadTimeRange; + +export interface ThreadIndex { + entries(): AsyncIterableIterator; + add(threadDef: ConversationThread): Promise; + getIds(description: string): Promise; + getById(id: TId): Promise; + get(description: string): Promise; + getNearest( + description: string, + maxMatches: number, + minScore?: number, + ): Promise; +} + +export async function createThreadIndexOnStorage( + rootPath: string, + storageProvider: StorageProvider, +): Promise> { + type EntryId = string; + const threadStore = + await storageProvider.createObjectFolder( + rootPath, + "entries", + ); + const textIndex = await storageProvider.createTextIndex( + { caseSensitive: false, semanticIndex: true, concurrency: 1 }, + rootPath, + "description", + "TEXT", + ); + + return { + entries: () => threadStore.allObjects(), + add, + getById, + getIds, + get, + getNearest, + }; + + async function add(threadDef: ConversationThread): Promise { + const entryId = await threadStore.put(threadDef); + await textIndex.put(threadDef.description, [entryId]); + return entryId; + } + + function getById(id: EntryId): Promise { + return threadStore.get(id); + } + + function getIds(description: string): Promise { + return textIndex.get(description); + } + + async function get( + description: string, + ): Promise { + const entryIds = await textIndex.get(description); + if (entryIds && entryIds.length > 0) { + return getByIds(entryIds); + } + return undefined; + } + + async function getNearest( + description: string, + maxMatches: number, + minScore?: number, + ): Promise { + const entryIds = await textIndex.getNearest( + description, + maxMatches, + minScore, + ); + if (entryIds && entryIds.length > 0) { + return getByIds(entryIds); + } + return []; + } + + async function getByIds( + entryIds: EntryId[], + ): Promise { + const threads = await asyncArray.mapAsync(entryIds, 1, (id) => + threadStore.get(id), + ); + return removeUndefined(threads); + } +} diff --git a/ts/packages/knowledgeProcessor/src/conversation/topics.ts b/ts/packages/knowledgeProcessor/src/conversation/topics.ts index 3ff4555c2..2ea0713b1 100644 --- a/ts/packages/knowledgeProcessor/src/conversation/topics.ts +++ b/ts/packages/knowledgeProcessor/src/conversation/topics.ts @@ -321,6 +321,7 @@ export interface TopicSearchOptions extends SearchOptions { sourceNameSearchOptions?: SearchOptions; loadTopics?: boolean | undefined; useHighLevel?: boolean | undefined; + filterBySourceName?: boolean | undefined; } export function createTopicSearchOptions( @@ -710,7 +711,6 @@ export async function createTopicIndexOnStorage< ): Promise> { // We will just use the standard topic stuff for now, since that does the same thing const allTerms = getAllTermsInFilter(filter); - const useSourceName = false; let sourceName = getSubjectFromActionTerm(filter.action); if (!isValidEntityName(sourceName)) { sourceName = undefined; @@ -725,7 +725,7 @@ export async function createTopicIndexOnStorage< return search( topicFilter, options, - useSourceName ? sourceName : undefined, + options.filterBySourceName ? sourceName : undefined, //topics !== "*" ? getAllTermsInFilter(filter, false) : undefined, undefined, possibleIds, diff --git a/ts/packages/knowledgeProcessor/src/conversation/transcript.ts b/ts/packages/knowledgeProcessor/src/conversation/transcript.ts index 7047bc213..0cbbac391 100644 --- a/ts/packages/knowledgeProcessor/src/conversation/transcript.ts +++ b/ts/packages/knowledgeProcessor/src/conversation/transcript.ts @@ -8,6 +8,9 @@ import { ConversationManager, ConversationMessage, } from "./conversationManager.js"; +import { Action, KnowledgeResponse } from "./knowledgeSchema.js"; +import { DateTimeRange } from "./dateTimeSchema.js"; +import { dateToDateTime } from "./knowledgeActions.js"; /** * A turn in a transcript @@ -19,6 +22,11 @@ export type TranscriptTurn = { timestamp?: string | undefined; }; +/** + * Converts a turn from a transcript into a conversation message + * @param turn + * @returns + */ export function transcriptTurnToMessage( turn: TranscriptTurn, ): ConversationMessage { @@ -26,6 +34,40 @@ export function transcriptTurnToMessage( sender: getSpeaker(turn), text: getMessageText(turn, true), timestamp: dateTime.stringToDate(turn.timestamp), + knowledge: transcriptTurnToKnowledge(turn), + }; +} + +enum TurnVerbs { + say = "say", +} + +function transcriptTurnToKnowledge(turn: TranscriptTurn): KnowledgeResponse { + return { + entities: [], + actions: transcriptTurnToActions(turn), + inverseActions: [], + topics: [], + }; +} + +function transcriptTurnToActions(turn: TranscriptTurn): Action[] { + const actions: Action[] = []; + if (turn.speaker && turn.listeners) { + for (const listener of turn.listeners) { + actions.push(createAction(TurnVerbs.say, turn.speaker, listener)); + } + } + return actions; +} + +function createAction(verb: string, from: string, to: string): Action { + return { + verbs: [verb], + verbTense: "past", + subjectEntityName: from, + objectEntityName: "none", + indirectObjectEntityName: to, }; } @@ -230,3 +272,53 @@ function turnToHeaderString(turn: TranscriptTurn): string { } return text; } + +export type TranscriptMetadata = { + sourcePath: string; + name: string; + description?: string | undefined; + startAt?: string; // Should be parseable as a Date + lengthMinutes?: number | undefined; +}; + +export function createTranscriptOverview( + metadata: TranscriptMetadata, + turns: TranscriptTurn[], +): string { + let participantSet = new Set(); + for (const turn of turns) { + let speaker = getSpeaker(turn); + if (speaker) { + participantSet.add(speaker); + } + if (turn.listeners && turn.listeners.length > 0) { + for (const listener of turn.listeners) { + participantSet.add(listener); + } + } + } + let overview = metadata.name; + if (metadata.description) { + overview += "\n"; + overview += metadata.description; + } + const participants = [...participantSet.values()]; + if (participants.length > 0) { + overview += "\nParticipants:\n"; + overview += participants.join(", "); + } + return overview; +} + +export function parseTranscriptDuration( + startAt: string, + lengthMinutes: number, +): DateTimeRange { + const startDate = dateTime.stringToDate(startAt)!; + const offsetMs = lengthMinutes * 60 * 1000; + const stopDate = new Date(startDate.getTime() + offsetMs); + return { + startDate: dateToDateTime(startDate), + stopDate: dateToDateTime(stopDate), + }; +} diff --git a/ts/packages/knowledgeProcessor/src/email/email.ts b/ts/packages/knowledgeProcessor/src/email/email.ts index 4123c4382..a459481dc 100644 --- a/ts/packages/knowledgeProcessor/src/email/email.ts +++ b/ts/packages/knowledgeProcessor/src/email/email.ts @@ -28,6 +28,49 @@ import { KnownEntityTypes } from "../conversation/knowledge.js"; import { StorageProvider } from "../storageProvider.js"; import { createEntitySearchOptions } from "../conversation/entities.js"; +/** + * Convert an email to a conversation message + * Includes an knowledge that can be automatically extracted from the message + * @param email + * @returns + */ +export function emailToMessage(email: Email): ConversationMessage { + const sender = email.from.displayName; + return { + header: emailHeadersToString(email), + text: emailToTextBlock(email, false), + knowledge: emailToKnowledge(email), + timestamp: dateTime.stringToDate(email.sentOn), + sender, + }; +} + +/** + * Convert an email to multiple conversation messages. + * Large emails are broken into sub-messages. + * @param email + * @param maxCharsPerChunk + * @returns + */ +export function emailToMessages( + email: Email, + maxCharsPerChunk?: number | undefined, +): ConversationMessage[] { + if (!isValidChunkSize(maxCharsPerChunk)) { + return [emailToMessage(email)]; + } + + const messages: ConversationMessage[] = []; + const text = email.body; + for (const chunk of splitLargeTextIntoChunks(text, maxCharsPerChunk!)) { + const emailChunk: Email = { ...email }; + emailChunk.body = chunk; + messages.push(emailToMessage(emailChunk)); + } + + return messages; +} + export function emailAddressToString(address: EmailAddress): string { if (address.displayName) { return address.address @@ -399,36 +442,6 @@ export async function addEmailFileToConversation( return false; } -export function emailToMessage(email: Email): ConversationMessage { - const sender = email.from.displayName; - return { - header: emailHeadersToString(email), - text: emailToTextBlock(email, false), - knowledge: emailToKnowledge(email), - timestamp: dateTime.stringToDate(email.sentOn), - sender, - }; -} - -export function emailToMessages( - email: Email, - maxCharsPerChunk?: number | undefined, -): ConversationMessage[] { - if (!isValidChunkSize(maxCharsPerChunk)) { - return [emailToMessage(email)]; - } - - const messages: ConversationMessage[] = []; - const text = email.body; - for (const chunk of splitLargeTextIntoChunks(text, maxCharsPerChunk!)) { - const emailChunk: Email = { ...email }; - emailChunk.body = chunk; - messages.push(emailToMessage(emailChunk)); - } - - return messages; -} - function makeHeader(name: string, text: string | undefined): string { if (text) { return `${name}: ${text}\n`; diff --git a/ts/packages/knowledgeProcessor/src/temporal.ts b/ts/packages/knowledgeProcessor/src/temporal.ts index 809e11ba1..9657fdd61 100644 --- a/ts/packages/knowledgeProcessor/src/temporal.ts +++ b/ts/packages/knowledgeProcessor/src/temporal.ts @@ -19,7 +19,11 @@ import { pathToFileURL } from "url"; import path from "path"; import { valueToString } from "./text.js"; +export type TemporalLogSettings = { + concurrency: number; +}; /** + * A mutable log of timestamped items * @template TId the type of the log entry Id * @template T type of object stored in the log */ @@ -49,10 +53,14 @@ export interface TemporalLog { getUrl?: (id: TId) => URL; } -export type TemporalLogSettings = { - concurrency: number; -}; - +/** + * Create a temporal log using files + * @param settings + * @param folderPath + * @param folderSettings + * @param fSys + * @returns + */ export async function createTemporalLog( settings: TemporalLogSettings, folderPath: string, @@ -268,21 +276,35 @@ export function getRangeOfTemporalSequence( stopDate: sequence[sequence.length - 1].timestamp, }; } + +/** + * A window of recent items + */ export interface RecentItems { - readonly entries: collections.CircularArray; + /** + * Returns all recent entries, ordered by most recent first + */ + getEntries(): T[]; push(items: T | T[]): void; getContext(maxContextLength: number): string[]; getUnique(): T[]; reset(): void; } +/** + * Create a 'window' to track the most recent items in a "stream" + * Uses a circular array + * @param windowSize + * @param stringify + * @returns + */ export function createRecentItemsWindow( windowSize: number, stringify?: (value: T) => string, ): RecentItems { const entries = new collections.CircularArray(windowSize); return { - entries, + getEntries: () => entries.getEntries(), push, getContext, getUnique, diff --git a/ts/packages/knowledgeProcessor/test/transcript.spec.ts b/ts/packages/knowledgeProcessor/test/transcript.spec.ts new file mode 100644 index 000000000..9b96bb2ad --- /dev/null +++ b/ts/packages/knowledgeProcessor/test/transcript.spec.ts @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { parseTranscriptDuration } from "../src/conversation/transcript.js"; + +describe("Transcripts", () => { + test("duration", () => { + const dt = parseTranscriptDuration("February 2022", 60); + + const startDate = dt.startDate; + expect(startDate.date.day).toEqual(1); + expect(startDate.date.month).toEqual(2); + expect(startDate.date.year).toEqual(2022); + expect(startDate.time).toBeDefined(); + expect(startDate.time!.hour).toEqual(0); + + const stopDate = dt.stopDate; + expect(stopDate).toBeDefined(); + expect(stopDate!.date.day).toEqual(1); + expect(stopDate!.date.month).toEqual(2); + expect(stopDate!.date.year).toEqual(2022); + expect(stopDate!.time!.hour).toEqual(1); + }); +});