Skip to content

Commit

Permalink
v3.0.10: fix embedding token length bug
Browse files Browse the repository at this point in the history
- An assert was erroneously comparing token length to byte count

Break query into chunks, create embeddings, find similar chunks

summary of diff --git  a/v3/core/chunk.go b/v3/core/chunk.go

- Break the query into chunks using `chunksFromString` instead of directly creating embeddings from the query
- Append each chunk's text to `queryStrings` array for embedding
- Create embeddings from the `queryStrings` array
- Return early if the `embeddings` array is empty
- Calculate the average of the embeddings using `util.MeanVector`
- Use the averaged embedding to find the most similar chunks

Add comments to improve Cli, reference gitea/tea, consider urfave

summary of diff --git  a/v3/core/cli.go b/v3/core/cli.go

- Add comments to the Cli function noting potential improvements by referencing gitea/tea's approach and considering the use of urfave over kong

Import envi, add token count checks, enhance debugging

summary of diff --git  a/v3/core/document.go b/v3/core/document.go

- Import `github.com/stevegt/envi` to use environment variables
- Add checks that verify each chunk's text length by token count, both for every chunk before it is set and again for the list of new chunks
- Assert that token count is below `g.embeddingTokenLimit` for both existing chunks and new chunks to prevent exceeding limits
- Use `envi.Bool` so the debug checks run only when the `DEBUG` environment variable is enabled
- Enhance debugging by ensuring chunk token counts do not exceed defined limits

Update grokker.go version from 3.0.9 to 3.0.10

summary of diff --git  a/v3/core/grokker.go b/v3/core/grokker.go

- Update version from 3.0.9 to 3.0.10 in grokker.go

Remove comments and enable debug logging in createEmbeddings

summary of diff --git  a/v3/core/openai.go b/v3/core/openai.go

- Remove the `XXX don't exceed max tokens` marker comment from the `createEmbeddings` function
- Enable debug logging for creating embeddings for each text chunk in `createEmbeddings` function

Move go-diff to own block; add envi v0.2.0 to require block

summary of diff --git  a/v3/go.mod b/v3/go.mod

- Move `github.com/sergi/go-diff v1.3.1` into its own require block
- Add `github.com/stevegt/envi v0.2.0` to the require block

Add envi v0.2.0 and goadapt v0.0.13 module info to go.sum

summary of diff --git  a/v3/go.sum b/v3/go.sum

- Add github.com/stevegt/envi v0.2.0 checksum and module information
- Add github.com/stevegt/goadapt v0.0.13 module information
  • Loading branch information
stevegt committed Feb 16, 2024
1 parent 16462bd commit b1b9347
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 9 deletions.
16 changes: 12 additions & 4 deletions v3/core/chunk.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,13 +261,21 @@ func (g *GrokkerInternal) similarChunks(embedding []float64, tokenLimit int, fil
// findChunks returns the most relevant chunks for a query, limited by tokenLimit.
func (g *GrokkerInternal) findChunks(query string, tokenLimit int, files []string) (chunks []*Chunk, err error) {
defer Return(&err)
// get the embeddings for the query.
embeddings, err := g.createEmbeddings([]string{query})
// break the query into chunks.
queryChunks, err := g.chunksFromString(nil, query, g.embeddingTokenLimit)
Ck(err)
queryEmbedding := embeddings[0]
if queryEmbedding == nil {
// get the embeddings for the chunks.
var queryStrings []string
for _, chunk := range queryChunks {
queryStrings = append(queryStrings, chunk.text)
}
embeddings, err := g.createEmbeddings(queryStrings)
Ck(err)
if len(embeddings) == 0 {
return
}
// average the embeddings.
queryEmbedding := util.MeanVector(embeddings)
// find the most similar chunks.
chunks, err = g.similarChunks(queryEmbedding, tokenLimit, files)
Ck(err)
Expand Down
2 changes: 2 additions & 0 deletions v3/core/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ func cmdInSlice(cmd string, cmds []string) bool {
// We use this function instead of kong.Parse() so that we can pass in
// the arguments to parse. This allows us to more easily test the
// cli subcommands, and could later ease e.g. WASM usage.
//
// XXX note how gitea/tea does this, also uses urfave instead of kong
func Cli(args []string, config *CliConfig) (rc int, err error) {
defer Return(&err)

Expand Down
22 changes: 22 additions & 0 deletions v3/core/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package core
import (
"path/filepath"

"github.com/stevegt/envi"
. "github.com/stevegt/goadapt"
)

Expand Down Expand Up @@ -41,6 +42,15 @@ func (g *GrokkerInternal) updateDocument(doc *Document) (updated bool, err error
// hash, offset, and length. We'll get embeddings later.
var newChunks []*Chunk
for _, chunk := range chunks {
if envi.Bool("DEBUG", false) {
// verify chunk text length
txt, err := g.chunkText(chunk, true, false)
Ck(err)
_, tokens, err := Tokenizer.Encode(txt)
Ck(err)
tc := len(tokens)
Assert(tc < g.embeddingTokenLimit, "chunk tokens %d exceeds limit %d: %v", tc, g.embeddingTokenLimit, chunk)
}
// setChunk unsets the stale bit if the chunk is already in the
// database.
// XXX move the stale bit unset to this loop instead, for readability.
Expand All @@ -53,6 +63,18 @@ func (g *GrokkerInternal) updateDocument(doc *Document) (updated bool, err error
Debug("found %d new chunks", len(newChunks))
// orphaned chunks will be garbage collected.

if envi.Bool("DEBUG", false) {
// verify newChunks text length
for _, chunk := range newChunks {
txt, err := g.chunkText(chunk, true, false)
Ck(err)
_, tokens, err := Tokenizer.Encode(txt)
Ck(err)
tc := len(tokens)
Assert(tc < g.embeddingTokenLimit, "chunk tokens %d exceeds limit %d: %v", tc, g.embeddingTokenLimit, chunk)
}
}

// For each new chunk, generate an embedding using the
// openai.Embedding.create() function. Store the embeddings for each
// chunk in a data structure such as a list or dictionary.
Expand Down
2 changes: 1 addition & 1 deletion v3/core/grokker.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ import (
const (
// See the "Semantic Versioning" section of the README for
// information on API and db stability and versioning.
version = "3.0.9"
version = "3.0.10"
)

type GrokkerInternal struct {
Expand Down
4 changes: 1 addition & 3 deletions v3/core/openai.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ func (g *GrokkerInternal) createEmbeddings(texts []string) (embeddings [][]float
// simply call c.CreateEmbeddings() once for each text chunk.
for i := 0; i < len(texts); i++ {
text := texts[i]
// XXX don't exceed max tokens

// set empty chunk embedding to nil
if len(text) == 0 {
embeddings = append(embeddings, nil)
Expand All @@ -46,7 +44,7 @@ func (g *GrokkerInternal) createEmbeddings(texts []string) (embeddings [][]float
Input: inputs,
Model: fabius_models.AdaEmbeddingV2,
}
// Debug("creating embedding for chunk %d of %d ...", i+1, len(texts))
Debug("creating embedding for chunk %d of %d ...", i+1, len(texts))
// Debug("text: %q", text)
// loop with backoff until we get a response
var res *openai.EmbeddingResponse
Expand Down
5 changes: 4 additions & 1 deletion v3/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ require (
github.com/tiktoken-go/tokenizer v0.1.0
)

require github.com/sergi/go-diff v1.3.1
require (
github.com/sergi/go-diff v1.3.1
github.com/stevegt/envi v0.2.0
)

require (
github.com/dlclark/regexp2 v1.9.0 // indirect
Expand Down
3 changes: 3 additions & 0 deletions v3/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ github.com/sashabaranov/go-openai v1.19.1 h1:lIAtrpgE6Lhc3avbWG7wV4zeRWVi4nymQ7I
github.com/sashabaranov/go-openai v1.19.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
github.com/stevegt/envi v0.2.0 h1:QIaNQz///ueqH/57ldwGQy0yRgaYr/d7JLBQfjg+RXs=
github.com/stevegt/envi v0.2.0/go.mod h1:Z8w7bE5V9Ce3H02CWNTEYkW3zWE8ulgfROtEt6ydDoY=
github.com/stevegt/goadapt v0.0.13/go.mod h1:BWNnTsXdIxaseRo0W/MoVgDeLNf+6L4S4fPhyAsBTi0=
github.com/stevegt/goadapt v0.7.0 h1:brUmaaA4mr3hqQfglDAQh7/MVSWak52mEAOzfbSoMDg=
github.com/stevegt/goadapt v0.7.0/go.mod h1:vquRbAl0Ek4iJHCvFUEDxziTsETR2HOT7r64NolhDKs=
github.com/stevegt/semver v0.0.0-20230512043732-92220054a49f h1:erQJkdWx1bhOImDDPiVoNy+qP8sBoOJ8EsJ7gUiy8S8=
Expand Down

0 comments on commit b1b9347

Please sign in to comment.