-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add plugin for document text splitting
- Loading branch information
1 parent
8bee95b
commit b9d935a
Showing
27 changed files
with
1,254 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
81 changes: 81 additions & 0 deletions
81
...ava/com/microsoft/semantickernel/samples/syntaxexamples/rag/DocumentSplittingExample.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
package com.microsoft.semantickernel.samples.syntaxexamples.rag; | ||
|
||
import com.microsoft.semantic.kernel.rag.splitting.Chunk; | ||
import com.microsoft.semantic.kernel.rag.splitting.Document; | ||
import com.microsoft.semantic.kernel.rag.splitting.Splitter; | ||
import java.io.ByteArrayInputStream; | ||
import java.io.IOException; | ||
import java.net.URI; | ||
import java.net.http.HttpClient; | ||
import java.net.http.HttpRequest; | ||
import java.net.http.HttpResponse; | ||
import java.net.http.HttpResponse.BodyHandlers; | ||
import java.util.List; | ||
import org.apache.pdfbox.io.RandomAccessReadBuffer; | ||
import org.apache.pdfbox.pdfparser.PDFParser; | ||
import org.apache.pdfbox.pdmodel.PDDocument; | ||
import org.apache.pdfbox.text.PDFTextStripper; | ||
import reactor.core.publisher.Flux; | ||
|
||
public class DocumentSplittingExample { | ||
|
||
private static String BENEFITS_DOC = "https://raw.githubusercontent.com/Azure-Samples/azure-search-openai-demo-java/refs/heads/main/data/Benefit_Options.pdf"; | ||
|
||
private static class PDFDocument implements Document { | ||
|
||
private final byte[] pdf; | ||
|
||
private PDFDocument(byte[] pdf) { | ||
this.pdf = pdf; | ||
} | ||
|
||
@Override | ||
public Flux<String> getContent() { | ||
try { | ||
PDFParser parser = new PDFParser( | ||
RandomAccessReadBuffer.createBufferFromStream(new ByteArrayInputStream(pdf))); | ||
PDDocument document = parser.parse(); | ||
String text = new PDFTextStripper().getText(document); | ||
|
||
return Flux.just(text); | ||
} catch (IOException e) { | ||
return Flux.error(e); | ||
} | ||
} | ||
} | ||
|
||
public static void main(String[] args) throws IOException, InterruptedException { | ||
byte[] pdfBytes = getPdfDoc(); | ||
PDFDocument pdfDoc = new PDFDocument(pdfBytes); | ||
|
||
Splitter splitter = Splitter | ||
.builder() | ||
.maxParagraphsPerChunk(4) | ||
.overlapNPercent(30.0f) | ||
.trimWhitespace() | ||
.build(); | ||
|
||
List<Chunk> chunks = splitter | ||
.splitDocument(pdfDoc) | ||
.collectList() | ||
.block(); | ||
|
||
chunks | ||
.forEach(chunk -> { | ||
System.out.println("========="); | ||
System.out.println(chunk.getContents()); | ||
}); | ||
} | ||
|
||
private static byte[] getPdfDoc() throws IOException, InterruptedException { | ||
HttpResponse<byte[]> doc = HttpClient.newHttpClient() | ||
.send(HttpRequest.newBuilder() | ||
.GET() | ||
.uri(URI.create(BENEFITS_DOC)) | ||
.build(), | ||
BodyHandlers.ofByteArray()); | ||
return doc.body(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
65 changes: 65 additions & 0 deletions
65
samples/semantickernel-sample-plugins/semantickernel-text-splitter-plugin/pom.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the text-splitter sample plugin. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>com.microsoft.semantic-kernel</groupId>
        <artifactId>semantickernel-sample-plugins</artifactId>
        <version>1.3.1-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

    <artifactId>semantickernel-text-splitter-plugin</artifactId>
    <name>semantickernel-text-splitter-plugin</name>
    <packaging>jar</packaging>

    <!-- Imports the Semantic Kernel BOM at this module's own version. The dependencies below
         declare no versions; presumably they are managed by this BOM or by the parent POM. -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>com.microsoft.semantic-kernel</groupId>
                <artifactId>semantickernel-bom</artifactId>
                <version>${project.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <dependencies>
        <dependency>
            <groupId>com.microsoft.semantic-kernel</groupId>
            <artifactId>semantickernel-api</artifactId>
        </dependency>

        <!-- Log4j logging artifacts, runtime scope only. -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j2-impl</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- Jackson, compile scope. -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <scope>compile</scope>
        </dependency>
        <!-- JUnit Jupiter API, test scope only. -->
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
16 changes: 16 additions & 0 deletions
16
...text-splitter-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/Chunk.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
package com.microsoft.semantic.kernel.rag.splitting; | ||
|
||
/**
 * A piece of text produced by splitting a document.
 */
public class Chunk {

    private final String chunk;

    /**
     * Creates a chunk wrapping the given text.
     *
     * @param chunk the text of this chunk
     */
    public Chunk(String chunk) {
        this.chunk = chunk;
    }

    /**
     * Returns the text this chunk was created with.
     *
     * @return the chunk text, exactly as passed to the constructor
     */
    public String getContents() {
        return chunk;
    }

}
22 changes: 22 additions & 0 deletions
22
...r-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/ChunkEndCondition.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
package com.microsoft.semantic.kernel.rag.splitting; | ||
|
||
/**
 * Defines the condition that should be met for a chunk to be considered full.
 */
public interface ChunkEndCondition {

    /**
     * Accepts a string and returns the number of characters that should be considered as the end
     * of the FIRST chunk within the string. This method will be subsequently called until all
     * chunks are found.
     * <p>
     * Return -1 if the value does not contain enough characters to be considered as a full chunk.
     *
     * @param value the value to be checked
     * @return the index of the character that should be considered as the end of the first chunk in
     *         the string, or -1 if the value does not yet hold a full chunk
     */
    int getEndOfNextChunk(String value);

}
9 changes: 9 additions & 0 deletions
9
...-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/ChunkPostProcessor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
package com.microsoft.semantic.kernel.rag.splitting; | ||
|
||
/** | ||
* A post processor that processes a chunk after it has been split. | ||
*/ | ||
public interface ChunkPostProcessor { | ||
Chunk process(Chunk chunk); | ||
} |
11 changes: 11 additions & 0 deletions
11
...t-splitter-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/Document.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
package com.microsoft.semantic.kernel.rag.splitting; | ||
|
||
import reactor.core.publisher.Flux; | ||
|
||
/** | ||
* A document to be read and split into chunks. | ||
*/ | ||
public interface Document { | ||
Flux<String> getContent(); | ||
} |
19 changes: 19 additions & 0 deletions
19
...er-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/OverlapCondition.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
package com.microsoft.semantic.kernel.rag.splitting; | ||
|
||
/**
 * Defines how much overlap is allowed between two chunks.
 */
public interface OverlapCondition {

    /**
     * Returns the index of the first character that should be considered as the beginning of the
     * overlap.
     *
     * @param chunk the chunk to be checked
     * @return the index of the first character that should be considered as the beginning of the
     *         overlap
     */
    int getOverlapIndex(String chunk);

}
Oops, something went wrong.