Skip to content

Commit

Permalink
Add plugin for document text splitting
Browse files Browse the repository at this point in the history
  • Loading branch information
johnoliver committed Oct 4, 2024
1 parent 8bee95b commit b9d935a
Show file tree
Hide file tree
Showing 27 changed files with 1,254 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.microsoft.semantic-kernel</groupId>
Expand Down Expand Up @@ -81,6 +82,16 @@
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-aiservices-google</artifactId>
</dependency>
<dependency>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-text-splitter-plugin</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.3</version>
</dependency>

<dependency>
<groupId>com.google.cloud</groupId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantickernel.samples.syntaxexamples.rag;

import com.microsoft.semantic.kernel.rag.splitting.Chunk;
import com.microsoft.semantic.kernel.rag.splitting.Document;
import com.microsoft.semantic.kernel.rag.splitting.Splitter;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.http.HttpResponse.BodyHandlers;
import java.util.List;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import reactor.core.publisher.Flux;

/**
 * Sample that downloads a PDF, extracts its text with PDFBox, splits the text into chunks with
 * the text splitter plugin and prints every chunk to standard output.
 */
public class DocumentSplittingExample {

    /** Location of the sample PDF that is downloaded and split. */
    private static final String BENEFITS_DOC = "https://raw.githubusercontent.com/Azure-Samples/azure-search-openai-demo-java/refs/heads/main/data/Benefit_Options.pdf";

    /**
     * A {@link Document} backed by the raw bytes of a PDF file.
     */
    private static class PDFDocument implements Document {

        private final byte[] pdf;

        private PDFDocument(byte[] pdf) {
            this.pdf = pdf;
        }

        /**
         * Parses the PDF bytes and emits the extracted text as a single string.
         *
         * @return a Flux emitting the full text of the PDF, or an error Flux carrying the
         *         {@link IOException} raised while parsing or extracting text
         */
        @Override
        public Flux<String> getContent() {
            try {
                PDFParser parser = new PDFParser(
                    RandomAccessReadBuffer.createBufferFromStream(new ByteArrayInputStream(pdf)));
                // PDDocument holds parser resources; close it once the text has been extracted.
                try (PDDocument document = parser.parse()) {
                    return Flux.just(new PDFTextStripper().getText(document));
                }
            } catch (IOException e) {
                return Flux.error(e);
            }
        }
    }

    /**
     * Downloads the sample PDF, splits it into chunks and prints each chunk preceded by a
     * separator line.
     *
     * @param args unused
     * @throws IOException          if the download fails or returns a non-200 status
     * @throws InterruptedException if the download is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        byte[] pdfBytes = getPdfDoc();
        PDFDocument pdfDoc = new PDFDocument(pdfBytes);

        Splitter splitter = Splitter
            .builder()
            .maxParagraphsPerChunk(4)
            .overlapNPercent(30.0f)
            .trimWhitespace()
            .build();

        List<Chunk> chunks = splitter
            .splitDocument(pdfDoc)
            .collectList()
            .block();

        chunks
            .forEach(chunk -> {
                System.out.println("=========");
                System.out.println(chunk.getContents());
            });
    }

    /**
     * Fetches the sample PDF over HTTP.
     *
     * @return the raw bytes of the downloaded PDF
     * @throws IOException          if the request fails or the server does not answer with 200
     * @throws InterruptedException if the request is interrupted
     */
    private static byte[] getPdfDoc() throws IOException, InterruptedException {
        HttpResponse<byte[]> doc = HttpClient.newHttpClient()
            .send(HttpRequest.newBuilder()
                .GET()
                .uri(URI.create(BENEFITS_DOC))
                .build(),
                BodyHandlers.ofByteArray());

        // Fail fast instead of handing an error page body to the PDF parser.
        int status = doc.statusCode();
        if (status != 200) {
            throw new IOException(
                "Failed to download " + BENEFITS_DOC + ": HTTP status " + status);
        }
        return doc.body();
    }

}
4 changes: 3 additions & 1 deletion samples/semantickernel-sample-plugins/pom.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version="1.0" encoding="UTF-8" ?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.microsoft.semantic-kernel</groupId>
Expand All @@ -15,5 +16,6 @@
<modules>
<module>semantickernel-openapi-plugin</module>
<module>semantickernel-presidio-plugin</module>
<module>semantickernel-text-splitter-plugin</module>
</modules>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the text splitter sample plugin module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the sample-plugins POM located one directory up. -->
<parent>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-sample-plugins</artifactId>
<version>1.3.1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

<artifactId>semantickernel-text-splitter-plugin</artifactId>
<name>semantickernel-text-splitter-plugin</name>
<packaging>jar</packaging>

<!-- Imports the Semantic Kernel BOM at this module's own version. Presumably this is what
supplies versions for the version-less dependencies below (the parent may also manage
them); TODO confirm. -->
<dependencyManagement>
<dependencies>
<dependency>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-bom</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>

<dependencies>
<!-- Semantic Kernel API, default (compile) scope. -->
<dependency>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-api</artifactId>
</dependency>

<!-- Log4j 2 artifacts, declared with runtime scope only. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
<scope>runtime</scope>
</dependency>
<!-- Jackson, declared explicitly with compile scope. -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<scope>compile</scope>
</dependency>
<!-- JUnit Jupiter API, test scope only. -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * Immutable holder for the text of a single chunk.
 */
public class Chunk {

    /** The text carried by this chunk, exactly as passed to the constructor. */
    private final String contents;

    /**
     * Creates a chunk wrapping the given text.
     *
     * @param chunk the text of the chunk
     */
    public Chunk(String chunk) {
        contents = chunk;
    }

    /**
     * Returns the text of this chunk.
     *
     * @return the text supplied at construction time
     */
    public String getContents() {
        return this.contents;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * Defines the condition that should be met for a chunk to be considered full.
 */
@FunctionalInterface
public interface ChunkEndCondition {

    /**
     * Accepts a string and returns the index of the character that should be considered as the
     * end of the FIRST chunk within the string. This method will be subsequently called until all
     * pages are found.
     * <p>
     * Return -1 if the value does not contain enough characters to be considered as a full chunk.
     *
     * @param value the value to be checked
     * @return the index of the character that should be considered as the end of the first chunk
     *         in the string, or -1 if the value does not contain a full chunk
     */
    int getEndOfNextChunk(String value);

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * A post processor that processes a chunk after it has been split.
 */
@FunctionalInterface
public interface ChunkPostProcessor {

    /**
     * Processes a single chunk.
     *
     * @param chunk the chunk to process
     * @return the chunk resulting from the processing
     */
    Chunk process(Chunk chunk);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

import reactor.core.publisher.Flux;

/**
 * A document to be read and split into chunks.
 */
public interface Document {
/**
 * Returns the content of this document as a {@link Flux} of strings.
 * <p>
 * NOTE(review): how the content is divided across the emitted strings (one string for the
 * whole text versus several pieces) is not specified here; confirm against the consumer.
 *
 * @return a Flux emitting the document's content
 */
Flux<String> getContent();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * Defines how much overlap is allowed between two pages.
 */
@FunctionalInterface
public interface OverlapCondition {

    /**
     * Returns the index of the first character that should be considered as the beginning of the
     * overlap.
     *
     * @param chunk the chunk to be checked
     * @return the index of the first character that should be considered as the beginning of the
     *         overlap
     */
    int getOverlapIndex(String chunk);

}
Loading

0 comments on commit b9d935a

Please sign in to comment.