Skip to content

Latest commit

 

History

History
219 lines (173 loc) · 7.19 KB

File metadata and controls

219 lines (173 loc) · 7.19 KB

PDF Implementation for macai v2

Note: There were some build issues with the current implementation. Please follow these instructions for a successful implementation.

Proposed Implementation

Based on our work, here's the approach to implement PDF support in macai:

1. Add PDF Support to ImageAttachment.swift

// Add to imports
import PDFKit

// Add to ImageAttachment class
/// `true` when this attachment wraps a PDF rather than a bitmap image.
/// Set in `init(url:context:)` when `originalFileType == .pdf`.
@Published var isPDF: Bool = false
/// Raw bytes of the PDF file; `nil` until `loadPDF()` has read the file.
/// `toBase64(convertPDFToImage:)` reads this to encode or render the document.
@Published var pdfData: Data?

// In init(url:context:) method 
// Pick the loader that matches the file type: PDFs go through `loadPDF()`,
// everything else keeps using the existing image loader.
if self.originalFileType == .pdf {
    // Flag the attachment as a PDF before loading starts.
    self.isPDF = true
    self.loadPDF()
} else {
    self.loadImage()
}

// Add PDF loading method
/// Loads the PDF at `url` on a background queue, keeps its raw bytes in
/// `pdfData`, and renders the first page into a thumbnail.
///
/// The rendered page is handed to `createThumbnail(from:)` and the raw bytes to
/// `saveToEntity(pdfData:)` (both defined elsewhere in the class). Any failure
/// is reported through `error`. `isLoading` is set to `true` on entry and is
/// reset to `false` on the main queue when the work finishes or fails.
private func loadPDF() {
    isLoading = true

    DispatchQueue.global(qos: .userInitiated).async { [weak self] in
        guard let self = self else { return }
        guard let url = self.url else {
            // Nothing to load: clear the flag so observers are not left in a
            // permanent "loading" state.
            DispatchQueue.main.async {
                self.isLoading = false
            }
            return
        }

        // Single error value shared by every thumbnail failure path below.
        let thumbnailFailure = NSError(
            domain: "ImageAttachment",
            code: 3,
            userInfo: [NSLocalizedDescriptionKey: "Failed to create PDF thumbnail"]
        )

        do {
            // Load PDF data
            let pdfData = try Data(contentsOf: url)

            // `pdfData` is @Published, so publish it from the main queue just
            // like `isLoading` and `error`. The main queue is serial, so this
            // assignment is delivered before the completion blocks below.
            DispatchQueue.main.async {
                self.pdfData = pdfData
            }

            // Build the document from the bytes already in memory instead of
            // reading the file a second time.
            guard let pdfDocument = PDFDocument(data: pdfData),
                  pdfDocument.pageCount > 0,
                  let firstPage = pdfDocument.page(at: 0)
            else {
                throw thumbnailFailure
            }

            // Create thumbnail from first page
            let pdfRect = firstPage.bounds(for: .mediaBox)
            // A zero-area page cannot be rendered; treat it as a thumbnail failure.
            guard pdfRect.width > 0, pdfRect.height > 0 else {
                throw thumbnailFailure
            }

            let scale: CGFloat = 2.0
            let imageSize = NSSize(width: pdfRect.width * scale, height: pdfRect.height * scale)

            let thumbnail = NSImage(size: imageSize)
            thumbnail.lockFocus()

            // Do not force-unwrap: if no graphics context was installed, fail
            // gracefully instead of crashing.
            guard let context = NSGraphicsContext.current?.cgContext else {
                thumbnail.unlockFocus()
                throw thumbnailFailure
            }

            // White background so transparent pages do not render as black.
            NSColor.white.set()
            NSRect(origin: .zero, size: imageSize).fill()

            context.scaleBy(x: scale, y: scale)
            firstPage.draw(with: .mediaBox, to: context)

            thumbnail.unlockFocus()

            self.createThumbnail(from: thumbnail)
            self.saveToEntity(pdfData: pdfData)

            DispatchQueue.main.async {
                self.isLoading = false
            }
        } catch {
            DispatchQueue.main.async {
                self.error = error
                self.isLoading = false
            }
        }
    }
}

// Update toBase64 method
/// Returns the attachment's payload as a base64 string.
///
/// - Parameter convertPDFToImage: Only consulted for PDF attachments. When
///   `true`, the first page is rendered and returned as base64 PNG (for
///   services that accept images only). When `false`, the raw PDF bytes are
///   returned as base64.
/// - Returns: The base64 payload. Returns `nil` when a PDF page cannot be
///   rendered or encoded, or when a non-PDF attachment has no `image` or the
///   image cannot be encoded as JPEG.
func toBase64(convertPDFToImage: Bool = false) -> String? {
    // For PDFs, conditionally convert to image
    if isPDF, let pdfData = self.pdfData {
        // Return PDF data directly for services that support PDFs
        guard convertPDFToImage else {
            return pdfData.base64EncodedString()
        }

        // PDF -> Image conversion for OpenAI (first page only)
        guard let pdfDocument = PDFDocument(data: pdfData),
              pdfDocument.pageCount > 0,
              let firstPage = pdfDocument.page(at: 0)
        else {
            return nil
        }

        let pdfRect = firstPage.bounds(for: .mediaBox)
        // A zero-area page cannot be rendered.
        guard pdfRect.width > 0, pdfRect.height > 0 else { return nil }

        let scale: CGFloat = 2.0
        let imageSize = NSSize(width: pdfRect.width * scale, height: pdfRect.height * scale)

        let pdfImage = NSImage(size: imageSize)
        pdfImage.lockFocus()

        // Do not force-unwrap: bail out with nil if no graphics context is
        // available instead of crashing.
        guard let context = NSGraphicsContext.current?.cgContext else {
            pdfImage.unlockFocus()
            return nil
        }

        // White background so transparent pages do not render as black.
        NSColor.white.set()
        NSRect(origin: .zero, size: imageSize).fill()

        context.scaleBy(x: scale, y: scale)
        firstPage.draw(with: .mediaBox, to: context)

        pdfImage.unlockFocus()

        let resizedPage = resizeImageIfNeeded(pdfImage)

        guard let tiffData = resizedPage.tiffRepresentation,
              let bitmapImage = NSBitmapImageRep(data: tiffData),
              let pngData = bitmapImage.representation(using: .png, properties: [:])
        else {
            return nil
        }

        return pngData.base64EncodedString()
    }

    // For images, convert to JPEG and return as base64
    guard let image = self.image else { return nil }

    let resizedImage = resizeImageIfNeeded(image)

    guard let tiffData = resizedImage.tiffRepresentation,
          let bitmapImage = NSBitmapImageRep(data: tiffData),
          let jpegData = bitmapImage.representation(using: .jpeg, properties: [.compressionFactor: 0.8])
    else {
        return nil
    }

    return jpegData.base64EncodedString()
}

// Add function to check file type
/// Returns the file extension for this attachment: `"pdf"` for PDF
/// attachments, otherwise whatever `getFormatString(from:)` yields for
/// `originalFileType`.
func getFileExtension() -> String {
    guard !isPDF else { return "pdf" }
    return getFormatString(from: originalFileType)
}

2. Modify ChatGPTHandler.swift

Add this method to get the ImageAttachment for a PDF:

/// Loads the full `ImageAttachment` for the given identifier from Core Data so
/// that a stored PDF can be converted.
///
/// - Parameter uuid: Value matched against the `id` attribute of `ImageEntity`.
/// - Returns: An attachment built from the first matching entity, or `nil` when
///   nothing matches or the fetch throws (the error is printed).
private func loadImageAttachmentFromCoreData(uuid: UUID) -> ImageAttachment? {
    let request: NSFetchRequest<ImageEntity> = ImageEntity.fetchRequest()
    request.fetchLimit = 1
    request.predicate = NSPredicate(format: "id == %@", uuid as CVarArg)

    let context = PersistenceController.shared.container.viewContext

    do {
        guard let entity = try context.fetch(request).first else {
            return nil
        }
        return ImageAttachment(imageEntity: entity)
    } catch {
        print("Error fetching ImageAttachment from CoreData: \(error)")
        return nil
    }
}

Update the prepareRequest method to convert PDFs to images for OpenAI:

// When processing message content
if fileType == "pdf" {
    // OpenAI does not take PDFs here, so render the first page and send it as a PNG data URL.
    if let attachment = self.loadImageAttachmentFromCoreData(uuid: uuid),
       let pdfAsImageBase64 = attachment.toBase64(convertPDFToImage: true) {

        // Capture this BEFORE appending the image: checking `isEmpty` after the
        // append is always false, which made the note below unreachable.
        let imageIsOnlyContent = contentArray.isEmpty

        contentArray.append([
            "type": "image_url",
            "image_url": ["url": "data:image/png;base64,\(pdfAsImageBase64)"],
        ])

        // Add a note about the PDF conversion when the message carries no
        // other content to give the model context.
        if imageIsOnlyContent {
            contentArray.append([
                "type": "text",
                "text": "Note: PDF document has been converted to an image."
            ])
        }
    }
}

3. Update UI Elements

  • Allow PDFs in file selection dialogs
  • Add explanatory messages about PDF conversion when using OpenAI
  • Update tooltips to reflect PDF support

Implementation Notes

  1. This approach allows PDFs to work with both Claude (which supports PDFs natively) and OpenAI (through conversion to images)
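  2. When encoding the rendered page, the shorthand .png only compiles where the compiler can infer NSBitmapImageRep.FileType (as in representation(using:properties:)); anywhere else, spell out NSBitmapImageRep.FileType.png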
  3. Convert only the first page of PDFs to images for OpenAI
  4. Add appropriate error handling throughout the implementation
  5. Update the UI to give users clear feedback about PDF handling

Testing Approach

Test with both OpenAI/ChatGPT and Claude:

  1. Single page PDF with OpenAI (should show as image)
  2. Multi-page PDF with OpenAI (should show first page as image)
  3. PDF with Claude (should send as PDF file)
  4. Test both file picker and drag-and-drop