LibrePDF · ConvoluteHumanBot · Oct 23, 2025 · Oct 23, 2025
diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2008 by Kevin Day.
+ *
+ * The contents of this file are subject to the Mozilla Public License Version 1.1
+ * (the "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the License.
+ *
+ * The Original Code is 'iText, a free JAVA-PDF library'.
+ *
+ * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
+ * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie.
+ * All Rights Reserved.
+ * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
+ * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved.
+ *
+ * Contributor(s): all the names of the contributors are added in the source code
+ * where applicable.
+ *
+ * Alternatively, the contents of this file may be used under the terms of the
+ * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
+ * provisions of LGPL are applicable instead of those above.  If you wish to
+ * allow use of your version of this file only under the terms of the LGPL
+ * License and not to allow others to use your version of this file under
+ * the MPL, indicate your decision by deleting the provisions above and
+ * replace them with the notice and other provisions required by the LGPL.
+ * If you do not delete the provisions above, a recipient may use your version
+ * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the MPL as stated above or under the terms of the GNU
+ * Library General Public License as published by the Free Software Foundation;
+ * either version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
+ * details.
+ *
+ * If you didn't download this code from the following link, you should check if
+ * you aren't using an obsolete version:
+ * https://github.com/LibrePDF/OpenPDF
+ */
+package org.openpdf.text.pdf.parser;
+
+/** Value holder pairing a matched strip of text with its page number and bounding box. */
+public class MatchedPattern {
+    private final String text;
+    private final int page;
+    private final float[] coordinates = new float[4];
+
+    /**
+     * Constructor to pair a strip of text with its bounding box coordinates inside a page.
+     * The coordinates system has the origin (0, 0) in the lower left point of the page
+     * and uses PDF points as unit measure.
+     *
+     * @param text          string
+     * @param page          int
+     * @param llx           float lower left x coordinate
+     * @param lly           float lower left y coordinate
+     * @param urx           float upper right x coordinate
+     * @param ury           float upper right y coordinate
+     */
+    MatchedPattern(String text, int page, float llx, float lly, float urx, float ury) {
+        this.text = text;
+        this.page = page;
+        coordinates[0] = llx;
+        coordinates[1] = lly;
+        coordinates[2] = urx;
+        coordinates[3] = ury;
+    }
+
+    /** @return the matched text */
+    public String getText() {
+        return text;
+    }
+
+    /** @return the page number passed to the constructor */
+    public int getPage() {
+        return page;
+    }
+
+    /** @return a fresh copy of [llx, lly, urx, ury]; modifying it does not affect this object */
+    public float[] getCoordinates() {
+        // Defensive copy: handing out the backing array would let callers alter this instance.
+        return coordinates.clone();
+    }
+
+    /** @return the bounding box formatted as "[llx, lly, urx, ury]" (text and page are omitted) */
+    @Override
+    public String toString() {
+        return java.util.Arrays.toString(coordinates);
+    }
+}
diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/ParsedText.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/ParsedText.java
@@ -67,6 +67,35 @@
      */
     private PdfString pdfText = null;
 
+    /**
+     * Factory: re-wraps Identity-H font codes, decodes them with the current font and sums the
+     * per-character advances; the sum is handed to the constructor as the text width.
+     */
+    static protected ParsedText create(PdfString text, GraphicsState graphicsState, Matrix textMatrix) {
+        final byte[] raw;
+        if (!BaseFont.IDENTITY_H.equals(graphicsState.getFont().getEncoding())) {
+            // No charset argument: the platform default charset is used for this conversion.
+            raw = text.toString().getBytes();
+        } else {
+            if (!graphicsState.getFont().hasUnicodeCMAP()) {
+                text = new PdfString(new String(text.getBytes(), StandardCharsets.UTF_16));
+            } else {
+                String cmap = graphicsState.getFont().hasTwoByteUnicodeCMAP() ? "IDENTITY_H2" : "IDENTITY_H1";
+                text = new PdfString(text.toString(), cmap);
+            }
+            raw = text.toString().getBytes(StandardCharsets.UTF_16);
+        }
+        String unicode = graphicsState.getFont().decode(raw, 0, raw.length);
+        float widthSum = 0;
+        for (int i = 0; i < unicode.length(); i++) {
+            char ch = unicode.charAt(i);
+            float glyphWidth = graphicsState.getFont().getWidth(ch) / 1000.0f;
+            float spaceExtra = Character.isSpaceChar(ch) ? graphicsState.getWordSpacing() : 0f;
+            widthSum += (glyphWidth * graphicsState.getFontSize() + graphicsState.getCharacterSpacing() + spaceExtra)
+                    * graphicsState.getHorizontalScaling();
+        }
+        return new ParsedText(text, widthSum, graphicsState, textMatrix);
+    }
 
     /**
      * This constructor should only be called when the origin for text display is at (0,0) and the graphical state
@@ -76,42 +105,34 @@
      * @param graphicsState graphical state
      * @param textMatrix    transform from text space to graphics (drawing space)
      */
-    ParsedText(PdfString text, GraphicsState graphicsState, Matrix textMatrix) {
-        this(text, new GraphicsState(graphicsState), textMatrix.multiply(graphicsState.getCtm()),
+    private ParsedText(PdfString text, float unscaledWidth, GraphicsState graphicsState,
+            Matrix textMatrix) {
+        this(text, unscaledWidth, new GraphicsState(graphicsState), // new GraphicsState built from the argument
+                textMatrix.multiply(graphicsState.getCtm()), // combined with the CTM before delegating
                 getUnscaledFontSpaceWidth(graphicsState));
     }
 
     /**
      * Internal constructor for a parsed text item. The constructors that call it gather some information from the
      * graphical state first.
      *
-     * @param text          This is a PdfString containing code points for the current font, not actually characters. If
-     *                      the font has multiByte glyphs, (Identity-H encoding) we reparse the string so that the code
-     *                      points don't get split into multiple characters.
-     * @param graphicsState graphical state
-     * @param textMatrix    transform from text space to graphics (drawing space)
-     * @param unscaledWidth width of the space character in the font.
+     * @param text               PdfString of code points for the current font, not actual characters; multi-byte
+     *                           (Identity-H) strings are expected already reparsed so code points are not split.
+     * @param unscaledWidth      unscaled (text space) width of the text
+     * @param graphicsState      graphical state
+     * @param textMatrix         transform from text space to graphics (drawing space)
+     * @param unscaledSpaceWidth width of the space character in the font.
      */
-    private ParsedText(PdfString text, GraphicsState graphicsState, Matrix textMatrix, float unscaledWidth) {
+    private ParsedText(PdfString text, float unscaledWidth, GraphicsState graphicsState,
+            Matrix textMatrix,
+            float unscaledSpaceWidth) {
         super(null, pointToUserSpace(0, 0, textMatrix),
-                pointToUserSpace(getStringWidth(text.toString(), graphicsState), 0f, textMatrix),
+                pointToUserSpace(unscaledWidth, 0f, textMatrix), // width is supplied by the caller
                 pointToUserSpace(1.0f, 0f, textMatrix),
                 convertHeightToUser(graphicsState.getFontAscentDescriptor(), textMatrix),
                 convertHeightToUser(graphicsState.getFontDescentDescriptor(), textMatrix),
-                convertWidthToUser(unscaledWidth, textMatrix));
-        if (BaseFont.IDENTITY_H.equals(graphicsState.getFont().getEncoding())) {
-            if (graphicsState.getFont().hasUnicodeCMAP()) {
-                if (graphicsState.getFont().hasTwoByteUnicodeCMAP()) {
-                    pdfText = new PdfString(text.toString(), "IDENTITY_H2");
-                } else {
-                    pdfText = new PdfString(text.toString(), "IDENTITY_H1");
-                }
-            } else {
-                pdfText = new PdfString(new String(text.getBytes(), StandardCharsets.UTF_16));
-            }
-        } else {
-            pdfText = text;
-        }
+                convertWidthToUser(unscaledSpaceWidth, textMatrix));
+        pdfText = text; // stored as given; no Identity-H re-wrapping happens in this constructor
         textToUserSpaceTransformMatrix = textMatrix;
         this.graphicsState = graphicsState;
     }
@@ -199,22 +220,6 @@
         return distance(endPos, startPos);
     }
 
-    /**
-     * Decodes a Java String containing glyph ids encoded in the font's encoding, and determine the unicode equivalent
-     *
-     * @param in the String that needs to be decoded
-     * @return the decoded String
-     */
-    // FIXME unreachable block and default encoding
-    protected String decode(String in) {
-        byte[] bytes;
-        if (BaseFont.IDENTITY_H.equals(graphicsState.getFont().getEncoding())) {
-            bytes = in.getBytes(StandardCharsets.UTF_16);
-        }
-        bytes = in.getBytes();
-        return graphicsState.getFont().decode(bytes, 0, bytes.length);
-    }
-
     /**
      * This constructor should only be called when the origin for text display is at (0,0) and the graphical state
      * reflects all transformations of the baseline. This is in text space units.
@@ -258,7 +263,6 @@
         for (int i = 0; i < chars.length; i++) {
             char c = chars[i];
             float w = font.getWidth(c) / 1000.0f;
-
             if (hasSpace[i]) {
                 if (wordAccum.length() > 0) {
                     result.add(createWord(wordAccum, wordStartOffset, totalWidth, getBaseline(),
@@ -339,14 +343,6 @@
                 getSingleSpaceWidth(), wordsAreComplete, currentBreakBefore);
     }
 
-    /**
-     * @param gs graphic state including current transformation to page coordinates from text measurement
-     * @return the unscaled (i.e. in Text space) width of our text
-     */
-    public float getUnscaledTextWidth(GraphicsState gs) {
-        return getStringWidth(getFontCodes(), gs);
-    }
-
     /**
      * {@inheritDoc}
      *