Skip to content
This repository was archived by the owner on Jul 7, 2020. It is now read-only.

Adding GetPlainText methods, fixing charmap #17

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Prev Previous commit
Next Next commit
Add function to get plain text from Page
ledongthuc committed Mar 12, 2017
commit b95967f4ea5d295d627d5767e70b1b4ddbd05f7d
56 changes: 56 additions & 0 deletions page.go
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@
package pdf

import (
"bytes"
"fmt"
"strings"
)
@@ -401,6 +402,61 @@ type gstate struct {
CTM matrix
}

// GetPlainText returns all of the page's text with formatting discarded.
//
// The seperator string is written after every piece of shown text (the
// string operand of Tj, ' and ", and each string element of a TJ array)
// and for every T* (move to start of next line) operator. "\n" is a
// reasonable value to try.
//
// GetPlainText panics if a text-showing operator is given the wrong number
// of operands.
func (p Page) GetPlainText(seperator string) string {
	strm := p.V.Key("Contents")

	var textBuilder bytes.Buffer
	showText := func(s string) {
		// bytes.Buffer.WriteString is documented to always return a nil
		// error, so there is nothing to check here.
		textBuilder.WriteString(s)
	}

	Interpret(strm, func(stk *Stack, op string) {
		// Drain the whole operand stack, restoring the order in which the
		// operands were pushed.
		n := stk.Len()
		args := make([]Value, n)
		for i := n - 1; i >= 0; i-- {
			args[i] = stk.Pop()
		}

		switch op {
		default:
			// Not a text operator: nothing to extract.
			return
		case "T*": // move to start of next line
			showText(seperator)
		case "\"": // set spacing, move to next line, and show text
			if len(args) != 3 {
				panic("bad \" operator")
			}
			// The first two operands are spacing values that do not affect
			// the extracted text. Drop them so that only the string operand
			// remains; otherwise the single-operand checks in the cases we
			// fall through to would always panic.
			args = args[2:]
			fallthrough
		case "'": // move to next line and show text
			if len(args) != 1 {
				panic("bad ' operator")
			}
			fallthrough
		case "Tj": // show text
			if len(args) != 1 {
				panic("bad Tj operator")
			}
			showText(args[0].RawString())
			showText(seperator)
		case "TJ": // show text, allowing individual glyph positioning
			if len(args) == 0 {
				panic("bad TJ operator")
			}
			v := args[0]
			for i := 0; i < v.Len(); i++ {
				// Only string elements carry text; the other elements of
				// the array are skipped.
				x := v.Index(i)
				if x.Kind() == String {
					showText(x.RawString())
					showText(seperator)
				}
			}
		}
	})
	return textBuilder.String()
}

// Content returns the page's content.
func (p Page) Content() Content {
strm := p.V.Key("Contents")