pdf-simple-sign/pdf/pdf.go
Přemysl Eric Janouch 32e9acfa77
Go: enable multiple updates in a sequence
This is not something anyone should do, but let's do things correctly.
2024-02-04 05:17:26 +01:00

1664 lines
44 KiB
Go

//
// Copyright (c) 2018 - 2024, Přemysl Eric Janouch <p@janouch.name>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
// Package pdf signs PDF documents and provides some processing utilities.
package pdf
import (
"bytes"
"compress/zlib"
"encoding/binary"
"encoding/hex"
"errors"
"fmt"
"math"
"regexp"
"sort"
"strconv"
"strings"
"time"
"crypto"
"crypto/ecdsa"
"crypto/rsa"
"crypto/x509"
"go.mozilla.org/pkcs7"
"golang.org/x/crypto/pkcs12"
)
// ObjectKind tells apart the kinds of tokens and objects an Object can hold.
type ObjectKind int
// The values are assigned consecutively by iota, starting with End at zero.
const (
// End is what Lexer.Next returns once the input is exhausted;
// newError also pairs it with every error it creates.
End ObjectKind = iota
// NL is a line break token (CR, LF, or CR LF).
NL
// Comment carries the text following a '%' up to the end of the line.
Comment
// Nil is the "null" keyword.
Nil
// Bool is "true" or "false", stored in Object.Number as 1 or 0.
Bool
Numeric
// Keyword is any other run of regular characters.
Keyword
Name
String
// simple tokens
// BArray and EArray are "[" and "]", BDict and EDict are "<<" and ">>".
BArray
EArray
BDict
EDict
// higher-level objects
Array
Dict
Stream
Indirect
Reference
)
// Object is a PDF token/object thingy. Objects may be composed either from
// one or a sequence of tokens. The PDF Reference doesn't actually speak
// of tokens, though ISO 32000-1:2008 does.
//
// Which fields are meaningful depends on Kind, as noted next to each field.
// Booleans are stored in Number as 0 or 1 (see NewBool), and an Indirect
// object keeps the object it wraps as the only element of Array
// (see NewIndirect).
type Object struct {
Kind ObjectKind
String string // Comment/Keyword/Name/String
Number float64 // Bool, Numeric
Array []Object // Array, Indirect
Dict map[string]Object // Dict, Stream
Stream []byte // Stream
N, Generation uint // Indirect, Reference
}
// IsInteger reports whether the object is a Numeric
// whose value has no fractional part.
func (o *Object) IsInteger() bool {
	if o.Kind != Numeric {
		return false
	}
	_, fraction := math.Modf(o.Number)
	return fraction == 0
}
// IsUint checks if the PDF object is a non-negative integer number
// that fits into a uint.
func (o *Object) IsUint() bool {
	// The largest uint is not exactly representable as a float64 on 64-bit
	// platforms: float64(^uint(0)) rounds up to 2^64, so a "<=" comparison
	// against it would accept a value that uint cannot hold. Compare strictly
	// against 2^bits instead, which is exact for both 32 and 64-bit uints.
	const limit = float64(^uint(0)>>1+1) * 2
	return o.IsInteger() && o.Number >= 0 && o.Number < limit
}
// A slew of constructors that will hopefully get all inlined.

// New returns a new Object of the given kind, with default values.
func New(kind ObjectKind) Object {
	return Object{Kind: kind}
}

// NewComment returns a Comment object holding the given text.
func NewComment(c string) Object {
	return Object{String: c, Kind: Comment}
}

// NewKeyword returns a Keyword object holding the given word.
func NewKeyword(k string) Object {
	return Object{String: k, Kind: Keyword}
}

// NewBool returns a Bool object; true is stored as 1 and false as 0.
func NewBool(b bool) Object {
	result := Object{Kind: Bool}
	if b {
		result.Number = 1
	}
	return result
}

// NewNumeric returns a Numeric object holding the given value.
func NewNumeric(n float64) Object {
	return Object{Number: n, Kind: Numeric}
}

// NewName returns a Name object holding the given (unescaped) name.
func NewName(n string) Object {
	return Object{String: n, Kind: Name}
}

// NewString returns a String object holding the given bytes.
func NewString(s string) Object {
	return Object{String: s, Kind: String}
}
// NewArray returns an Array object wrapping the given elements.
func NewArray(a []Object) Object {
	return Object{Array: a, Kind: Array}
}

// NewDict returns a Dict object wrapping the given map.
// A nil map is replaced with a fresh empty one.
func NewDict(d map[string]Object) Object {
	result := Object{Kind: Dict, Dict: d}
	if result.Dict == nil {
		result.Dict = make(map[string]Object)
	}
	return result
}

// NewStream returns a Stream object with the given dictionary and data.
// A nil map is replaced with a fresh empty one.
func NewStream(d map[string]Object, s []byte) Object {
	result := Object{Kind: Stream, Dict: d, Stream: s}
	if result.Dict == nil {
		result.Dict = make(map[string]Object)
	}
	return result
}

// NewIndirect returns an Indirect object with the given number and
// generation; the wrapped object becomes the sole element of Array.
func NewIndirect(o Object, n, generation uint) Object {
	return Object{
		Kind:       Indirect,
		Array:      []Object{o},
		N:          n,
		Generation: generation,
	}
}

// NewReference returns a Reference to the given object number
// and generation.
func NewReference(n, generation uint) Object {
	return Object{N: n, Generation: generation, Kind: Reference}
}

// newError returns an End object together with an error made from msg,
// so that failures can be returned in a single statement.
func newError(msg string) (Object, error) {
	return Object{Kind: End}, errors.New(msg)
}
// -----------------------------------------------------------------------------
// Character classes used by the Lexer; each is tested with
// strings.IndexByte.
const (
// Digits accepted in octal escapes, decimal numbers, and hex strings/escapes.
octAlphabet = "01234567"
decAlphabet = "0123456789"
hexAlphabet = "0123456789abcdefABCDEF"
// Bytes that separate tokens. NOTE(review): ISO 32000 also lists NUL
// as white-space, which is not included here -- confirm whether intended.
whitespace = "\t\n\f\r "
// Bytes that end a name or keyword and start a token of their own.
delimiters = "()<>[]{}/%"
)
// Lexer is a basic lexical analyser for the Portable Document Format,
// giving limited error information.
//
// The lexer keeps no state besides P: consuming input re-slices P so that
// it always starts at the next unread byte.
type Lexer struct {
P []byte // input buffer
}
// read consumes the next input byte and returns it.
// The boolean result is false once the input is exhausted.
func (lex *Lexer) read() (byte, bool) {
	if len(lex.P) == 0 {
		return 0, false
	}
	first := lex.P[0]
	lex.P = lex.P[1:]
	return first, true
}
// peek returns the next input byte without consuming it.
// The boolean result is false once the input is exhausted.
func (lex *Lexer) peek() (byte, bool) {
	if len(lex.P) == 0 {
		return 0, false
	}
	return lex.P[0], true
}
// eatNewline reports whether ch, which has already been read, starts
// an end-of-line sequence. For a carriage return it also consumes
// a directly following line feed, so that CR LF counts as one line break.
func (lex *Lexer) eatNewline(ch byte) bool {
	switch ch {
	case '\n':
		return true
	case '\r':
		if next, ok := lex.peek(); ok && next == '\n' {
			lex.read()
		}
		return true
	default:
		return false
	}
}
// unescape resolves the character following a backslash within a literal
// string. ch is that character and has already been consumed by the caller.
//
// Letter escapes map to their control characters, one to three octal digits
// are combined into a single byte (consuming any further digits from
// the input), and anything else stands for itself.
func (lex *Lexer) unescape(ch byte) byte {
	switch ch {
	case 'n':
		return '\n'
	case 'r':
		return '\r'
	case 't':
		return '\t'
	case 'b':
		return '\b'
	case 'f':
		return '\f'
	}
	if strings.IndexByte(octAlphabet, ch) < 0 {
		return ch
	}

	// The first digit is already off the input, so only peek at what follows;
	// reading unconditionally here would drop the next character.
	octal := []byte{ch}
	for len(octal) < 3 {
		next, ok := lex.peek()
		if !ok || strings.IndexByte(octAlphabet, next) < 0 {
			break
		}
		octal = append(octal, next)
		lex.read()
	}

	// Three octal digits can exceed a byte; parse wide and truncate so that
	// high-order overflow is ignored rather than saturated. The digits were
	// validated above, so parsing cannot fail.
	u, _ := strconv.ParseUint(string(octal), 8, 16)
	return byte(u)
}
// string reads the remainder of a literal string, whose opening parenthesis
// has already been consumed, and returns it as a String object.
//
// Balanced unescaped parentheses are kept as part of the value, line breaks
// are normalized to a line feed, a backslash directly followed by a line
// break is dropped together with it, and other backslash sequences
// go through unescape.
func (lex *Lexer) string() (Object, error) {
	var value []byte
	depth := 1
	for depth > 0 {
		ch, ok := lex.read()
		if !ok {
			return newError("unexpected end of string")
		}
		switch {
		case lex.eatNewline(ch):
			ch = '\n'
		case ch == '(':
			depth++
		case ch == ')':
			if depth--; depth == 0 {
				// The closing parenthesis is not part of the value.
				continue
			}
		case ch == '\\':
			escaped, ok := lex.read()
			if !ok {
				return newError("unexpected end of string")
			}
			if lex.eatNewline(escaped) {
				continue
			}
			ch = lex.unescape(escaped)
		}
		value = append(value, ch)
	}
	return NewString(string(value)), nil
}
// stringHex reads the remainder of a hexadecimal string, whose opening '<'
// has already been consumed, and returns the decoded bytes as a String
// object.
//
// Whitespace between digits is skipped, as PDF producers are free to wrap
// or space out hexadecimal data. A missing final digit is taken to be zero.
// Any other non-hexadecimal character, or running out of input before
// the closing '>', is an error.
func (lex *Lexer) stringHex() (Object, error) {
	var value, buf []byte
loop:
	for {
		ch, ok := lex.read()
		switch {
		case !ok:
			return newError("unexpected end of hex string")
		case ch == '>':
			break loop
		case strings.IndexByte(whitespace, ch) >= 0:
			continue
		case strings.IndexByte(hexAlphabet, ch) < 0:
			return newError("invalid hex string")
		}
		if buf = append(buf, ch); len(buf) == 2 {
			// Cannot fail, both digits have been validated.
			u, _ := strconv.ParseUint(string(buf), 16, 8)
			value = append(value, byte(u))
			buf = nil
		}
	}
	if len(buf) > 0 {
		// An odd number of digits: the last one is the high nibble.
		u, _ := strconv.ParseUint(string(buf)+"0", 16, 8)
		value = append(value, byte(u))
	}
	return NewString(string(value)), nil
}
// name reads a name whose leading solidus has already been consumed,
// and returns it as a Name object with any #xx escapes decoded.
//
// The name extends up to the next whitespace or delimiter character, which
// is left in the input. An empty name, or a '#' that is not followed by
// two hexadecimal digits, is an error.
func (lex *Lexer) name() (Object, error) {
	var value []byte
	for {
		ch, ok := lex.peek()
		if !ok || strings.IndexByte(whitespace+delimiters, ch) >= 0 {
			break
		}
		lex.read()
		if ch == '#' {
			var hexa []byte
			for len(hexa) < 2 {
				digit, ok := lex.peek()
				if !ok || strings.IndexByte(hexAlphabet, digit) < 0 {
					break
				}
				hexa = append(hexa, digit)
				lex.read()
			}
			if len(hexa) != 2 {
				return newError("invalid name hexa escape")
			}
			// Decode the two escape digits, not the name collected so far.
			// Cannot fail, both digits have been validated.
			u, _ := strconv.ParseUint(string(hexa), 16, 8)
			ch = byte(u)
		}
		value = append(value, ch)
	}
	if len(value) == 0 {
		return newError("unexpected end of name")
	}
	return NewName(string(value)), nil
}
// comment reads the text of a comment, whose leading percent sign has
// already been consumed, and returns it as a Comment object.
// The terminating line break, if any, is left in the input.
func (lex *Lexer) comment() (Object, error) {
	end := 0
	for end < len(lex.P) && lex.P[end] != '\r' && lex.P[end] != '\n' {
		end++
	}
	text := string(lex.P[:end])
	lex.P = lex.P[end:]
	return NewComment(text), nil
}
// number reads a numeric token and returns it as a Numeric object:
// an optional minus sign, followed by decimal digits with at most one
// decimal point among them. At least one digit is required.
//
// XXX: Maybe invalid numbers should rather be interpreted as keywords.
func (lex *Lexer) number() (Object, error) {
	var literal []byte
	if sign, _ := lex.peek(); sign == '-' {
		literal = append(literal, sign)
		lex.read()
	}

	seenPoint, seenDigit := false, false
scan:
	for {
		ch, ok := lex.peek()
		switch {
		case !ok:
			break scan
		case strings.IndexByte(decAlphabet, ch) >= 0:
			seenDigit = true
		case ch == '.' && !seenPoint:
			seenPoint = true
		default:
			break scan
		}
		literal = append(literal, ch)
		lex.read()
	}
	if !seenDigit {
		return newError("invalid number")
	}

	// The literal is well-formed by construction; the error is ignored.
	f, _ := strconv.ParseFloat(string(literal), 64)
	return NewNumeric(f), nil
}
// Next reads and returns the next token from the input.
//
// At the end of input it returns an End object with a nil error.
// Line breaks are returned as NL tokens, other whitespace is skipped.
// Lexical errors are returned together with an End object.
func (lex *Lexer) Next() (Object, error) {
	// Whitespace is skipped by looping rather than by calling Next again,
	// so that a long run of padding doesn't cost a stack frame per byte.
	for {
		ch, ok := lex.peek()
		if !ok {
			return New(End), nil
		}
		if strings.IndexByte("-0123456789.", ch) >= 0 {
			return lex.number()
		}

		// Collect a run of regular characters, if there is one.
		// NOTE(review): '{' and '}' are listed in delimiters, so they never
		// become part of a word and are reported as unexpected input below.
		var word []byte
		for {
			ch, ok := lex.peek()
			if !ok || strings.IndexByte(whitespace+delimiters, ch) >= 0 {
				break
			}
			word = append(word, ch)
			lex.read()
		}
		switch v := string(word); v {
		case "":
			// We're at whitespace or a delimiter, handled below.
		case "null":
			return New(Nil), nil
		case "true":
			return NewBool(true), nil
		case "false":
			return NewBool(false), nil
		default:
			return NewKeyword(v), nil
		}

		switch ch, _ := lex.read(); ch {
		case '/':
			return lex.name()
		case '%':
			return lex.comment()
		case '(':
			return lex.string()
		case '[':
			return New(BArray), nil
		case ']':
			return New(EArray), nil
		case '<':
			if ch, _ := lex.peek(); ch == '<' {
				lex.read()
				return New(BDict), nil
			}
			return lex.stringHex()
		case '>':
			if ch, _ := lex.peek(); ch == '>' {
				lex.read()
				return New(EDict), nil
			}
			return newError("unexpected '>'")
		default:
			if lex.eatNewline(ch) {
				return New(NL), nil
			}
			if strings.IndexByte(whitespace, ch) < 0 {
				return newError("unexpected input")
			}
			// Plain whitespace: go around for the next token.
		}
	}
}
// -----------------------------------------------------------------------------
// Serialize returns the PDF source representation of the object.
//
// Names (including dictionary keys) have '#', whitespace and delimiter
// characters written as #xx escapes. Literal strings have backslashes and
// parentheses escaped, and carriage returns written as \r, since the lexer
// would otherwise read them back as line feeds. Dictionary entries are
// written in sorted key order, making the output deterministic.
//
// Serializing a Stream stores its actual data length under the Length key
// of the object's own dictionary. Serializing an Indirect object requires
// the wrapped object to be present in Array.
//
// End and Comment objects cannot be serialized and cause a panic.
//
// FIXME: Lines /should not/ be longer than 255 characters,
// some wrapping is in order.
func (o *Object) Serialize() string {
	switch o.Kind {
	case NL:
		return "\n"
	case Nil:
		return "null"
	case Bool:
		if o.Number != 0 {
			return "true"
		}
		return "false"
	case Numeric:
		return strconv.FormatFloat(o.Number, 'f', -1, 64)
	case Keyword:
		return o.String
	case Name:
		escaped := []byte{'/'}
		for _, ch := range []byte(o.String) {
			// Special characters are replaced by their escape,
			// not merely followed by a pair of hexadecimal digits.
			if ch == '#' || strings.IndexByte(delimiters+whitespace, ch) >= 0 {
				escaped = append(escaped, fmt.Sprintf("#%02x", ch)...)
			} else {
				escaped = append(escaped, ch)
			}
		}
		return string(escaped)
	case String:
		escaped := []byte{'('}
		for _, ch := range []byte(o.String) {
			switch ch {
			case '\\', '(', ')':
				escaped = append(escaped, '\\', ch)
			case '\r':
				escaped = append(escaped, '\\', 'r')
			default:
				escaped = append(escaped, ch)
			}
		}
		return string(append(escaped, ')'))
	case BArray:
		return "["
	case EArray:
		return "]"
	case BDict:
		return "<<"
	case EDict:
		return ">>"
	case Array:
		var v []string
		for _, i := range o.Array {
			v = append(v, i.Serialize())
		}
		return "[ " + strings.Join(v, " ") + " ]"
	case Dict:
		keys := make([]string, 0, len(o.Dict))
		for k := range o.Dict {
			keys = append(keys, k)
		}
		sort.Strings(keys)

		var b strings.Builder
		b.WriteString("<<")
		for _, k := range keys {
			// Keys are names, and need the same escaping.
			name, v := NewName(k), o.Dict[k]
			b.WriteString(" ")
			b.WriteString(name.Serialize())
			b.WriteString(" ")
			b.WriteString(v.Serialize())
		}
		b.WriteString(" >>")
		return b.String()
	case Stream:
		// NewDict wraps the very same map, so the Length lands in o.Dict.
		d := NewDict(o.Dict)
		d.Dict["Length"] = NewNumeric(float64(len(o.Stream)))
		return d.Serialize() + "\nstream\n" + string(o.Stream) + "\nendstream"
	case Indirect:
		return fmt.Sprintf("%d %d obj\n%s\nendobj", o.N, o.Generation,
			o.Array[0].Serialize())
	case Reference:
		return fmt.Sprintf("%d %d R", o.N, o.Generation)
	default:
		panic("unsupported token for serialization")
	}
}
// -----------------------------------------------------------------------------
// ref is a single entry of the cross-reference table, indexed by
// object number in Updater.xref.
//
// For objects stored in an object stream, compressed is set and offset
// holds the index read from the cross-reference stream entry. The zero
// value stands for a free entry.
type ref struct {
offset int64 // file offset, or N of the next free entry, or index
generation uint // object generation
compressed *uint // PDF 1.5: N of the containing compressed object
nonfree bool // whether this N is taken (for a good zero value)
}
// Updater is a utility class to help read and possibly incrementally update
// PDF files.
//
// Create one with NewUpdater, read objects with Get, append new revisions
// of objects with Allocate and Update, and finish a round of updates
// with FlushUpdates. The resulting file is then found in Document.
type Updater struct {
// cross-reference table
xref []ref
// current cross-reference table size, correlated to len(xref)
// (initialized from the trailer's Size, and incremented by Allocate)
xrefSize uint
// list of updated objects
// (keyed by object number, and reset by FlushUpdates)
// TODO(p): A map to bool makes this simpler to work with.
// The same with another map to struct{} somewhere in this code.
updated map[uint]struct{}
// PDF document data
Document []byte
// the new trailer dictionary to be written, initialized with the old one
Trailer map[string]Object
}
// ListIndirect returns a Reference object for each entry of
// the cross-reference table that is in use, ordered by object number.
// The result is never nil.
func (u *Updater) ListIndirect() []Object {
	refs := []Object{}
	for n, entry := range u.xref {
		if !entry.nonfree {
			continue
		}
		refs = append(refs, NewReference(uint(n), entry.generation))
	}
	return refs
}
// parseStream finishes parsing a stream object once the "stream" keyword
// has been read. The stream dictionary is taken off the top of the stack,
// its Length (which may be a reference) determines how many bytes of data
// follow the keyword's line break, and the data must be followed
// by the "endstream" keyword.
//
// The returned Stream object's data aliases the lexer's input buffer.
func (u *Updater) parseStream(lex *Lexer, stack *[]Object) (Object, error) {
	if len(*stack) == 0 {
		return newError("missing stream dictionary")
	}
	top := len(*stack) - 1
	stream := (*stack)[top]
	if stream.Kind != Dict {
		return newError("stream not preceded by a dictionary")
	}
	*stack = (*stack)[:top]

	length, found := stream.Dict["Length"]
	if !found {
		return newError("missing stream Length")
	}
	length, err := u.Dereference(length)
	if err != nil {
		return length, err
	}
	if !length.IsUint() || length.Number > math.MaxInt {
		return newError("stream Length not an unsigned integer")
	}

	// Expect exactly one newline.
	nl, err := lex.Next()
	if err != nil {
		return nl, err
	}
	if nl.Kind != NL {
		return newError("stream does not start with a newline")
	}

	size := int(length.Number)
	if size > len(lex.P) {
		return newError("stream is longer than the document")
	}
	stream.Kind = Stream
	stream.Stream = lex.P[:size]
	lex.P = lex.P[size:]

	// Skip any number of trailing newlines or comments.
	end, err := u.parse(lex, stack)
	if err != nil {
		return end, err
	}
	if end.Kind != Keyword || end.String != "endstream" {
		return newError("improperly terminated stream")
	}
	return stream, nil
}
// parseIndirect finishes parsing an indirect object once the "obj" keyword
// has been read. The object number and generation are taken off the top
// of the stack, and exactly one object must follow before "endobj".
//
// Errors from parsing the contained object are passed on as they are,
// rather than being replaced with a generic message.
func (u *Updater) parseIndirect(lex *Lexer, stack *[]Object) (Object, error) {
	lenStack := len(*stack)
	if lenStack < 2 {
		return newError("missing object ID pair")
	}

	n := (*stack)[lenStack-2]
	g := (*stack)[lenStack-1]
	*stack = (*stack)[:lenStack-2]

	if !g.IsUint() || !n.IsUint() {
		return newError("invalid object ID pair")
	}

	var inner []Object
	for {
		object, err := u.parse(lex, &inner)
		if err != nil {
			return New(End), err
		}
		if object.Kind == End {
			return newError("object doesn't end")
		}
		if object.Kind == Keyword && object.String == "endobj" {
			break
		}
		inner = append(inner, object)
	}
	if len(inner) != 1 {
		return newError("indirect objects must contain exactly one object")
	}
	return NewIndirect(inner[0], uint(n.Number), uint(g.Number)), nil
}
// parseR finishes parsing an indirect reference once the "R" keyword has
// been read: the object number and generation are taken off the top
// of the stack and combined into a Reference object.
func (u *Updater) parseR(stack *[]Object) (Object, error) {
	if len(*stack) < 2 {
		return newError("missing reference ID pair")
	}

	base := len(*stack) - 2
	n, g := (*stack)[base], (*stack)[base+1]
	*stack = (*stack)[:base]

	if valid := n.IsUint() && g.IsUint(); !valid {
		return newError("invalid reference ID pair")
	}
	return NewReference(uint(n.Number), uint(g.Number)), nil
}
// parse reads an object at the lexer's position. Not a strict parser.
//
// Line breaks and comments are skipped. Arrays and dictionaries are read
// in their entirety. The "stream", "obj" and "R" keywords combine with
// objects that the caller has collected on the stack, which is how streams,
// indirect objects and references get assembled. Any other token
// is returned as it is, including End at the end of input.
//
// Errors from nested objects are passed on as they are.
//
// TODO(p): We should fix all uses of this not to eat the error.
func (u *Updater) parse(lex *Lexer, stack *[]Object) (Object, error) {
	switch token, err := lex.Next(); token.Kind {
	case NL, Comment:
		// These are not important to parsing,
		// not even for this procedure's needs.
		return u.parse(lex, stack)
	case BArray:
		// The array doubles as the stack for references among its items.
		var array []Object
		for {
			object, err := u.parse(lex, &array)
			if err != nil {
				return New(End), err
			}
			if object.Kind == End {
				return newError("array doesn't end")
			}
			if object.Kind == EArray {
				break
			}
			array = append(array, object)
		}
		return NewArray(array), nil
	case BDict:
		// Collect keys and values alternately, then pair them up.
		var array []Object
		for {
			object, err := u.parse(lex, &array)
			if err != nil {
				return New(End), err
			}
			if object.Kind == End {
				return newError("dictionary doesn't end")
			}
			if object.Kind == EDict {
				break
			}
			array = append(array, object)
		}
		if len(array)%2 != 0 {
			return newError("unbalanced dictionary")
		}
		dict := make(map[string]Object)
		for i := 0; i < len(array); i += 2 {
			if array[i].Kind != Name {
				return newError("invalid dictionary key type")
			}
			dict[array[i].String] = array[i+1]
		}
		return NewDict(dict), nil
	case Keyword:
		switch token.String {
		case "stream":
			// Appears in the document body,
			// typically needs to access the cross-reference table.
			return u.parseStream(lex, stack)
		case "obj":
			return u.parseIndirect(lex, stack)
		case "R":
			return u.parseR(stack)
		}
		fallthrough
	default:
		return token, err
	}
}
// loadXrefEntry stores r as the cross-reference entry for object number n,
// growing the table as needed, unless n is already present
// in loadedEntries. Since the most recent cross-reference section is loaded
// first, this keeps older sections from overriding newer entries.
func (u *Updater) loadXrefEntry(
	n uint, r ref, loadedEntries map[uint]struct{}) {
	if _, seen := loadedEntries[n]; seen {
		return
	}
	if have := uint(len(u.xref)); have <= n {
		u.xref = append(u.xref, make([]ref, n-have+1)...)
	}
	u.xref[n] = r
	loadedEntries[n] = struct{}{}
}
// loadXrefStream loads a cross-reference stream at the lexer's position
// into the cross-reference table, and returns the stream's dictionary
// as a Dict object, to be used in place of a trailer dictionary.
//
// stack holds what the caller has already read from the lexer.
// Object numbers present in loadedEntries are left untouched.
//
// Field widths in W are limited to the size of a uint, since wider fields
// could not be represented, and would overflow silently.
func (u *Updater) loadXrefStream(
	lex *Lexer, stack []Object, loadedEntries map[uint]struct{}) (
	Object, error) {
	var object Object
	for {
		var err error
		if object, err = u.parse(lex, &stack); err != nil {
			return New(End), fmt.Errorf("invalid xref table: %w", err)
		} else if object.Kind == End {
			return newError("invalid xref table")
		}

		// For the sake of simplicity, keep stacking until we find an object.
		if object.Kind == Indirect {
			break
		}
		stack = append(stack, object)
	}

	// ISO 32000-2:2020 7.5.8.2 Cross-reference stream dictionary
	stream := object.Array[0]
	if stream.Kind != Stream {
		return newError("invalid xref table")
	}
	if typ, ok := stream.Dict["Type"]; !ok ||
		typ.Kind != Name || typ.String != "XRef" {
		return newError("invalid xref stream")
	}

	data, err := u.GetStreamData(stream)
	if err != nil {
		return New(End), fmt.Errorf("invalid xref stream: %w", err)
	}

	size, ok := stream.Dict["Size"]
	if !ok || !size.IsUint() || size.Number <= 0 {
		return newError("invalid or missing cross-reference stream Size")
	}

	// Subsections default to a single one, covering the whole table.
	type pair struct{ start, count uint }
	pairs := []pair{}
	if index, ok := stream.Dict["Index"]; !ok {
		pairs = append(pairs, pair{0, uint(size.Number)})
	} else {
		if index.Kind != Array || len(index.Array)%2 != 0 {
			return newError("invalid cross-reference stream Index")
		}

		a := index.Array
		for i := 0; i < len(a); i += 2 {
			if !a[i].IsUint() || !a[i+1].IsUint() {
				return newError("invalid cross-reference stream Index")
			}
			pairs = append(pairs, pair{uint(a[i].Number), uint(a[i+1].Number)})
		}
	}

	w, ok := stream.Dict["W"]
	if !ok || w.Kind != Array || len(w.Array) != 3 ||
		!w.Array[0].IsUint() || !w.Array[1].IsUint() || !w.Array[2].IsUint() {
		return newError("invalid or missing cross-reference stream W")
	}

	// The number of bytes in a uint, i.e., the widest field readField
	// can accumulate without losing the most significant bytes.
	const uintBytes = (32 << (^uint(0) >> 63)) / 8

	w1 := uint(w.Array[0].Number)
	w2 := uint(w.Array[1].Number)
	w3 := uint(w.Array[2].Number)
	if w2 == 0 || w1 > uintBytes || w2 > uintBytes || w3 > uintBytes {
		return newError("invalid cross-reference stream W")
	}

	// With the widths bounded, this neither overflows nor comes out as zero.
	unit := w1 + w2 + w3
	if uint(len(data))%unit != 0 {
		return newError("invalid cross-reference stream length")
	}

	// readField decodes a big-endian integer of the given width,
	// and returns it along with the remaining data.
	readField := func(data []byte, width uint) (uint, []byte) {
		var n uint
		for ; width != 0; width-- {
			n = n<<8 | uint(data[0])
			data = data[1:]
		}
		return n, data
	}

	// ISO 32000-2:2020 7.5.8.3 Cross-reference stream data
	for _, pair := range pairs {
		for i := uint(0); i < pair.count; i++ {
			if uint(len(data)) < unit {
				return newError("premature cross-reference stream EOF")
			}

			// A missing type field means an in-use, uncompressed object.
			var f1, f2, f3 uint = 1, 0, 0
			if w1 > 0 {
				f1, data = readField(data, w1)
			}
			f2, data = readField(data, w2)
			if w3 > 0 {
				f3, data = readField(data, w3)
			}

			var r ref
			switch f1 {
			case 0:
				r.offset = int64(f2)
				r.generation = f3
			case 1:
				r.offset = int64(f2)
				r.generation = f3
				r.nonfree = true
			case 2:
				// f2 is the object stream's number, f3 the index within it.
				r.offset = int64(f3)
				r.compressed = &f2
				r.nonfree = true
			default:
				// TODO(p): It should be treated as a reference to
				// the null object. We can't currently represent that.
				return newError("unsupported cross-reference stream contents")
			}
			u.loadXrefEntry(pair.start+i, r, loadedEntries)
		}
	}

	stream.Kind = Dict
	stream.Stream = nil
	return stream, nil
}
// loadXref loads the cross-reference section at the lexer's position into
// the cross-reference table, and returns the trailer dictionary
// that follows it. If the section doesn't start with the "xref" keyword,
// it is handed over to loadXrefStream instead.
//
// Object numbers present in loadedEntries are left untouched.
func (u *Updater) loadXref(lex *Lexer, loadedEntries map[uint]struct{}) (
	Object, error) {
	var scratch []Object
	first, _ := u.parse(lex, &scratch)
	if first.Kind != Keyword || first.String != "xref" {
		return u.loadXrefStream(lex, []Object{first}, loadedEntries)
	}

	for {
		head, _ := u.parse(lex, &scratch)
		if head.Kind == End {
			return newError("unexpected EOF while looking for the trailer")
		}
		if head.Kind == Keyword && head.String == "trailer" {
			break
		}

		// Each subsection starts with its first object number and length.
		length, _ := u.parse(lex, &scratch)
		if !head.IsUint() || !length.IsUint() {
			return newError("invalid xref section header")
		}

		start, count := uint(head.Number), uint(length.Number)
		for i := uint(0); i < count; i++ {
			off, _ := u.parse(lex, &scratch)
			gen, _ := u.parse(lex, &scratch)
			key, _ := u.parse(lex, &scratch)

			validOff := off.IsInteger() && off.Number >= 0 &&
				off.Number <= float64(len(u.Document))
			validGen := gen.IsInteger() && gen.Number >= 0 &&
				gen.Number <= 65535
			if !validOff || !validGen || key.Kind != Keyword {
				return newError("invalid xref entry")
			}

			var inUse bool
			switch key.String {
			case "n":
				inUse = true
			case "f":
				inUse = false
			default:
				return newError("invalid xref entry")
			}

			u.loadXrefEntry(start+i, ref{
				offset:     int64(off.Number),
				generation: uint(gen.Number),
				nonfree:    inUse,
			}, loadedEntries)
		}
	}

	trailer, _ := u.parse(lex, &scratch)
	if trailer.Kind != Dict {
		return newError("invalid trailer dictionary")
	}
	return trailer, nil
}
// -----------------------------------------------------------------------------
// trailerRE finds the last startxref offset preceding an end-of-file marker.
var trailerRE = regexp.MustCompile(`(?s:.*)\sstartxref\s+(\d+)\s+%%EOF`)

// NewUpdater initializes an Updater, building the cross-reference table and
// preparing a new trailer dictionary.
//
// All cross-reference sections are loaded, following the chain of Prev
// offsets from the most recent one. The new trailer is based on the most
// recent one, with Prev pointing at the most recent section.
//
// An error is returned if the startxref offset cannot be found, if any
// offset in the chain lies outside of the document or repeats, or if
// a cross-reference section or the trailer's Size are invalid.
func NewUpdater(document []byte) (*Updater, error) {
	u := &Updater{Document: document}
	u.updated = make(map[uint]struct{})

	// We only need to look for startxref roughly within
	// the last kibibyte of the document.
	haystack := u.Document
	if len(haystack) > 1024 {
		haystack = haystack[len(haystack)-1024:]
	}

	m := trailerRE.FindSubmatch(haystack)
	if m == nil {
		return nil, errors.New("cannot find startxref")
	}

	// The match only consists of digits. Should the number be out of range,
	// the result is the maximum int64, which fails the bounds check below.
	xrefOffset, _ := strconv.ParseInt(string(m[1]), 10, 64)
	lastXrefOffset := xrefOffset
	loadedXrefs := make(map[int64]struct{})
	loadedEntries := make(map[uint]struct{})

	for {
		if _, ok := loadedXrefs[xrefOffset]; ok {
			return nil, errors.New("circular xref offsets")
		}
		// Prev may well be negative, which must not reach the slicing.
		if xrefOffset < 0 || xrefOffset >= int64(len(u.Document)) {
			return nil, errors.New("invalid xref offset")
		}

		lex := Lexer{u.Document[xrefOffset:]}
		trailer, err := u.loadXref(&lex, loadedEntries)
		if err != nil {
			return nil, err
		}

		// The first section to be loaded is the most recent one.
		if len(loadedXrefs) == 0 {
			u.Trailer = trailer.Dict
		}
		loadedXrefs[xrefOffset] = struct{}{}

		// TODO(p): Descend into XRefStm here first, if present,
		// which is also a linked list.

		// We allow for mixed cross-reference tables and streams
		// within a single Prev list, although this should never occur.
		prevOffset, ok := trailer.Dict["Prev"]
		if !ok {
			break
		}
		// FIXME: Do not read offsets and sizes as floating point numbers.
		if !prevOffset.IsInteger() {
			return nil, errors.New("invalid Prev offset")
		}
		xrefOffset = int64(prevOffset.Number)
	}

	u.Trailer["Prev"] = NewNumeric(float64(lastXrefOffset))

	lastSize, ok := u.Trailer["Size"]
	if !ok || !lastSize.IsInteger() || lastSize.Number <= 0 {
		return nil, errors.New("invalid or missing cross-reference table Size")
	}
	u.xrefSize = uint(lastSize.Number)
	return u, nil
}
// versionRE matches the PDF header comment, capturing the major
// and minor version digits.
var versionRE = regexp.MustCompile(
	`(?:^|[\r\n])%(?:!PS-Adobe-\d\.\d )?PDF-(\d)\.(\d)[\r\n]`)

// Version extracts the claimed PDF version as a positive decimal number,
// e.g. 17 for PDF 1.7. Returns zero on failure.
//
// A well-formed Version name in the given document catalog takes
// precedence over the header comment at the beginning of the document.
func (u *Updater) Version(root *Object) int {
	isDigit := func(c byte) bool { return c >= '0' && c <= '9' }

	if version, ok := root.Dict["Version"]; ok && version.Kind == Name {
		v := version.String
		if len(v) == 3 && v[1] == '.' && isDigit(v[0]) && isDigit(v[2]) {
			return int(v[0]-'0')*10 + int(v[2]-'0')
		}
	}

	// We only need to look for the comment roughly within
	// the first kibibyte of the document.
	head := u.Document
	if len(head) > 1024 {
		head = head[:1024]
	}

	m := versionRE.FindSubmatch(head)
	if m == nil {
		return 0
	}
	return int(m[1][0]-'0')*10 + int(m[2][0]-'0')
}
// getFromObjStm retrieves object number n from within the object stream
// with object number nObjStm.
//
// The stream starts with N pairs of integers, giving the number of each
// contained object and its offset relative to First, which is where
// the objects themselves begin. Offsets must be strictly increasing.
// Object streams extending other object streams are not supported.
func (u *Updater) getFromObjStm(nObjStm, n uint) (Object, error) {
	if n == nObjStm {
		return newError("ObjStm recursion")
	}

	stream, err := u.Get(nObjStm, 0)
	if err != nil {
		return stream, err
	}
	if stream.Kind != Stream {
		return newError("invalid ObjStm")
	}
	typ, hasType := stream.Dict["Type"]
	if !hasType || typ.Kind != Name || typ.String != "ObjStm" {
		return newError("invalid ObjStm")
	}

	data, err := u.GetStreamData(stream)
	if err != nil {
		return New(End), fmt.Errorf("invalid ObjStm: %s", err)
	}

	entryN, hasN := stream.Dict["N"]
	if !hasN || !entryN.IsUint() || entryN.Number <= 0 {
		return newError("invalid ObjStm N")
	}
	entryFirst, hasFirst := stream.Dict["First"]
	if !hasFirst || !entryFirst.IsUint() || entryFirst.Number <= 0 {
		return newError("invalid ObjStm First")
	}

	// NOTE: This means descending into that stream if n is not found here.
	// It is meant to be an object reference.
	extends, hasExtends := stream.Dict["Extends"]
	if hasExtends && extends.Kind != Nil {
		return newError("ObjStm extensions are unsupported")
	}

	count, first := uint(entryN.Number), uint(entryFirst.Number)
	if first > uint(len(data)) {
		return newError("invalid ObjStm First")
	}

	// Split the data into the table of contents and the objects.
	header := Lexer{data[:first]}
	data = data[first:]

	type pair struct{ n, offset uint }
	pairs := []pair{}
	for i := uint(0); i < count; i++ {
		var throwawayStack []Object
		objN, _ := u.parse(&header, &throwawayStack)
		objOffset, _ := u.parse(&header, &throwawayStack)
		if !objN.IsUint() || !objOffset.IsUint() {
			return newError("invalid ObjStm pairs")
		}
		pairs = append(pairs, pair{uint(objN.Number), uint(objOffset.Number)})
	}

	// Validate all offsets before any of them get used for slicing.
	for i := range pairs {
		outside := pairs[i].offset > uint(len(data))
		unordered := i > 0 && pairs[i-1].offset >= pairs[i].offset
		if outside || unordered {
			return newError("invalid ObjStm pairs")
		}
	}

	for i := range pairs {
		if pairs[i].n != n {
			continue
		}

		// The object extends up to the following one, or to the very end.
		end := uint(len(data))
		if i+1 < len(pairs) {
			end = pairs[i+1].offset
		}

		body := Lexer{data[pairs[i].offset:end]}
		var objects []Object
		for {
			object, err := u.parse(&body, &objects)
			if err != nil {
				return object, err
			}
			if object.Kind == End {
				break
			}
			objects = append(objects, object)
		}
		if len(objects) == 0 {
			return newError("empty ObjStm object")
		}
		return objects[0], nil
	}
	return newError("object not found in ObjStm")
}
// Get retrieves an object by its number and generation--may return
// Nil or End with an error.
//
// Nil is returned without an error for anything that doesn't resolve
// to an object: numbers beyond the cross-reference table, free entries,
// mismatching generations, and offsets outside of the document.
// Objects stored in object streams are retrieved from there.
func (u *Updater) Get(n, generation uint) (Object, error) {
	// The Size claimed by the trailer may exceed what has actually been
	// loaded into the table, so both need to be checked before indexing.
	if n >= u.xrefSize || n >= uint(len(u.xref)) {
		return New(Nil), nil
	}

	ref := u.xref[n]
	if !ref.nonfree || ref.generation != generation {
		return New(Nil), nil
	}

	if ref.compressed != nil {
		return u.getFromObjStm(*ref.compressed, n)
	}
	// Offsets read from cross-reference streams may come out negative
	// once converted to int64, which must not reach the slicing.
	if ref.offset < 0 || ref.offset >= int64(len(u.Document)) {
		return New(Nil), nil
	}

	lex := Lexer{u.Document[ref.offset:]}
	var stack []Object
	for {
		object, err := u.parse(&lex, &stack)
		if object.Kind == End {
			return object, err
		}
		if object.Kind != Indirect {
			// Keep the object ID pair around for the "obj" keyword.
			stack = append(stack, object)
		} else if object.N != n || object.Generation != generation {
			return newError("object mismatch")
		} else {
			return object.Array[0], nil
		}
	}
}
// Dereference resolves Reference objects through Get,
// and passes the other kinds through unchanged.
func (u *Updater) Dereference(o Object) (Object, error) {
	if o.Kind == Reference {
		return u.Get(o.N, o.Generation)
	}
	return o, nil
}
// Allocate allocates a new object number, which is the current size
// of the cross-reference table, and extends the table to cover it.
// It panics if the table size overflows.
func (u *Updater) Allocate() uint {
	allocated := u.xrefSize
	if u.xrefSize++; u.xrefSize == 0 {
		panic("overflow")
	}
	if have := uint(len(u.xref)); have < u.xrefSize {
		u.xref = append(u.xref, make([]ref, u.xrefSize-have)...)
	}

	// We don't make sure it gets a subsection in the update yet because we
	// make no attempts at fixing the linked list of free items either.
	return allocated
}
// BytesWriter is an interface over a subset of bytes.Buffer methods.
//
// It is what Updater.Update hands to its fill callback: the Write methods
// append to the document, while Bytes and Len give access to everything
// written so far, including the document preceding the object.
type BytesWriter interface {
Bytes() []byte
Len() int
Write(p []byte) (n int, err error)
WriteByte(c byte) error
WriteRune(r rune) (n int, err error)
WriteString(s string) (n int, err error)
}
// Update appends an updated object to the end of the document.
// The fill callback must write exactly one PDF object.
//
// The object keeps its generation number, its cross-reference entry is
// pointed at the new copy, and n is remembered for FlushUpdates.
// n must lie within the cross-reference table, e.g. come from Allocate.
func (u *Updater) Update(n uint, fill func(buf BytesWriter)) {
	generation := u.xref[n].generation
	u.updated[n] = struct{}{}

	// The object starts right after the line feed that precedes it.
	start := int64(len(u.Document) + 1)
	u.xref[n] = ref{offset: start, generation: generation, nonfree: true}

	buf := bytes.NewBuffer(u.Document)
	buf.WriteString(fmt.Sprintf("\n%d %d obj\n", n, generation))

	// Separately so that the callback can use w.Len() to get current offset.
	fill(buf)

	buf.WriteString("\nendobj")
	u.Document = buf.Bytes()
}
// flushXRefStm appends a cross-reference stream object to buf, covering
// the given sorted object numbers as well as the stream itself,
// for which a new object number is allocated.
//
// The trailer is turned into the stream's dictionary: Size, Index and W are
// set, and entries describing stream filters or external data are removed,
// as the stream data is written uncompressed. Every entry consists
// of a one-byte type and two eight-byte big-endian fields.
func (u *Updater) flushXRefStm(updated []uint, buf *bytes.Buffer) {
	// The cross-reference stream has to point to itself.
	// XXX: We only duplicate Update code here due to how we currently buffer.
	n := u.Allocate()
	updated = append(updated, n)
	u.updated[n] = struct{}{}
	u.xref[n] = ref{
		offset:     int64(buf.Len() + 1),
		generation: 0,
		nonfree:    true,
	}

	index, data := []Object{}, []byte{}
	appendEntry := func(kind byte, second, third uint64) {
		data = append(data, kind)
		data = binary.BigEndian.AppendUint64(data, second)
		data = binary.BigEndian.AppendUint64(data, third)
	}

	for i := 0; i < len(updated); {
		// Find a run of consecutive object numbers [first, next).
		first := updated[i]
		next := first + 1
		for i++; i < len(updated) && updated[i] == next; i++ {
			next++
		}

		index = append(index,
			NewNumeric(float64(first)), NewNumeric(float64(next-first)))
		for m := first; m < next; m++ {
			entry := u.xref[m]
			switch {
			case entry.compressed != nil:
				appendEntry(2, uint64(*entry.compressed), uint64(entry.offset))
			case entry.nonfree:
				appendEntry(1, uint64(entry.offset), uint64(entry.generation))
			default:
				appendEntry(0, uint64(entry.offset), uint64(entry.generation))
			}
		}
	}

	u.Trailer["Size"] = NewNumeric(float64(u.xrefSize))
	u.Trailer["Index"] = NewArray(index)
	u.Trailer["W"] = NewArray([]Object{
		NewNumeric(1), NewNumeric(8), NewNumeric(8),
	})

	for _, key := range []string{
		"Filter", "DecodeParms", "F", "FFilter", "FDecodeParms", "DL"} {
		delete(u.Trailer, key)
	}

	stream := NewStream(u.Trailer, data)
	serialized := stream.Serialize()
	fmt.Fprintf(buf, "\n%d 0 obj\n%s\nendobj", n, serialized)
}
// flushXRefTable appends a classic cross-reference table to buf, covering
// the given sorted object numbers, followed by the trailer dictionary
// with an updated Size.
//
// Consecutive object numbers are grouped into subsections. Entries that
// are free, or that reside in object streams, are written out as free.
func (u *Updater) flushXRefTable(updated []uint, buf *bytes.Buffer) {
	buf.WriteString("\nxref\n")
	for i := 0; i < len(updated); {
		// Find a run of consecutive object numbers [first, next).
		first := updated[i]
		next := first + 1
		for i++; i < len(updated) && updated[i] == next; i++ {
			next++
		}

		fmt.Fprintf(buf, "%d %d\n", first, next-first)
		for m := first; m < next; m++ {
			// XXX: We should warn about any object streams here.
			entry := u.xref[m]
			format := "%010d %05d f \n"
			if entry.nonfree && entry.compressed == nil {
				format = "%010d %05d n \n"
			}
			fmt.Fprintf(buf, format, entry.offset, entry.generation)
		}
	}

	// Taking literally "Each cross-reference section begins with a line
	// containing the keyword xref. Following this line are one or more
	// cross-reference subsections." from 3.4.3 in PDF Reference.
	if len(updated) == 0 {
		fmt.Fprintf(buf, "%d %d\n", 0, 0)
	}

	u.Trailer["Size"] = NewNumeric(float64(u.xrefSize))
	trailer := NewDict(u.Trailer)
	buf.WriteString("trailer\n" + trailer.Serialize())
}
// FlushUpdates writes an updated cross-reference table and trailer, or stream.
//
// This finishes an incremental update: all objects updated since
// the previous flush are covered, the startxref offset and the end-of-file
// marker are appended to the document, and the trailer's Prev is pointed
// at the new cross-reference section, so that more updates may follow.
func (u *Updater) FlushUpdates() {
	updated := make([]uint, 0, len(u.updated))
	for n := range u.updated {
		updated = append(updated, n)
	}
	sort.Slice(updated, func(a, b int) bool {
		return updated[a] < updated[b]
	})

	// It does not seem to be possible to upgrade a PDF file
	// from trailer dictionaries to cross-reference streams,
	// so keep continuity either way.
	//
	// (Downgrading from cross-reference streams using XRefStm would not
	// create a true hybrid-reference file, although it should work.)
	typ := u.Trailer["Type"]
	usesStream := typ.Kind == Name && typ.String == "XRef"

	buf := bytes.NewBuffer(u.Document)
	startXref := buf.Len() + 1 /* '\n' */
	if usesStream {
		u.flushXRefStm(updated, buf)
	} else {
		u.flushXRefTable(updated, buf)
	}
	fmt.Fprintf(buf, "\nstartxref\n%d\n%%%%EOF\n", startXref)

	u.Document = buf.Bytes()
	u.Trailer["Prev"] = NewNumeric(float64(startXref))
	u.updated = make(map[uint]struct{})
}
// -----------------------------------------------------------------------------
// NewDate makes a PDF object representing the given point in time:
// a String in the D:YYYYMMDDHHmmSS form, followed by either Z when
// the time's zone has no offset from UTC, or by the offset as +HH'mm'
// or -HH'mm'.
func NewDate(ts time.Time) Object {
	stamp := ts.Format("D:20060102150405")
	if _, offset := ts.Zone(); offset == 0 {
		return NewString(stamp + "Z")
	}

	// "Z07'00'" doesn't work, we need to do some of it manually.
	zone := ts.Format("-0700")
	return NewString(stamp + zone[:3] + "'" + zone[3:5] + "'")
}
// GetStreamData returns the actual data stored in a stream object,
// applying any filters.
//
// Only unfiltered streams and streams with a single FlateDecode filter
// without decode parameters are supported. Streams with data stored
// in external files are rejected. For unfiltered streams, the result
// is the stream's own data rather than a copy.
func (u *Updater) GetStreamData(stream Object) ([]byte, error) {
	if f, ok := stream.Dict["F"]; ok && f.Kind != Nil {
		return nil, errors.New("stream data in other files are unsupported")
	}

	// Support just enough to decode a common cross-reference stream.
	if filter, ok := stream.Dict["Filter"]; !ok {
		return stream.Stream, nil
	} else if filter.Kind != Name || filter.String != "FlateDecode" {
		return nil, errors.New("unsupported stream Filter")
	}

	// TODO(p): Support << /Columns N /Predictor 12 >>
	// which usually appears in files with cross-reference streams.
	if parms, ok := stream.Dict["DecodeParms"]; ok && parms.Kind != Nil {
		return nil, errors.New("DecodeParms are not supported")
	}

	r, err := zlib.NewReader(bytes.NewReader(stream.Stream))
	if err != nil {
		return nil, err
	}
	// The caller of zlib.NewReader is responsible for closing the reader.
	// Any decoding failure has already been reported by reading from it.
	defer r.Close()

	var b bytes.Buffer
	_, err = b.ReadFrom(r)
	return b.Bytes(), err
}
// GetFirstPage retrieves the first page of the given page (sub)tree reference,
// or returns a Nil object if unsuccessful.
//
// The tree is descended through the first item of each Kids array,
// until a Page dictionary is found. The returned object has its N
// and Generation set from the reference that led to it.
//
// Page trees that loop back to a node that has already been visited
// result in a Nil object, rather than in endless descent.
func (u *Updater) GetFirstPage(node Object) Object {
	type objectID struct{ n, generation uint }
	visited := make(map[objectID]struct{})

	for {
		if node.Kind == Reference {
			id := objectID{node.N, node.Generation}
			if _, seen := visited[id]; seen {
				return New(Nil)
			}
			visited[id] = struct{}{}
		}

		obj, err := u.Dereference(node)
		if err != nil || obj.Kind != Dict {
			return New(Nil)
		}

		// Out of convenience; these aren't filled normally.
		obj.N = node.N
		obj.Generation = node.Generation

		typ, ok := obj.Dict["Type"]
		if !ok || typ.Kind != Name {
			return New(Nil)
		}
		switch typ.String {
		case "Page":
			return obj
		case "Pages":
			// An intermediate node, descend further.
		default:
			return New(Nil)
		}

		// XXX: Technically speaking, this may be an indirect reference.
		// The correct way to solve this seems to be having Updater include
		// a wrapper around "obj.Dict". Though does it still apply in Golang?
		kids, ok := obj.Dict["Kids"]
		if !ok || kids.Kind != Array || len(kids.Array) == 0 ||
			kids.Array[0].Kind != Reference {
			return New(Nil)
		}
		node = kids.Array[0]
	}
}
// -----------------------------------------------------------------------------
// PKCS12Parse parses and verifies PKCS#12 data.
//
// On success it returns the private key, and all certificates found in
// the archive in their original order. The first certificate is checked
// to match the private key.
//
// An error is returned when the archive cannot be decoded with the given
// password, when it doesn't contain exactly one private key and at least one
// certificate, when the key is neither a PKCS#1 RSA nor an EC private key,
// when any certificate fails to parse, or when the key doesn't belong
// to the first certificate.
func PKCS12Parse(p12 []byte, password string) (
	crypto.PrivateKey, []*x509.Certificate, error) {
	// The pkcs12.Decode function doesn't support included intermediate
	// certificates, we need to do some processing manually.
	blocks, err := pkcs12.ToPEM(p12, password)
	if err != nil {
		return nil, nil, err
	}

	// b.Type is literally CERTIFICATE or PRIVATE KEY, the Headers only contain
	// a localKeyId field. It seems like the pkey and the cert share the same
	// localKeyId value. Though the leaf certificate should also be the first
	// one in the PKCS#12 file, so I probably don't need that value.
	var allX509Blocks [][]byte
	var allCertBlocks [][]byte
	for _, b := range blocks {
		// CERTIFICATE, PRIVATE KEY constants are defined locally in the pkcs12
		// package. crypto/tls/tls.go seems to only use literals for these and
		// also accepts words in front such as RSA PRIVATE KEY.
		switch b.Type {
		case "PRIVATE KEY":
			allX509Blocks = append(allX509Blocks, b.Bytes)
		case "CERTIFICATE":
			allCertBlocks = append(allCertBlocks, b.Bytes)
		}
	}
	switch {
	case len(allX509Blocks) == 0:
		return nil, nil, errors.New("missing private key")
	case len(allX509Blocks) > 1:
		return nil, nil, errors.New("more than one private key")
	case len(allCertBlocks) == 0:
		return nil, nil, errors.New("missing certificate")
	}

	// The PKCS#12 file may only contain PKCS#8-wrapped private keys but the
	// pkcs12 package unwraps them to simple PKCS#1/EC while converting to PEM.
	//
	// Try RSA first, then EC, and only give up when both have failed.
	// The concrete results are assigned to the interface only on success,
	// so that it never ends up holding a typed nil pointer.
	var key crypto.PrivateKey
	if rsaKey, err := x509.ParsePKCS1PrivateKey(allX509Blocks[0]); err == nil {
		key = rsaKey
	} else if ecKey, err := x509.ParseECPrivateKey(allX509Blocks[0]); err == nil {
		key = ecKey
	} else {
		return nil, nil, errors.New("failed to parse private key")
	}

	x509Certs, err := x509.ParseCertificates(allCertBlocks[0])
	if err != nil {
		return nil, nil, err
	}
	if len(x509Certs) != 1 {
		return nil, nil,
			errors.New("expected exactly one certificate in the first bag")
	}
	for _, cert := range allCertBlocks[1:] {
		toAdd, err := x509.ParseCertificates(cert)
		if err != nil {
			return nil, nil, err
		}
		x509Certs = append(x509Certs, toAdd...)
	}

	// Copied from crypto/tls/tls.go.
	switch pub := x509Certs[0].PublicKey.(type) {
	case *rsa.PublicKey:
		priv, ok := key.(*rsa.PrivateKey)
		if !ok {
			return nil, nil,
				errors.New("private key type does not match public key type")
		}
		if pub.N.Cmp(priv.N) != 0 {
			return nil, nil,
				errors.New("private key does not match public key")
		}
	case *ecdsa.PublicKey:
		priv, ok := key.(*ecdsa.PrivateKey)
		if !ok {
			return nil, nil,
				errors.New("private key type does not match public key type")
		}
		if pub.X.Cmp(priv.X) != 0 || pub.Y.Cmp(priv.Y) != 0 {
			return nil, nil,
				errors.New("private key does not match public key")
		}
	default:
		return nil, nil, errors.New("unknown public key algorithm")
	}
	return key, x509Certs, nil
}
// FillInSignature signs PDF contents and writes the signature into the given
// window that has been reserved for this specific purpose.
// This is a very low-level function.
//
// The window is the signOff/signLen byte range of document, and includes
// the hexstring delimiters. Everything outside of it gets signed,
// and the hex-encoded signature is then written into document in place,
// just past the opening delimiter.
//
// The first of certs is the signing certificate, which must be usable for
// signing (see the key usage checks below); the remaining ones are added
// to the signature as its chain.
//
// An error is returned for an invalid window, missing or unsuitable
// certificates, signing failures, and when the signature doesn't fit
// in the reserved window.
func FillInSignature(document []byte, signOff, signLen int,
	key crypto.PrivateKey, certs []*x509.Certificate) error {
	// The last comparison is arranged so as to not overflow
	// with absurdly large signLen values.
	if signOff < 0 || signOff > len(document) ||
		signLen < 2 || signLen > len(document)-signOff {
		return errors.New("invalid signing window")
	}

	pkcsError := func(message interface{}) error {
		return fmt.Errorf("key/cert: %s", message)
	}

	// Don't crash on callers that haven't passed any certificates at all.
	if len(certs) == 0 {
		return pkcsError("at least one certificate is required")
	}

	// Prevent useless signatures--makes pdfsig from poppler happy at least
	// (and NSS by extension).
	x509Cert := certs[0]
	if x509Cert.KeyUsage&(x509.KeyUsageDigitalSignature|
		x509.KeyUsageContentCommitment /* renamed non-repudiation */) == 0 {
		return pkcsError("the certificate's key usage must include " +
			"digital signatures or non-repudiation")
	}

	// Certificates without any extended key usage are let through.
	extOK := false
	for _, u := range x509Cert.ExtKeyUsage {
		if u == x509.ExtKeyUsageAny || u == x509.ExtKeyUsageEmailProtection {
			extOK = true
		}
	}
	if len(x509Cert.ExtKeyUsage) > 0 && !extOK {
		return pkcsError("the certificate's extended key usage " +
			"must include S/MIME")
	}

	// XXX: We'd like to stream to the hash manually instead of copying data.
	data := make([]byte, len(document)-signLen)
	copy(data, document[:signOff])
	copy(data[signOff:], document[signOff+signLen:])

	signedData, err := pkcs7.NewSignedData(data)
	if err != nil {
		return err
	}
	// The default digest is SHA1, which is mildly insecure now.
	signedData.SetDigestAlgorithm(pkcs7.OIDDigestAlgorithmSHA256)
	if err := signedData.AddSignerChain(
		x509Cert, key, certs[1:], pkcs7.SignerInfoConfig{}); err != nil {
		return err
	}

	signedData.Detach()
	sig, err := signedData.Finish()
	if err != nil {
		return err
	}

	/*
		Debugging: ioutil.WriteFile("pdf_signature.der", sig, 0666)
		openssl cms -inform PEM -in pdf_signature.pem -noout -cmsout -print
		Context: https://stackoverflow.com/a/29253469
	*/

	if len(sig)*2 > signLen-2 /* hexstring quotes */ {
		// The obvious solution is to increase the allocation... or spend
		// a week reading specifications while losing all faith in humanity
		// as a species, and skip the pkcs7 package entirely.
		return fmt.Errorf("not enough space reserved for the signature "+
			"(%d nibbles vs %d nibbles)", signLen-2, len(sig)*2)
	}

	// Skip the opening hexstring delimiter. Whatever follows the signature
	// within the window is left the way the caller has prepared it.
	hex.Encode(document[signOff+1:], sig)
	return nil
}
// https://www.adobe.com/devnet-docs/acrobatetk/tools/DigSig/Acrobat_DigitalSignatures_in_PDF.pdf
// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PPKAppearances.pdf
// Sign signs the given document, growing and returning the passed-in slice.
// There must be at least one certificate, matching the private key.
// The certificates must form a chain.
//
// A good default for the reservation is around 4096 (the value is in bytes).
// It must be positive, and large enough to hold the resulting signature.
//
// The presumption here is that the document is valid and that it doesn't
// employ cross-reference streams from PDF 1.5, or at least constitutes
// a hybrid-reference file. The results with PDF 2.0 (2017) are currently
// unknown as the standard costs money.
//
// The signature is attached as a hidden signature field to the first page.
// Documents that already contain an AcroForm are refused, as are those with
// Annots on the first page that aren't a direct array.
func Sign(document []byte, key crypto.PrivateKey, certs []*x509.Certificate,
	reservation int) ([]byte, error) {
	// Validate what would otherwise only crash later on,
	// after all the updates have already been put together.
	if len(certs) == 0 {
		return nil, errors.New("at least one certificate is required")
	}
	if reservation <= 0 {
		return nil, errors.New("the reservation must be positive")
	}

	pdf, err := NewUpdater(document)
	if err != nil {
		return nil, err
	}

	rootRef, ok := pdf.Trailer["Root"]
	if !ok || rootRef.Kind != Reference {
		return nil, errors.New("trailer does not contain a reference to Root")
	}
	root, err := pdf.Dereference(rootRef)
	if err != nil {
		return nil, fmt.Errorf("Root dictionary retrieval failed: %s", err)
	}
	if root.Kind != Dict {
		return nil, errors.New("invalid Root dictionary reference")
	}

	// 8.7 Digital Signatures - /signature dictionary/
	sigdictN := pdf.Allocate()
	var byterangeOff, byterangeLen, signOff, signLen int
	pdf.Update(sigdictN, func(buf BytesWriter) {
		// The timestamp is important for Adobe Acrobat Reader DC.
		// The ideal would be to use RFC 3161.
		now := NewDate(time.Now())
		buf.WriteString("<< /Type/Sig /Filter/Adobe.PPKLite" +
			" /SubFilter/adbe.pkcs7.detached\n" +
			" /M" + now.Serialize() + " /ByteRange ")

		// Both the byte range and the signature can only be filled in
		// once the final length is known, so reserve space for them
		// and remember where that space is.
		byterangeOff = buf.Len()
		byterangeLen = 32 // fine for a gigabyte
		buf.Write(bytes.Repeat([]byte{' '}, byterangeLen))
		buf.WriteString("\n /Contents <")

		signOff = buf.Len()
		signLen = reservation * 2 // cert, digest, encrypted digest, ...
		buf.Write(bytes.Repeat([]byte{'0'}, signLen))
		buf.WriteString("> >>")

		// We actually need to exclude the hexstring quotes from signing.
		signOff--
		signLen += 2
	})

	sigfield := NewDict(map[string]Object{
		// 8.6.3 Field Types - Signature Fields
		"FT": NewName("Sig"),
		"V":  NewReference(sigdictN, 0),
		// 8.4.5 Annotations Types - Widget Annotations
		// We can merge the Signature Annotation and omit Kids here.
		"Subtype": NewName("Widget"),
		"F":       NewNumeric(2 /* Hidden */),
		"T":       NewString("Signature1"),
		"Rect": NewArray([]Object{
			NewNumeric(0), NewNumeric(0), NewNumeric(0), NewNumeric(0),
		}),
	})

	sigfieldN := pdf.Allocate()
	pdf.Update(sigfieldN, func(buf BytesWriter) {
		buf.WriteString(sigfield.Serialize())
	})

	pagesRef, ok := root.Dict["Pages"]
	if !ok || pagesRef.Kind != Reference {
		return nil, errors.New("invalid Pages reference")
	}
	page := pdf.GetFirstPage(pagesRef)
	if page.Kind != Dict {
		return nil, errors.New("invalid or unsupported page tree")
	}

	annots := page.Dict["Annots"]
	if annots.Kind != Array {
		// TODO(p): Indirectly referenced arrays might not be
		// that hard to support.
		//
		// A missing map entry comes out as the zero Object, of kind End.
		if annots.Kind != End {
			return nil, errors.New("unexpected Annots")
		}
		annots = NewArray(nil)
	}
	annots.Array = append(annots.Array, NewReference(sigfieldN, 0))

	page.Dict["Annots"] = annots
	pdf.Update(page.N, func(buf BytesWriter) {
		buf.WriteString(page.Serialize())
	})

	// 8.6.1 Interactive Form Dictionary
	if acroform, ok := root.Dict["AcroForm"]; ok && acroform.Kind != Nil {
		return nil, errors.New("the document already contains forms, " +
			"they would be overwritten")
	}

	root.Dict["AcroForm"] = NewDict(map[string]Object{
		"Fields":   NewArray([]Object{NewReference(sigfieldN, 0)}),
		"SigFlags": NewNumeric(3 /* SignaturesExist | AppendOnly */),
	})

	// Upgrade the document version for SHA-256 etc.
	if pdf.Version(&root) < 16 {
		root.Dict["Version"] = NewName("1.6")
	}

	pdf.Update(rootRef.N, func(buf BytesWriter) {
		buf.WriteString(root.Serialize())
	})
	pdf.FlushUpdates()

	// Now that we know the length of everything, store byte ranges of
	// what we're about to sign, which must be everything but the resulting
	// signature itself.
	tailOff := signOff + signLen
	tailLen := len(pdf.Document) - tailOff

	ranges := fmt.Sprintf("[0 %d %d %d]", signOff, tailOff, tailLen)
	if len(ranges) > byterangeLen {
		return nil, errors.New("not enough space reserved for /ByteRange")
	}
	copy(pdf.Document[byterangeOff:], []byte(ranges))
	if err := FillInSignature(pdf.Document, signOff, signLen,
		key, certs); err != nil {
		return nil, err
	}
	return pdf.Document, nil
}