Compare commits

...

7 Commits

5 changed files with 233 additions and 26 deletions

View File

@ -1,4 +1,4 @@
Copyright (c) 2017 - 2020, Přemysl Eric Janouch <p@janouch.name>
Copyright (c) 2017 - 2021, Přemysl Eric Janouch <p@janouch.name>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.

View File

@ -29,6 +29,11 @@ In addition to the C++ version, also included is a native Go port:
$ go get janouch.name/pdf-simple-sign/cmd/pdf-simple-sign
There is also a crude external VFS for Midnight Commander, which may be used
to extract all streams from a given PDF file:
$ go get janouch.name/pdf-simple-sign/cmd/extfs-pdf
Contributing and Support
------------------------
Use https://git.janouch.name/p/pdf-simple-sign to report bugs, request features,

132
cmd/extfs-pdf/main.go Normal file
View File

@ -0,0 +1,132 @@
//
// Copyright (c) 2021, Přemysl Eric Janouch <p@janouch.name>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
// extfs-pdf is an external VFS plugin for Midnight Commander.
// More serious image extractors should rewrite this to use pdfimages(1).
package main
import (
"flag"
"fmt"
"os"
"janouch.name/pdf-simple-sign/pdf"
)
// die prints a formatted message followed by a newline to the standard error
// stream, and terminates the process with the given exit status.
func die(status int, format string, args ...interface{}) {
	fmt.Fprintf(os.Stderr, format+"\n", args...)
	os.Exit(status)
}
// usage reports the expected command line on the standard error stream,
// and exits with status 1 by means of die.
func usage() {
	program := os.Args[0]
	die(1, "Usage: %s [-h] COMMAND DOCUMENT [ARG...]", program)
}
// streamSuffix picks a filename suffix for the data of a stream object,
// based on the Filter entry of its dictionary.  Well-known image filters
// map to customary file extensions, any other filter name is used as is,
// and "stream" is the fallback when Filter is not a single name.
func streamSuffix(o *pdf.Object) string {
	filter := o.Dict["Filter"]
	if filter.Kind != pdf.Name {
		return "stream"
	}

	switch name := filter.String; name {
	case "DCTDecode":
		return "jpg"
	case "JPXDecode":
		return "jp2"
	case "JBIG2Decode":
		// This is the file extension used by pdfimages(1).
		// This is not a complete JBIG2 standalone file.
		return "jb2e"
	default:
		return name
	}
}
// list prints a directory listing of the document on the standard output,
// one line per file: every indirect object is listed as "N-G" with the size
// of its serialized form, and stream objects additionally get an "N-G.suffix"
// entry with the size of their stream data.  Objects that cannot be retrieved
// are reported on the standard error stream and listed with a zero size.
func list(updater *pdf.Updater) {
	for _, ref := range updater.ListIndirect() {
		var serializedSize int
		obj, err := updater.Get(ref.N, ref.Generation)
		if err == nil {
			// The object gets re-serialized rather than copied verbatim,
			// since retrieving the original data would be more work.
			serializedSize = len(obj.Serialize())
		} else {
			fmt.Fprintf(os.Stderr, "%s\n", err)
		}

		fmt.Printf("-r--r--r-- 1 0 0 %d 01-01-1970 00:00 %d-%d\n",
			serializedSize, ref.N, ref.Generation)
		if obj.Kind != pdf.Stream {
			continue
		}

		dataSize, suffix := len(obj.Stream), streamSuffix(&obj)
		fmt.Printf("-r--r--r-- 1 0 0 %d 01-01-1970 00:00 %d-%d.%s\n",
			dataSize, ref.N, ref.Generation, suffix)
	}
}
// copyout extracts a single file of the virtual filesystem to extractTo.
//
// storedFilename has the form "N-G" or "N-G.suffix", as produced by list:
// the former yields the serialized object, the latter the raw stream data.
// Any failure terminates the process with exit status 3.
func copyout(updater *pdf.Updater, storedFilename, extractTo string) {
	var (
		n, generation uint
		suffix        string
	)
	// The suffix is optional, so only the two numbers are required to scan;
	// the error reported for a missing suffix is deliberately ignored.
	m, err := fmt.Sscanf(storedFilename, "%d-%d%s", &n, &generation, &suffix)
	if m < 2 {
		die(3, "%s: %s", storedFilename, err)
	}

	object, err := updater.Get(n, generation)
	if err != nil {
		die(3, "%s: %s", storedFilename, err)
	}

	var content []byte
	if suffix == "" {
		content = []byte(object.Serialize())
	} else {
		// Only streams are listed with a suffix.  Refuse anything else,
		// rather than silently producing an empty file.
		if object.Kind != pdf.Stream {
			die(3, "%s: not a stream", storedFilename)
		}
		content = object.Stream
	}
	if err = os.WriteFile(extractTo, content, 0666); err != nil {
		die(3, "%s", err)
	}
}
// main parses the command line, loads the whole document into memory,
// and dispatches to the requested extfs command: "list" or "copyout".
//
// Exit status 1 signals usage and file reading errors, 2 signals a document
// that cannot be parsed.
func main() {
	flag.Usage = usage
	flag.Parse()
	if flag.NArg() < 2 {
		usage()
	}

	command, documentPath := flag.Arg(0), flag.Arg(1)
	data, err := os.ReadFile(documentPath)
	if err != nil {
		die(1, "%s", err)
	}

	updater, err := pdf.NewUpdater(data)
	if err != nil {
		die(2, "%s", err)
	}

	switch command {
	case "list":
		// list DOCUMENT
		if flag.NArg() == 2 {
			list(updater)
		} else {
			usage()
		}
	case "copyout":
		// copyout DOCUMENT STORED-FILENAME EXTRACT-TO
		if flag.NArg() == 4 {
			copyout(updater, flag.Arg(2), flag.Arg(3))
		} else {
			usage()
		}
	default:
		die(1, "unsupported command: %s", command)
	}
}

View File

@ -64,7 +64,7 @@ std::string ssprintf(const std::string& format, Args... args) {
// -------------------------------------------------------------------------------------------------
/// PDF token/object thingy. Objects may be composed either from one or a sequence of tokens.
/// The PDF Reference doesn't actually speak of tokens.
/// The PDF Reference doesn't actually speak of tokens, though ISO 32000-1:2008 does.
struct pdf_object {
enum type {
END, NL, COMMENT, NIL, BOOL, NUMERIC, KEYWORD, NAME, STRING,
@ -543,8 +543,8 @@ std::string pdf_updater::initialize() {
const auto prev_offset = trailer.dict.find("Prev");
if (prev_offset == trailer.dict.end())
break;
// FIXME we don't check for size_t over or underflow
if (!prev_offset->second.is_integer())
// FIXME do not read offsets and sizes as floating point numbers
if (!prev_offset->second.is_integer() || prev_offset->second.number < 0)
return "invalid Prev offset";
xref_offset = prev_offset->second.number;
}

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2018 - 2020, Přemysl Eric Janouch <p@janouch.name>
// Copyright (c) 2018 - 2021, Přemysl Eric Janouch <p@janouch.name>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted.
@ -59,20 +59,22 @@ const (
// higher-level objects
Array
Dict
Stream
Indirect
Reference
)
// Object is a PDF token/object thingy. Objects may be composed either from
// Object is a PDF token/object thingy. Objects may be composed either from
// one or a sequence of tokens. The PDF Reference doesn't actually speak
// of tokens.
// of tokens, though ISO 32000-1:2008 does.
type Object struct {
Kind ObjectKind
String string // Comment/Keyword/Name/String
Number float64 // Bool, Numeric
Array []Object // Array, Indirect
Dict map[string]Object // Dict, in the future also Stream
Dict map[string]Object // Dict, Stream
Stream []byte // Stream
N, Generation uint // Indirect, Reference
}
@ -458,6 +460,10 @@ func (o *Object) Serialize() string {
fmt.Fprint(b, " /", k, " ", v.Serialize())
}
return "<<" + b.String() + " >>"
case Stream:
d := NewDict(o.Dict)
d.Dict["Length"] = NewNumeric(float64(len(o.Stream)))
return d.Serialize() + "\nstream\n" + string(o.Stream) + "\nendstream"
case Indirect:
return fmt.Sprintf("%d %d obj\n%s\nendobj", o.N, o.Generation,
o.Array[0].Serialize())
@ -497,6 +503,65 @@ type Updater struct {
Trailer map[string]Object
}
// ListIndirect returns Reference Objects for all entries of the
// cross-reference table that are marked as non-free, in table order.
// The result is never nil.
func (u *Updater) ListIndirect() []Object {
	refs := []Object{}
	for n := range u.xref {
		entry := u.xref[n]
		if !entry.nonfree {
			continue
		}
		refs = append(refs, NewReference(uint(n), entry.generation))
	}
	return refs
}
// parseStream reads the data of a stream object, and is invoked once
// the "stream" keyword has been encountered.
//
// The stream dictionary is expected on top of the stack, and is popped from it.
// Its Length entry, dereferenced if necessary, tells how many bytes to take
// from the lexer input.  On success, the dictionary is returned converted to
// the Stream kind, with the data attached as found in the document--no filters
// are applied here.  Malformed streams are reported through newError.
func (u *Updater) parseStream(lex *Lexer, stack *[]Object) (Object, error) {
	lenStack := len(*stack)
	if lenStack < 1 {
		return newError("missing stream dictionary")
	}
	dict := (*stack)[lenStack-1]
	if dict.Kind != Dict {
		return newError("stream not preceded by a dictionary")
	}
	*stack = (*stack)[:lenStack-1]

	// Length may be an indirect reference, to be looked up elsewhere.
	length, ok := dict.Dict["Length"]
	if !ok {
		return newError("missing stream Length")
	}
	length, err := u.Dereference(length)
	if err != nil {
		return length, err
	}

	// math.MaxInt is not exactly representable as a float64 on 64-bit
	// platforms: the constant rounds up to 2^63, which no longer fits
	// in an int.  Reject that value as well, so that the conversion
	// further below can never overflow into a negative size.
	if !length.IsUint() || length.Number >= math.MaxInt {
		return newError("stream Length not an unsigned integer")
	}

	// Expect exactly one newline.
	if nl, err := lex.Next(); err != nil {
		return nl, err
	} else if nl.Kind != NL {
		return newError("stream does not start with a newline")
	}

	size := int(length.Number)
	if len(lex.P) < size {
		return newError("stream is longer than the document")
	}

	// The data is a slice of the lexer input, it does not get copied.
	dict.Kind = Stream
	dict.Stream = lex.P[:size]
	lex.P = lex.P[size:]

	// Skip any number of trailing newlines or comments.
	if end, err := u.parse(lex, stack); err != nil {
		return end, err
	} else if end.Kind != Keyword || end.String != "endstream" {
		return newError("improperly terminated stream")
	}
	return dict, nil
}
func (u *Updater) parseIndirect(lex *Lexer, stack *[]Object) (Object, error) {
lenStack := len(*stack)
if lenStack < 2 {
@ -590,15 +655,11 @@ func (u *Updater) parse(lex *Lexer, stack *[]Object) (Object, error) {
}
return NewDict(dict), nil
case Keyword:
// Appears in the document body, typically needs
// to access the cross-reference table.
//
// TODO(p): Use the xref to read /Length etc. once we
// actually need to read such objects; presumably
// streams can use the Object.String member.
switch token.String {
case "stream":
return newError("streams are not supported yet")
// Appears in the document body,
// typically needs to access the cross-reference table.
return u.parseStream(lex, stack)
case "obj":
return u.parseIndirect(lex, stack)
case "R":
@ -722,7 +783,7 @@ func NewUpdater(document []byte) (*Updater, error) {
if !ok {
break
}
// FIXME: We don't check for size_t over or underflow.
// FIXME: Do not read offsets and sizes as floating point numbers.
if !prevOffset.IsInteger() {
return nil, errors.New("invalid Prev offset")
}
@ -766,8 +827,6 @@ func (u *Updater) Version(root *Object) int {
// Get retrieves an object by its number and generation--may return
// Nil or End with an error.
//
// TODO(p): We should fix all uses of this not to eat the error.
func (u *Updater) Get(n, generation uint) (Object, error) {
if n >= u.xrefSize {
return New(Nil), nil
@ -796,6 +855,14 @@ func (u *Updater) Get(n, generation uint) (Object, error) {
}
}
// Dereference resolves Reference objects by looking them up with Get,
// and returns objects of any other kind unchanged, with a nil error.
func (u *Updater) Dereference(o Object) (Object, error) {
	if o.Kind == Reference {
		return u.Get(o.N, o.Generation)
	}
	return o, nil
}
// Allocate allocates a new object number.
func (u *Updater) Allocate() uint {
n := u.xrefSize
@ -906,15 +973,15 @@ func NewDate(ts time.Time) Object {
// GetFirstPage retrieves the first page of the given page (sub)tree reference,
// or returns a Nil object if unsuccessful.
func (u *Updater) GetFirstPage(nodeN, nodeGeneration uint) Object {
obj, _ := u.Get(nodeN, nodeGeneration)
if obj.Kind != Dict {
func (u *Updater) GetFirstPage(node Object) Object {
obj, err := u.Dereference(node)
if err != nil || obj.Kind != Dict {
return New(Nil)
}
// Out of convenience; these aren't filled normally.
obj.N = nodeN
obj.Generation = nodeGeneration
obj.N = node.N
obj.Generation = node.Generation
if typ, ok := obj.Dict["Type"]; !ok || typ.Kind != Name {
return New(Nil)
@ -934,7 +1001,7 @@ func (u *Updater) GetFirstPage(nodeN, nodeGeneration uint) Object {
}
// XXX: Nothing prevents us from recursing in an evil circular graph.
return u.GetFirstPage(kids.Array[0].N, kids.Array[0].Generation)
return u.GetFirstPage(kids.Array[0])
}
// -----------------------------------------------------------------------------
@ -1128,7 +1195,10 @@ func Sign(document []byte, key crypto.PrivateKey, certs []*x509.Certificate,
if !ok || rootRef.Kind != Reference {
return nil, errors.New("trailer does not contain a reference to Root")
}
root, _ := pdf.Get(rootRef.N, rootRef.Generation)
root, err := pdf.Dereference(rootRef)
if err != nil {
return nil, fmt.Errorf("Root dictionary retrieval failed: %s", err)
}
if root.Kind != Dict {
return nil, errors.New("invalid Root dictionary reference")
}
@ -1182,7 +1252,7 @@ func Sign(document []byte, key crypto.PrivateKey, certs []*x509.Certificate,
if !ok || pagesRef.Kind != Reference {
return nil, errors.New("invalid Pages reference")
}
page := pdf.GetFirstPage(pagesRef.N, pagesRef.Generation)
page := pdf.GetFirstPage(pagesRef)
if page.Kind != Dict {
return nil, errors.New("invalid or unsupported page tree")
}