hex/plugins/pdf.lua

--
-- pdf.lua: Portable Document Format
--
-- Based on PDF Reference, version 1.7
-- In practice almost useless, I just wanted to learn about the file format.
-- FIXME: it's also not very robust and doesn't support all documents.
--
-- Copyright (c) 2017, Přemysl Eric Janouch <p@janouch.name>
--
-- Permission to use, copy, modify, and/or distribute this software for any
-- purpose with or without fee is hereby granted.
--
-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-- SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-- OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-- CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--

-- Character sets used by the lexer; each is searched with strchr below
local oct_alphabet = "01234567"
local dec_alphabet = "0123456789"
local hex_alphabet = "0123456789abcdefABCDEF"
-- Characters skipped between tokens: NUL, tab, line feed, form feed,
-- carriage return and space
local whitespace = "\x00\t\n\f\r "
-- Characters that terminate a name or keyword without being part of it
local delimiters = "()<>[]{}/%"

-- Plain (pattern-free) search for "ch" within "s", returning whatever
-- string.find returns: the match boundaries, or nil when there is no match
local function strchr (s, ch)
	return string.find (s, ch, 1, true)
end

-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

-- Tokenizer prototype; instances look their methods up in this table
local Lexer = {}
Lexer.__index = Lexer

-- Create a lexer reading from "c", which is stored in the "c" field
function Lexer:new (c)
	local instance = { c = c }
	return setmetatable (instance, self)
end

-- Read one character from the underlying chunk; returns nil once the chunk
-- reports end of file
-- TODO: make it possible to follow a string, we should probably be able to
--   supply callbacks to the constructor, or a wrapper object;
--   this will be used for object streams
function Lexer:getc ()
	local chunk = self.c
	if chunk.eof then return nil end
	return chunk:read (1)
end

-- Step the chunk's read position back by one, undoing the last getc
function Lexer:ungetc ()
	local chunk = self.c
	chunk.position = chunk.position - 1
end

-- Build a token spanning from self.start up to the last character read.
-- When "description" is given, the same span is also marked with it.
function Lexer:token (type, value, description)
	local chunk, first = self.c, self.start
	if description then
		local span = chunk (first, chunk.position - 1)
		span:mark (description)
	end

	local result = { type=type, value=value }
	result.start = first
	result.stop = chunk.position - 1
	return result
end

-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

-- Given the character just read, tell whether it starts an end-of-line
-- sequence, consuming the LF of a CR LF pair in the process.
-- Returns true for a newline and nothing otherwise.
function Lexer:eat_newline (ch)
	if ch == '\n' then return true end
	if ch ~= '\r' then return end

	-- A lone CR is a complete newline, so anything but LF gets pushed back
	local following = self:getc ()
	if following and following ~= '\n' then self:ungetc () end
	return true
end

-- Lex a literal string; the opening '(' has already been consumed.
-- Handles balanced nested parentheses, backslash escapes (including octal
-- codes and line continuations) and normalizes raw end-of-line sequences
-- to a single LF.  A backslash in front of any other character is dropped.
-- Returns a 'string' token, or nil when the input ends prematurely.
function Lexer:string ()
	local parts, level = {}, 1
	while true do
		local ch = self:getc ()
		if not ch then return nil
		elseif ch == '\\' then
			ch = self:getc ()
			if not ch then return nil
			elseif ch == 'n' then ch = '\n'
			elseif ch == 'r' then ch = '\r'
			elseif ch == 't' then ch = '\t'
			elseif ch == 'b' then ch = '\b'
			elseif ch == 'f' then ch = '\f'
			elseif self:eat_newline (ch) then
				-- Line continuation: contributes nothing to the value
				ch = ""
			elseif strchr (oct_alphabet, ch) then
				-- One to three octal digits
				local buf = ch
				for _ = 1, 2 do
					ch = self:getc ()
					if not ch then return nil
					elseif not strchr (oct_alphabet, ch) then
						self:ungetc ()
						break
					end
					buf = buf .. ch
				end
				-- Codes above \377 don't fit a byte and would make
				-- string.char raise; the high-order overflow is ignored
				ch = string.char (tonumber (buf, 8) % 256)
			end
		elseif self:eat_newline (ch) then
			ch = '\n'
		elseif ch == '(' then
			level = level + 1
		elseif ch == ')' then
			level = level - 1
			if level == 0 then break end
		end
		parts[#parts + 1] = ch
	end
	return self:token ('string', table.concat (parts), "string literal")
end

-- Lex a hexadecimal string; the opening '<' has already been consumed.
-- Pairs of hex digits become bytes, whitespace between digits is skipped,
-- and an odd trailing digit is padded with a zero.
-- Returns a 'string' token, or nil on premature end of input or when
-- a character that is neither a hex digit nor whitespace shows up.
function Lexer:string_hex ()
	local value, buf, ch = ""
	while true do
		ch = self:getc ()
		if not ch then return nil
		elseif ch == '>' then
			break
		elseif strchr (whitespace, ch) then
			-- Whitespace carries no meaning within hex strings,
			-- which are commonly wrapped across several lines
		elseif not strchr (hex_alphabet, ch) then
			return nil
		elseif buf then
			value = value .. string.char (tonumber (buf .. ch, 16))
			buf = nil
		else
			buf = ch
		end
	end
	if buf then value = value .. string.char (tonumber (buf .. '0', 16)) end
	return self:token ('string', value, "string hex")
end

-- Lex a name; the leading '/' has already been consumed.
-- Reads up to the next whitespace or delimiter, decoding #xx hex escapes.
-- Returns a 'name' token, or nil for an empty name or a broken escape.
function Lexer:name ()
	local terminators = whitespace .. delimiters
	local value = ""
	while true do
		local ch = self:getc ()
		if not ch then break end

		if ch == '#' then
			-- Both digits are read before either of them is validated
			local hi = self:getc ()
			local lo = self:getc ()
			if not (hi and lo
			and strchr (hex_alphabet, hi)
			and strchr (hex_alphabet, lo)) then
				return nil
			end
			ch = string.char (tonumber (hi .. lo, 16))
		elseif strchr (terminators, ch) then
			self:ungetc ()
			break
		end
		value = value .. ch
	end

	if value == "" then return nil end
	return self:token ('name', value, "name")
end

-- Lex a comment; the leading '%' has already been consumed.
-- Collects everything up to, but not including, the end of the line
-- and always returns a 'comment' token.
function Lexer:comment ()
	local text = ""
	local ch = self:getc ()
	while ch do
		if ch == '\n' or ch == '\r' then
			-- Leave the newline for the next token
			self:ungetc ()
			break
		end
		text = text .. ch
		ch = self:getc ()
	end
	return self:token ('comment', text, "comment")
end

-- Lex a number starting with the already consumed character "ch":
-- an optional '-', then digits with at most one decimal point.
-- Returns a 'number' token holding the parsed value, or nil when
-- no digit has been found at all.
function Lexer:number (ch)
	local value, real, digits = "", false, false
	if ch == '-' then
		value = ch
		ch = self:getc ()
	end
	while ch do
		if strchr (dec_alphabet, ch) then
			digits = true
		elseif ch == '.' and not real then
			real = true
		else
			self:ungetc ()
			break
		end
		value = value .. ch
		ch = self:getc ()
	end
	-- XXX: perhaps we should instead let it be interpreted as a keyword
	if not digits then return nil end
	-- XXX: maybe we should differentiate between integers and real values
	-- No explicit base here: tonumber with a base only accepts integer
	-- numerals and would turn every real number into nil
	return self:token ('number', tonumber (value), "number")
end

-- Read the next token, skipping whitespace other than newlines.
-- Records the token's starting position in self.start and dispatches on
-- the first character.  Returns nil at the end of input or when the token
-- cannot be lexed.
function Lexer:get_token ()
	while true do
		self.start = self.c.position
		local ch = self:getc ()

		if not ch then
			return nil
		elseif ch == '(' then
			return self:string ()
		elseif ch == '[' then
			return self:token ('begin_array')
		elseif ch == ']' then
			return self:token ('end_array')
		elseif ch == '<' then
			-- It seems they ran out of paired characters, yet {} is unused
			local following = self:getc ()
			if not following then return nil end
			if following == '<' then
				return self:token ('begin_dictionary')
			end
			self:ungetc ()
			return self:string_hex ()
		elseif ch == '>' then
			-- A lone '>' can only legally terminate a hex string
			if self:getc () == '>' then
				return self:token ('end_dictionary')
			end
			return nil
		elseif ch == '/' then
			return self:name ()
		elseif ch == '%' then
			return self:comment ()
		elseif strchr ("-0123456789.", ch) then
			return self:number (ch)
		elseif self:eat_newline (ch) then
			return self:token ('newline')
		elseif not strchr (whitespace, ch) then
			-- {} end up being keywords but we should probably error out
			local word = ch
			while true do
				local more = self:getc ()
				if not more then break end
				if strchr (whitespace .. delimiters, more) then
					self:ungetc ()
					break
				end
				word = word .. more
			end

			if word == "true" or word == "false" then
				return self:token ('boolean', word == "true", "boolean")
			elseif word == "null" then
				return self:token ('null', nil, "null")
			end
			return self:token ('keyword', word, "keyword")
		end
		-- Plain whitespace: go around and try again from the next character
	end
end

-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

-- Tell whether the token type "t" denotes a simple, self-contained value
local is_value = function (t)
	if t == 'null' or t == 'boolean' or t == 'name' then return true end
	return t == 'number' or t == 'string'
end

-- Retrieve the next thing in the stream, possibly popping values from the stack
--
--   lex:   the lexer to pull tokens from
--   stack: list of objects the caller has read so far; the 'stream', 'obj'
--          and 'R' keywords pop their operands from its end
--   deref: function resolving a possibly indirect object, or returning nil;
--          only used for the stream /Length
--
-- Returns nil once the lexer stops producing tokens, a composite table for
-- arrays, dictionaries, streams, indirect objects and references, and the
-- lexer token unchanged for anything else.  Malformed input raises an error.
local function get_object (lex, stack, deref)
::restart::
	local token = lex:get_token ()
	if token == nil then return nil
	elseif token.type == 'begin_array' then
		-- The array under construction doubles as the stack for its items
		local array = {}
		repeat
			local object = get_object (lex, array, deref)
			if not object then error ("array doesn't end") end
			table.insert (array, object)
		until object.type == 'end_array'
		local stop = table.remove (array)
		return { type='array', value=array, start=token.start, stop=stop.stop }
	elseif token.type == 'begin_dictionary' then
		-- Collect a flat list of alternating keys and values first
		local dict = {}
		repeat
			local object = get_object (lex, dict, deref)
			if not object then error ("dictionary doesn't end") end
			table.insert (dict, object)
		until object.type == 'end_dictionary'
		local stop, kv = table.remove (dict), {}
		if #dict % 2 == 1 then error ("unbalanced dictionary") end
		for i = 1, #dict, 2 do
			local k, v = dict[i], dict[i + 1]
			if k.type ~= 'name' then error ("invalid dictionary key type") end
			kv[k.value] = v
		end
		return { type='dict', value=kv, start=token.start, stop=stop.stop }
	elseif token.type == 'keyword' and token.value == 'stream' then
		if #stack < 1 then error ("no dictionary for stream") end
		local d = table.remove (stack)
		if d.type ~= 'dict' then error ("stream not preceded by dictionary") end

		if not lex:eat_newline (lex:getc ()) then
			error ("'stream' not followed by newline")
		end

		local len = deref (d.value['Length'])
		if not len or len.type ~= 'number' then
			error ("missing stream length")
		end

		-- The data must be consumed before looking for 'endstream',
		-- so these are separate statements with a well-defined order
		local data = lex.c:read (len.value)
		local stop = get_object (lex, {}, deref)
		if not stop or stop.type ~= 'keyword' or stop.value ~= 'endstream' then
			error ("missing 'endstream'")
		end

		-- "dict" is the popped dictionary object, "data" the raw bytes
		return { type='stream', value={ dict=d, data=data },
			start=token.start, stop=stop.stop }
	elseif token.type == 'keyword' and token.value == 'obj' then
		if #stack < 2 then error ("missing object ID pair") end
		-- The generation sits on top of the object number
		local gen = table.remove (stack)
		local n = table.remove (stack)
		if n.type ~= 'number' or gen.type ~= 'number' then
			error ("object ID pair must be two integers")
		end

		local tmp = {}
		repeat
			local object = get_object (lex, tmp, deref)
			if not object then error ("object doesn't end") end
			table.insert (tmp, object)
		until object.type == 'keyword' and object.value == 'endobj'
		local stop = table.remove (tmp)

		if #tmp ~= 1 then error ("objects must contain exactly one value") end
		local value = table.remove (tmp)
		return { type='object', n=n.value, gen=gen.value, value=value,
			start=n.start, stop=stop.stop }
	elseif token.type == 'keyword' and token.value == 'R' then
		if #stack < 2 then error ("missing reference ID pair") end
		-- The generation sits on top of the object number
		local gen = table.remove (stack)
		local n = table.remove (stack)
		if n.type ~= 'number' or gen.type ~= 'number' then
			error ("reference ID pair must be two integers")
		end
		-- Like every other object, a reference carries its extent
		return { type='reference', value={ n.value, gen.value },
			start=n.start, stop=token.stop }
	elseif token.type == 'newline' or token.type == 'comment' then
		-- These are not objects and our callers aren't interested
		goto restart
	else
		return token
	end
end

-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

-- Recognize a PDF by its "%PDF-" header; anything shorter than that
-- is rejected without attempting to read from it
local detect = function (c)
	if #c < 5 then return false end
	return c:read (5) == "%PDF-"
end

-- Decode "count" fixed-width (20 byte) cross-reference entries following
-- a subsection header, storing them in "result" under consecutive object
-- numbers beginning with "start".  Raises an error when the header isn't
-- followed by a newline or when an entry doesn't have the expected form.
local decode_xref_subsection = function (lex, start, count, result)
	if not lex:eat_newline (lex:getc ()) then
		error ("xref subsection must start on a new line")
	end
	for i = 0, count - 1 do
		local entry = lex.c:read (20)
		local off, gen, typ = entry:match
			("^(%d%d%d%d%d%d%d%d%d%d) (%d%d%d%d%d) ([fn])[\r ][\r\n]$")
		if not off then error ("invalid xref entry") end

		-- Translated to the extended XRefStm format;
		-- the captures are all-digit strings, which get parsed explicitly
		-- rather than through implicit string-to-number coercion
		result[start + i] = {
			t = typ == 'n' and 1 or 0,
			o = tonumber (off, 10),
			g = tonumber (gen, 10),
		}
	end
end

-- A deref that can't actually resolve anything, for early stages of processing:
-- direct objects pass through, references and missing values become nil
local deref_nil = function (x)
	if x and x.type ~= 'reference' then return x end
	return nil
end

-- Decode a classic cross-reference section, the 'xref' keyword having been
-- consumed already.  The result holds the trailer dictionary's entries under
-- their names, and { XRefStm fields... } items under object numbers.
-- Objects are consumed in pairs: either a subsection header made of two
-- numbers, or the 'trailer' keyword with its dictionary, which ends the section.
local decode_xref_normal = function (lex)
	local result = {}
	while true do
		local first = get_object (lex, {}, deref_nil)
		local second = get_object (lex, {}, deref_nil)
		if not (first and second) then
			error ("xref section ends too soon")
		end

		if first.type == 'number' and second.type == 'number' then
			decode_xref_subsection (lex, first.value, second.value, result)
		elseif first.type == 'keyword' and first.value == 'trailer'
		and second.type == 'dict' then
			for name, object in pairs (second.value) do
				result[name] = object
			end
			return result
		else
			error ("invalid xref contents")
		end
	end
end

-- Decode a cross-reference stream; not implemented, so this always raises
-- an error: either because "stream" isn't an XRef stream, or because
-- support for them is missing.
local decode_xref_stream = function (lex, stream)
	-- The stream's payload is nested under "value", and dictionary entries
	-- are objects rather than bare strings.  A missing or malformed
	-- dictionary is reported the same way as a wrong /Type.
	local dict = stream.value and stream.value.dict
	local kind = dict and dict.value and dict.value['Type']
	if not kind or kind.type ~= 'name' or kind.value ~= 'XRef' then
		error ("expected an XRef stream")
	end

	-- TODO: decode a cross-reference stream from stream.value.{dict,data};
	--   the compression filter, if present, is always going to be FlateDecode,
	--   which we'll have to import or implement
	-- TODO: take care to also cache cross-reference streams by offset when
	--   they're actually implemented
	error ("cross-reference streams not implemented")
end

-- Decode the cross-reference information that the chunk "c" starts with:
-- either a classic section introduced by the 'xref' keyword, or
-- a cross-reference stream.  Returns nil when the chunk runs out
-- before either of them is found.
local decode_xref = function (c)
	local lex, stack = Lexer:new (c), {}
	while true do
		local object = get_object (lex, stack, deref_nil)
		if object == nil then
			return nil
		elseif object.type == 'keyword' and object.value == 'xref' then
			return decode_xref_normal (lex)
		elseif object.type == 'stream' then
			return decode_xref_stream (lex, object)
		elseif object.type == 'object' and object.value.type == 'stream' then
			-- get_object consumes "n gen obj ... endobj" as a whole,
			-- so a stream normally arrives wrapped in its indirect object
			return decode_xref_stream (lex, object.value)
		end
		table.insert (stack, object)
	end
end

-- Follow the chain of cross-reference sections from "start_offset" through
-- the 'Prev' entries of their trailers.  Returns a table indexed by object
-- number, each item listing that object's entries from newest to oldest.
-- Raises an error when the chain loops back onto itself.
local read_all_xrefs = function (c, start_offset)
	local visited, bins = {}, {}
	local offset = start_offset
	while true do
		-- Prevent an infinite loop with malicious files
		if visited[offset] then error ("cyclic cross-reference sections") end

		local section = decode_xref (c (1 + offset, #c))
		if not section then break end

		-- Numeric keys are object entries, the rest comes from the trailer
		for key, entry in pairs (section) do
			if type (key) == 'number' then
				local bin = bins[key]
				if not bin then
					bin = {}
					bins[key] = bin
				end
				bin[#bin + 1] = entry
			end
		end
		visited[offset] = true

		-- TODO: when 'XRefStm' is found, it has precedence over this 'Prev',
		--   and also has its own 'Prev' chain
		local prev = section['Prev']
		if not (prev and prev.type == 'number') then break end
		offset = prev.value
	end
	return bins
end

-- Decode a whole PDF document in the chunk "c", which must be positioned
-- at its beginning: locate the cross-reference sections through the
-- 'startxref' pointer near the end of the file, then read and mark every
-- object in use according to the newest cross-reference entries.
-- Raises an error for anything it cannot make sense of.
local decode = function (c)
	assert (c.position == 1)
	if not detect (c ()) then error ("not a PDF file") end

	-- Look for a pointer to the xref section within the last kibibyte
	-- NOTE: we could probably look backwards for the "trailer" line from here
	--   but we don't know how long the trailer is and we don't want to regex
	--   scan the whole file (ignoring that dictionary contents might, possibly
	--   legally, include the word "trailer" at the beginning of a new line)
	local tail_len = math.min (1024, #c)
	-- Ranges are used as 1-based and inclusive throughout this file, so the
	-- last tail_len bytes begin at #c - tail_len + 1; starting one byte
	-- sooner would cut the final byte off and ask for position 0 in files
	-- that fit in the window as a whole
	local tail = c (#c - tail_len + 1, #c):read (tail_len)
	local xref_loc = tail:match (".*%sstartxref%s+(%d+)%s+%%%%EOF")
	if not xref_loc then error ("cannot find trailer") end

	-- We need to decode xref sections in order to be able to resolve indirect
	-- references to stream lengths
	-- The capture is an all-digit string, which gets parsed explicitly
	local xref = read_all_xrefs (c, tonumber (xref_loc, 10))
	local deref

	-- We have to make sure that we don't decode objects twice as that would
	-- duplicate all marks, so we simply cache all objects by offset.
	-- This may be quite the memory load but it seems to be the best thing.
	local cache = {}
	local read_object = function (offset)
		if cache[offset] then return cache[offset] end

		-- Keep reading until the "n gen obj ... endobj" construct completes
		local lex, stack = Lexer:new (c (1 + offset, #c)), {}
		repeat
			local object = get_object (lex, stack, deref)
			if not object then error ("object doesn't end") end
			table.insert (stack, object)
		until object.type == 'object'

		local object = table.remove (stack)
		cache[offset] = object
		c (offset + object.start, offset + object.stop)
			:mark ("object " .. object.n .. " " .. object.gen)
		return object
	end

	-- Resolve an object -- if it's a reference, look it up in "xref",
	-- otherwise just return the object as it was passed
	deref = function (x)
		if not x or x.type ~= 'reference' then return x end
		local n, gen = x.value[1], x.value[2]

		-- TODO: we should also ignore object numbers >= trailer /Size
		local bin = xref[n]
		if not bin then return nil end
		-- Only the newest entry counts, and it must be in use
		-- with a matching generation
		local entry = bin[1]
		if not entry or entry.t ~= 1 or entry.g ~= gen then return nil end

		local object = read_object (entry.o)
		if not object or object.n ~= n or object.gen ~= gen then return nil end
		return object.value
	end

	-- Read all objects accessible from the current version of the document
	for n, bin in pairs (xref) do
		local entry = bin[1]
		if entry and entry.t == 1 then
			read_object (entry.o)
		end
	end

	-- TODO: we should actually try to decode even unreferenced objects.
	--   The problem with decoding content from previous versions of the
	--   document is that we must ignore xref updates from newer versions.
	--   The version information needs to be propagated everywhere.
end

-- Register the decoder with the host under the "pdf" type, handing over
-- the detection and decoding functions defined above
hex.register { type="pdf", detect=detect, decode=decode }
Add a partial decoder for PDF 2017-02-12 20:22:07 +01:00			`--`
			`-- pdf.lua: Portable Document Format`
			`--`
			`-- Based on PDF Reference, version 1.7`
			`-- In practice almost useless, I just wanted to learn about the file format.`
			`-- FIXME: it's also not very robust and doesn't support all documents.`
			`--`
Name change 2020-09-28 05:10:27 +02:00			`-- Copyright (c) 2017, Přemysl Eric Janouch <p@janouch.name>`
Add a partial decoder for PDF 2017-02-12 20:22:07 +01:00			`--`
			`-- Permission to use, copy, modify, and/or distribute this software for any`
Relicense to 0BSD, update mail address I've come to the conclusion that copyright mostly just stands in the way of software development. In my jurisdiction I cannot give up my own copyright and 0BSD seems to be the closest thing to public domain. The updated mail address, also used in my author/committer lines, is shorter and looks nicer. People rarely interact anyway. 2018-06-24 04:11:24 +02:00			`-- purpose with or without fee is hereby granted.`
Add a partial decoder for PDF 2017-02-12 20:22:07 +01:00			`--`
			`-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES`
			`-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF`
			`-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY`
			`-- SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES`
			`-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION`
			`-- OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN`
			`-- CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.`
			`--`

			`local oct_alphabet = "01234567"`
			`local dec_alphabet = "0123456789"`
			`local hex_alphabet = "0123456789abcdefABCDEF"`
			`local whitespace = "\x00\t\n\f\r "`
			`local delimiters = "()<>[]{}/%"`

			`local strchr = function (s, ch) return s:find (ch, 1, true) end`

			`-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -`

			`local Lexer = {}`
			`Lexer.__index = Lexer`

			`function Lexer:new (c)`
			`return setmetatable ({ c = c }, self)`
			`end`

			`-- TODO: make it possible to follow a string, we should probably be able to`
			`-- supply callbacks to the constructor, or a wrapper object;`
			`-- this will be used for object streams`
			`function Lexer:getc ()`
			`if self.c.eof then return nil end`
			`return self.c:read (1)`
			`end`

			`function Lexer:ungetc ()`
			`self.c.position = self.c.position - 1`
			`end`

			`function Lexer:token (type, value, description)`
			`if description then`
			`self.c (self.start, self.c.position - 1):mark (description)`
			`end`
			`return { type=type, value=value,`
			`start=self.start, stop=self.c.position - 1 }`
			`end`

			`-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -`

			`function Lexer:eat_newline (ch)`
			`if ch == '\r' then`
			`ch = self:getc ()`
			`if ch and ch ~= '\n' then self:ungetc () end`
			`return true`
			`elseif ch == '\n' then`
			`return true`
			`end`
			`end`

			`function Lexer:string ()`
			`local value, level, ch = "", 1`
			`::continue::`
			`while true do`
			`ch = self:getc ()`
			`if not ch then return nil`
			`elseif ch == '\\' then`
			`ch = self:getc ()`
			`if not ch then return nil`
			`elseif ch == 'n' then ch = '\n'`
			`elseif ch == 'r' then ch = '\r'`
			`elseif ch == 't' then ch = '\t'`
			`elseif ch == 'b' then ch = '\b'`
			`elseif ch == 'f' then ch = '\f'`
			`elseif self:eat_newline (ch) then goto continue`
			`elseif strchr (oct_alphabet, ch) then`
			`local buf, i = ch`
			`for i = 1, 2 do`
			`ch = self:getc ()`
			`if not ch then return nil`
			`elseif not strchr (oct_alphabet, ch) then`
			`self:ungetc ()`
			`break`
			`end`
			`buf = buf .. ch`
			`end`
			`ch = string.char (tonumber (buf, 8))`
			`end`
			`elseif self:eat_newline (ch) then`
			`ch = '\n'`
			`elseif ch == '(' then`
			`level = level + 1`
			`elseif ch == ')' then`
			`level = level - 1`
			`if level == 0 then break end`
			`end`
			`value = value .. ch`
			`end`
			`return self:token ('string', value, "string literal")`
			`end`

			`function Lexer:string_hex ()`
			`local value, buf, ch = ""`
			`while true do`
			`ch = self:getc ()`
			`if not ch then return nil`
			`elseif ch == '>' then`
			`break`
			`elseif not strchr (hex_alphabet, ch) then`
			`return nil`
			`elseif buf then`
			`value = value .. string.char (tonumber (buf .. ch, 16))`
			`buf = nil`
			`else`
			`buf = ch`
			`end`
			`end`
			`if buf then value = value .. string.char (tonumber (buf .. '0', 16)) end`
			`return self:token ('string', value, "string hex")`
			`end`

			`function Lexer:name ()`
			`local value, ch = ""`
			`while true do`
			`ch = self:getc ()`
			`if not ch then break`
			`elseif ch == '#' then`
			`local ch1, ch2 = self:getc (), self:getc ()`
			`if not ch1 or not ch2`
			`or not strchr (hex_alphabet, ch1)`
			`or not strchr (hex_alphabet, ch2) then`
			`return nil`
			`end`
			`ch = string.char (tonumber (ch1 .. ch2, 16))`
			`elseif strchr (whitespace .. delimiters, ch) then`
			`self:ungetc ()`
			`break`
			`end`
			`value = value .. ch`
			`end`
			`if value == "" then return nil end`
			`return self:token ('name', value, "name")`
			`end`

			`function Lexer:comment ()`
			`local value, ch = ""`
			`while true do`
			`ch = self:getc ()`
			`if not ch then break`
			`elseif ch == '\r' or ch == '\n' then`
			`self:ungetc ()`
			`break`
			`end`
			`value = value .. ch`
			`end`
			`return self:token ('comment', value, "comment")`
			`end`

			`function Lexer:number (ch)`
			`local value, real, digits = "", false, false`
			`if ch == '-' then`
			`value = ch`
			`ch = self:getc ()`
			`end`
			`while ch do`
			`if strchr (dec_alphabet, ch) then`
			`digits = true`
			`elseif ch == '.' and not real then`
			`real = true`
			`else`
			`self:ungetc ()`
			`break`
			`end`
			`value = value .. ch`
			`ch = self:getc ()`
			`end`
			`-- XXX: perhaps we should instead let it be interpreted as a keyword`
			`if not digits then return nil end`
			`-- XXX: maybe we should differentiate between integers and real values`
			`return self:token ('number', tonumber (value, 10), "number")`
			`end`

			`function Lexer:get_token ()`
			`::restart::`
			`self.start = self.c.position`
			`local ch = self:getc ()`

			`if not ch then return nil`
			`elseif ch == '(' then return self:string ()`
			`elseif ch == '[' then return self:token ('begin_array')`
			`elseif ch == ']' then return self:token ('end_array')`
			`elseif ch == '<' then`
			`-- It seems they ran out of paired characters, yet {} is unused`
			`ch = self:getc ()`
			`if not ch then return nil`
			`elseif ch == '<' then return self:token ('begin_dictionary')`
			`else`
			`self:ungetc ()`
			`return self:string_hex ()`
			`end`
			`elseif ch == '>' then`
			`ch = self:getc ()`
			`if not ch then return nil`
			`elseif ch == '>' then return self:token ('end_dictionary')`
			`else return nil end`
			`elseif ch == '/' then return self:name ()`
			`elseif ch == '%' then return self:comment ()`
			`elseif strchr ("-0123456789.", ch) then return self:number (ch)`
			`elseif self:eat_newline (ch) then return self:token ('newline')`
			`elseif strchr (whitespace, ch) then goto restart`
			`else`
			`-- {} end up being keywords but we should probably error out`
			`local value = ch`
			`while true do`
			`ch = self:getc ()`
			`if not ch then break`
			`elseif strchr (whitespace .. delimiters, ch) then`
			`self:ungetc ()`
			`break`
			`end`
			`value = value .. ch`
			`end`
			`if value == "null" then`
			`return self:token ('null', nil, "null")`
			`elseif value == "true" then`
			`return self:token ('boolean', true, "boolean")`
			`elseif value == "false" then`
			`return self:token ('boolean', false, "boolean")`
			`end`
			`return self:token ('keyword', value, "keyword")`
			`end`
			`end`

			`-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -`

			`local is_value = function (t)`
			`return t == 'null' or t == 'boolean' or t == 'name'`
			`or t == 'number' or t == 'string'`
			`end`

			`-- Retrieve the next thing in the stream, possibly popping values from the stack`
			`local function get_object (lex, stack, deref)`
			`::restart::`
			`local token = lex:get_token ()`
			`if token == nil then return nil`
			`elseif token.type == 'begin_array' then`
			`local array = {}`
			`repeat`
			`local object = get_object (lex, array, deref)`
			`if not object then error ("array doesn't end") end`
			`table.insert (array, object)`
			`until object.type == 'end_array'`
			`local stop = table.remove (array)`
			`return { type='array', value=array, start=token.start, stop=stop.stop }`
			`elseif token.type == 'begin_dictionary' then`
			`local dict = {}`
			`repeat`
			`local object = get_object (lex, dict, deref)`
			`if not object then error ("dictionary doesn't end") end`
			`table.insert (dict, object)`
			`until object.type == 'end_dictionary'`
			`local stop, kv = table.remove (dict), {}`
			`if #dict % 2 == 1 then error ("unbalanced dictionary") end`
			`for i = 1, #dict, 2 do`
			`local k, v = dict[i], dict[i + 1]`
			`if k.type ~= 'name' then error ("invalid dictionary key type") end`
			`kv[k.value] = v`
			`end`
			`return { type='dict', value=kv, start=token.start, stop=stop.stop }`
			`elseif token.type == 'keyword' and token.value == 'stream' then`
			`if #stack < 1 then error ("no dictionary for stream") end`
			`local d = table.remove (stack)`
			`if d.type ~= 'dict' then error ("stream not preceded by dictionary") end`

			`if not lex:eat_newline (lex:getc ()) then`
			`error ("'stream' not followed by newline")`
			`end`

			`local len = deref (d.value['Length'])`
			`if not len or len.type ~= 'number' then`
			`error ("missing stream length")`
			`end`

			`local data, stop = lex.c:read (len.value), get_object (lex, {}, deref)`
			`if not stop or stop.type ~= 'keyword' or stop.value ~= 'endstream' then`
			`error ("missing 'endstream'")`
			`end`

			`return { type='stream', value={ dict=dict, data=data },`
			`start=token.start, stop=stop.stop }`
			`elseif token.type == 'keyword' and token.value == 'obj' then`
			`if #stack < 2 then error ("missing object ID pair") end`
			`local gen, n = table.remove (stack), table.remove (stack)`
			`if n.type ~= 'number' or gen.type ~= 'number' then`
			`error ("object ID pair must be two integers")`
			`end`

			`local tmp = {}`
			`repeat`
			`local object = get_object (lex, tmp, deref)`
			`if not object then error ("object doesn't end") end`
			`table.insert (tmp, object)`
			`until object.type == 'keyword' and object.value == 'endobj'`
			`local stop = table.remove (tmp)`

			`if #tmp ~= 1 then error ("objects must contain exactly one value") end`
			`local value = table.remove (tmp)`
			`return { type='object', n=n.value, gen=gen.value, value=value,`
			`start=n.start, stop=stop.stop }`
			`elseif token.type == 'keyword' and token.value == 'R' then`
			`if #stack < 2 then error ("missing reference ID pair") end`
			`local gen, n = table.remove (stack), table.remove (stack)`
			`if n.type ~= 'number' or gen.type ~= 'number' then`
			`error ("reference ID pair must be two integers")`
			`end`
			`return { type='reference', value={ n.value, gen.value } }`
			`elseif token.type == 'newline' or token.type == 'comment' then`
			`-- These are not objects and our callers aren't interested`
			`goto restart`
			`else`
			`return token`
			`end`
			`end`

			`-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -`

			`local detect = function (c)`
Handle tiny files gracefully Lua detection functions used to cause fatal errors on failure to read. We could also reconsider treating detection errors as fatal. 2024-12-08 22:36:02 +01:00			`return #c >= 5 and c:read (5) == "%PDF-"`
Add a partial decoder for PDF 2017-02-12 20:22:07 +01:00			`end`

			`local decode_xref_subsection = function (lex, start, count, result)`
			`if not lex:eat_newline (lex:getc ()) then`
			`error ("xref subsection must start on a new line")`
			`end`
			`for i = 0, count - 1 do`
			`local entry = lex.c:read (20)`
			`local off, gen, typ = entry:match`
			`("^(%d%d%d%d%d%d%d%d%d%d) (%d%d%d%d%d) ([fn])[\r ][\r\n]$")`
			`if not off then error ("invalid xref entry") end`

			`-- Translated to the extended XRefStm format`
			`result[start + i] = {`
			`t = typ == 'n' and 1 or 0,`
			`o = math.tointeger (off),`
			`g = math.tointeger (gen),`
			`}`
			`end`
			`end`

			`-- A deref that can't actually resolve anything, for early stages of processing`
			`local deref_nil = function (x)`
			`if not x or x.type == 'reference' then return nil end`
			`return x`
			`end`

			`-- Creates a table with named indexes from the trailer and items indexed by`
			`-- object numbers containing { XRefStm fields... }`
			`local decode_xref_normal = function (lex)`
			`local result = {}`
			`while true do`
			`local a = get_object (lex, {}, deref_nil)`
			`local b = get_object (lex, {}, deref_nil)`
			`if not a or not b then`
			`error ("xref section ends too soon")`
			`elseif a.type == 'number' and b.type == 'number' then`
			`decode_xref_subsection (lex, a.value, b.value, result)`
			`elseif a.type == 'keyword' and a.value == 'trailer'`
			`and b.type == 'dict' then`
			`for k, v in pairs (b.value) do`
			`result[k] = v`
			`end`
			`return result`
			`else`
			`error ("invalid xref contents")`
			`end`
			`end`
			`end`

			`local decode_xref_stream = function (lex, stream)`
			`if stream.dict['Type'] ~= 'XRef' then error ("expected an XRef stream") end`

			`-- TODO: decode a cross-reference stream from stream.{dict,data};`
			`-- the compression filter, if present, is always going to be FlateDecode,`
			`-- which we'll have to import or implement`
			`-- TODO: take care to also cache cross-reference streams by offset when`
			`-- they're actually implemented`
			`error ("cross-reference streams not implemented")`
			`end`

			`local decode_xref = function (c)`
			`local lex, stack = Lexer:new (c), {}`
			`while true do`
			`local object = get_object (lex, stack, deref_nil)`
			`if object == nil then`
			`return nil`
			`elseif object.type == 'keyword' and object.value == 'xref' then`
			`return decode_xref_normal (lex)`
			`elseif object.type == 'stream' then`
			`return decode_xref_stream (lex, object)`
			`end`
			`table.insert (stack, object)`
			`end`
			`end`

			`-- Return all objects found in xref tables as a table indexed by object number,`
			`-- pointing to a list of generations and overwrites, from newest to oldest.`
			`local read_all_xrefs = function (c, start_offset)`
			`local loaded, result, offset = {}, {}, start_offset`
			`while true do`
			`-- Prevent an infinite loop with malicious files`
			`if loaded[offset] then error ("cyclic cross-reference sections") end`

			`local xref = decode_xref (c (1 + offset, #c))`
			`if not xref then break end`
			`for k, v in pairs (xref) do`
			`if type (k) == 'number' then`
			`if not result[k] then result[k] = {} end`
			`table.insert (result[k], v)`
			`end`
			`end`
			`loaded[offset] = true`

			`-- TODO: when 'XRefStm' is found, it has precedence over this 'Prev',`
			`-- and also has its own 'Prev' chain`
			`local prev = xref['Prev']`
			`if not prev or prev.type ~= 'number' then break end`
			`offset = prev.value`
			`end`
			`return result`
			`end`

			`local decode = function (c)`
			`assert (c.position == 1)`
			`if not detect (c ()) then error ("not a PDF file") end`

			`-- Look for a pointer to the xref section within the last kibibyte`
			`-- NOTE: we could probably look backwards for the "trailer" line from here`
			`-- but we don't know how long the trailer is and we don't want to regex`
			`-- scan the whole file (ignoring that dictionary contents might, possibly`
			`-- legally, include the word "trailer" at the beginning of a new line)`
			`local tail_len = math.min (1024, #c)`
			`local tail = c (#c - tail_len, #c):read (tail_len)`
			`local xref_loc = tail:match (".*%sstartxref%s+(%d+)%s+%%%%EOF")`
			`if not xref_loc then error ("cannot find trailer") end`

			`-- We need to decode xref sections in order to be able to resolve indirect`
			`-- references to stream lengths`
			`local xref = read_all_xrefs (c, math.tointeger (xref_loc))`
			`local deref`

			`-- We have to make sure that we don't decode objects twice as that would`
			`-- duplicate all marks, so we simply cache all objects by offset.`
			`-- This may be quite the memory load but it seems to be the best thing.`
			`local cache = {}`
			`local read_object = function (offset)`
			`if cache[offset] then return cache[offset] end`

			`local lex, stack = Lexer:new (c (1 + offset, #c)), {}`
			`repeat`
			`local object = get_object (lex, stack, deref)`
			`if not object then error ("object doesn't end") end`
			`table.insert (stack, object)`
			`until object.type == 'object'`

			`local object = table.remove (stack)`
			`cache[offset] = object`
			`c (offset + object.start, offset + object.stop)`
			`:mark ("object " .. object.n .. " " .. object.gen)`
			`return object`
			`end`

			`-- Resolve an object -- if it's a reference, look it up in "xref",`
			`-- otherwise just return the object as it was passed`
			`deref = function (x)`
			`if not x or x.type ~= 'reference' then return x end`
			`local n, gen = x.value[1], x.value[2]`

			`-- TODO: we should also ignore object numbers >= trailer /Size`
			`local bin = xref[n]`
			`if not bin then return nil end`
			`local entry = bin[1]`
			`if not entry or entry.t ~= 1 or entry.g ~= gen then return nil end`

			`local object = read_object (entry.o)`
			`if not object or object.n ~= n or object.gen ~= gen then return nil end`
			`return object.value`
			`end`

			`-- Read all objects accessible from the current version of the document`
			`for n, bin in pairs (xref) do`
			`local entry = bin[1]`
			`if entry and entry.t == 1 then`
			`read_object (entry.o)`
			`end`
			`end`

			`-- TODO: we should actually try to decode even unreferenced objects.`
			`-- The problem with decoding content from previous versions of the`
			`-- document is that we must ignore xref updates from newer versions.`
			`-- The version information needs to be propagated everywhere.`
			`end`

			`hex.register { type="pdf", detect=detect, decode=decode }`