From 15f62b7054c12bff2899ac6bd73147b6bfc818fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C5=99emysl=20Janouch?= <p.janouch@gmail.com>
Date: Sun, 14 Jul 2013 20:40:58 +0200
Subject: [PATCH] Add a class to handle dictzip files

Provides pseudo-random access to dictionary files compressed using dictzip.

It doesn't implement a cache, it just loads missing chunks until it has the
whole file.  I'm not sure if discarding not recently used chunks is really
a useful feature.  If there _was_ a way to get notified when system memory
is low, I think the best way to handle that event would be to simply release
it all.

All in all, this is pretty useless.  But it was interesting to write.

This has yet to be integrated into the application proper.
---
 CMakeLists.txt             |  14 +-
 src/dictzip-input-stream.c | 628 +++++++++++++++++++++++++++++++++++++
 src/dictzip-input-stream.h |  77 +++++
 src/stardict.c             |  56 +---
 src/utils.c                |  63 ++++
 src/utils.h                |  44 +++
 6 files changed, 823 insertions(+), 59 deletions(-)
 create mode 100644 src/dictzip-input-stream.c
 create mode 100644 src/dictzip-input-stream.h
 create mode 100644 src/utils.c
 create mode 100644 src/utils.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a7d682..e2c1373 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,10 +22,12 @@ include (CheckFunctionExists)
 CHECK_FUNCTION_EXISTS ("wcwidth" HAVE_WCWIDTH)
 
 # Dependecies
+find_package (ZLIB REQUIRED)
+
 find_package (PkgConfig REQUIRED)
 pkg_check_modules (dependencies REQUIRED ncursesw glib-2.0 gio-2.0 pango)
 
-include_directories (${dependencies_INCLUDE_DIRS})
+include_directories (${ZLIB_INCLUDE_DIRS} ${dependencies_INCLUDE_DIRS})
 
 # Localization
 find_package (Gettext REQUIRED)
@@ -61,16 +63,20 @@ add_custom_target (docs ALL DEPENDS ${project_MAN_PAGES_OUTPUT})
 
 # Project source files
 set (project_common_sources
+	src/dictzip-input-stream.c
 	src/generator.c
-	src/stardict.c)
+	src/stardict.c
+	src/utils.c)
 set (project_common_headers
 	${CMAKE_CURRENT_BINARY_DIR}/config.h
+	src/dictzip-input-stream.h
 	src/stardict.h
 	src/stardict-private.h
-	src/generator.h)
+	src/generator.h
+	src/utils.h)
 
 # Project libraries
-set (project_common_libraries ${dependencies_LIBRARIES})
+set (project_common_libraries ${ZLIB_LIBRARIES} ${dependencies_LIBRARIES})
 
 # Create a common project library so that source files are only compiled once
 if (${CMAKE_VERSION} VERSION_GREATER "2.8.7")
diff --git a/src/dictzip-input-stream.c b/src/dictzip-input-stream.c
new file mode 100644
index 0000000..e3c0d7c
--- /dev/null
+++ b/src/dictzip-input-stream.c
@@ -0,0 +1,628 @@
+/*
+ * dictzip-input-stream.c: dictzip GIO stream reader
+ *
+ * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include <zlib.h>
+
+#include "utils.h"
+#include "dictzip-input-stream.h"
+
+
+// --- Errors ------------------------------------------------------------------
+/** Return the error domain that the DICTZIP_ERROR macro expands to. */
+GQuark
+dictzip_error_quark (void)
+{
+	return g_quark_from_static_string ("dictzip-error-quark");
+}
+
+// --- dictzip utilities -------------------------------------------------------
+/* Free the buffers owned by @gzh; pointers are reset so repeating is safe. */
+static void
+free_gzip_header (gz_header *gzh)
+{
+	g_free (gzh->comment);  gzh->comment = NULL;
+	g_free (gzh->extra);    gzh->extra   = NULL;
+	g_free (gzh->name);     gzh->name    = NULL;
+}
+
+/* Parse the gzip member header at the start of @is into @gzh and store the
+ * offset of the first compressed block in @first_block_offset.  Done by hand
+ * as zlib cannot do it for us.  On failure @gzh is emptied and @error set. */
+static gboolean
+read_gzip_header (GInputStream *is, gz_header *gzh,
+	goffset *first_block_offset, GError **error)
+{
+	assert (is != NULL);
+	assert (gzh != NULL);
+
+	GDataInputStream *dis = g_data_input_stream_new (is);
+	g_data_input_stream_set_byte_order (dis,
+		G_DATA_STREAM_BYTE_ORDER_LITTLE_ENDIAN);
+	g_filter_input_stream_set_close_base_stream
+		(G_FILTER_INPUT_STREAM (dis), FALSE);
+
+	GError *err = NULL;
+	memset (gzh, 0, sizeof *gzh);
+
+	// File header identification
+	if (g_data_input_stream_read_byte (dis, NULL, &err) != 31
+	 || g_data_input_stream_read_byte (dis, NULL, &err) != 139)
+	{
+		if (err)
+			g_propagate_error (error, err);
+		else
+			g_set_error (error, DICTZIP_ERROR, DICTZIP_ERROR_INVALID_HEADER,
+				"wrong header magic");
+		goto error_own;
+	}
+
+	// Compression method, only "deflate" is supported here
+	if (g_data_input_stream_read_byte (dis, NULL, &err) != Z_DEFLATED)
+	{
+		if (err)
+			g_propagate_error (error, err);
+		else
+			g_set_error (error, DICTZIP_ERROR, DICTZIP_ERROR_INVALID_HEADER,
+				"unsupported compression method");
+		goto error_own;
+	}
+
+	guint flags = g_data_input_stream_read_byte (dis, NULL, &err);
+	if (err) goto error;
+
+	gzh->text = ((flags & 1) != 0);
+	gzh->hcrc = ((flags & 2) != 0);
+
+	gzh->time = g_data_input_stream_read_uint32 (dis, NULL, &err);
+	if (err) goto error;
+	gzh->xflags = g_data_input_stream_read_byte (dis, NULL, &err);
+	if (err) goto error;
+	gzh->os = g_data_input_stream_read_byte (dis, NULL, &err);
+	if (err) goto error;
+
+	if (flags & 4)
+	{
+		gzh->extra_len = g_data_input_stream_read_uint16 (dis, NULL, &err);
+		if (err) goto error;
+		gzh->extra_max = gzh->extra_len;
+
+		gzh->extra = g_malloc (gzh->extra_len);
+		gssize read = g_input_stream_read (G_INPUT_STREAM (dis),
+			gzh->extra, gzh->extra_len, NULL, &err);
+		if (err) goto error;
+		if (read != gzh->extra_len) goto error_eof;
+	}
+
+	// stream_read_string() may return NULL without setting an error, e.g. when
+	// the stream ends prematurely, and that must never reach strlen().
+	if (flags & 8)
+	{
+		gzh->name = (Bytef *) stream_read_string (dis, &err);
+		if (err) goto error;
+		if (!gzh->name) goto error_eof;
+		gzh->name_max = strlen ((char *) gzh->name) + 1;
+	}
+
+	if (flags & 16)
+	{
+		gzh->comment = (Bytef *) stream_read_string (dis, &err);
+		if (err) goto error;
+		if (!gzh->comment) goto error_eof;
+		gzh->comm_max = strlen ((char *) gzh->comment) + 1;
+	}
+
+	goffset header_size_sans_crc = g_seekable_tell (G_SEEKABLE (dis));
+	if (!gzh->hcrc)
+		*first_block_offset = header_size_sans_crc;
+	else
+	{
+		*first_block_offset = header_size_sans_crc + 2;
+		uLong header_crc = g_data_input_stream_read_uint16 (dis, NULL, &err);
+		if (err) goto error;
+		g_seekable_seek (G_SEEKABLE (is), 0, G_SEEK_SET, NULL, &err);
+		if (err) goto error;
+
+		gpointer buf = g_malloc0 (header_size_sans_crc);
+		g_input_stream_read (is, buf, header_size_sans_crc, NULL, &err);
+		uLong crc = crc32 (0, NULL, 0);
+		if (!err)
+			crc = crc32 (crc, buf, header_size_sans_crc);
+		g_free (buf);  // Has to happen on the failure path as well.
+		if (err) goto error;
+
+		if (header_crc != (guint16) crc)
+		{
+			g_set_error (error, DICTZIP_ERROR, DICTZIP_ERROR_INVALID_HEADER,
+				"header checksum mismatch");
+			goto error_own;
+		}
+	}
+
+	gzh->done = 1;
+	g_object_unref (dis);
+	return TRUE;
+
+error_eof:
+	g_set_error (error, DICTZIP_ERROR, DICTZIP_ERROR_INVALID_HEADER,
+		"unexpected end of file");
+	goto error_own;
+error:
+	g_propagate_error (error, err);
+error_own:
+	free_gzip_header (gzh);
+	g_object_unref (dis);
+	return FALSE;
+}
+
+/* Find the dictzip "RA" subfield in the gzip 'extra' field and decode it.
+ * On success, returns a newly allocated array with the compressed size of
+ * each chunk and stores the uncompressed chunk length and the chunk count
+ * in @chunk_length and @n_chunks.  Returns NULL with @error set when the
+ * field is malformed, and NULL with no error when it lists no chunks. */
+static guint16 *
+read_random_access_field (const gz_header *gzh,
+	gsize *chunk_length, gsize *n_chunks, GError **error)
+{
+	if (!gzh->extra)
+	{
+		g_set_error (error, DICTZIP_ERROR, DICTZIP_ERROR_INVALID_HEADER,
+			"no 'extra' field within the header");
+		return NULL;
+	}
+
+	guchar *extra_iterator = gzh->extra;
+	guchar *extra_end = gzh->extra + gzh->extra_len;
+
+	const gchar *problem = NULL;
+	guint16 *chunks = NULL;
+
+	// A subfield is a two-byte ID, a little-endian 16-bit payload length,
+	// and the payload.  Lengths are compared as sizes so that no pointer
+	// ever gets formed outside of the buffer.
+	while (extra_end - extra_iterator >= 4)
+	{
+		guchar *f = extra_iterator;
+
+		guint16 length = f[2] | (f[3] << 8);
+		if (length > extra_end - f - 4)
+		{
+			problem = "overflowing header subfield";
+			break;
+		}
+		extra_iterator += length + 4;
+
+		if (f[0] != 'R' || f[1] != 'A')
+			continue;
+
+		// The RA payload starts with the version, the chunk length and the
+		// chunk count, two bytes each; make sure they're there to be read.
+		gsize count = length >= 6 ? (gsize) (f[8] | (f[9] << 8)) : 0;
+		if (chunks != NULL)
+			problem = "multiple RA subfields present in the header";
+		else if (length < 6)
+			problem = "invalid 'extra' field, subfield too short";
+		else if ((f[4] | (f[5] << 8)) != 1)
+			problem = "unsupported RA subfield version";
+		else if ((f[6] | (f[7] << 8)) == 0)
+			problem = "invalid RA chunk length";
+		else if ((gsize) length < 6 + count * 2)
+			problem = "RA subfield overflow";
+
+		if (problem)
+			break;
+
+		// Only touch the output parameters once everything has checked out.
+		*chunk_length = f[6] | (f[7] << 8);
+		*n_chunks = count;
+
+		chunks = g_malloc_n (count, sizeof *chunks);
+		for (gsize i = 0; i < count; i++)
+			chunks[i] = f[10 + i * 2] + (f[10 + i * 2 + 1] << 8);
+	}
+
+	// Whatever is left over cannot even hold a subfield header.
+	if (!problem && extra_iterator != extra_end)
+		problem = "invalid 'extra' field, subfield too short";
+	if (problem)
+	{
+		g_set_error (error, DICTZIP_ERROR, DICTZIP_ERROR_INVALID_HEADER,
+			"%s", problem);
+		// Don't leave a stale chunk count behind for the caller's cleanup.
+		*chunk_length = *n_chunks = 0;
+		g_free (chunks);
+		return NULL;
+	}
+
+	return chunks;
+}
+
+// --- DictzipInputStream ------------------------------------------------------
+
+static void dictzip_input_stream_finalize (GObject *gobject);
+
+static void dictzip_input_stream_seekable_init
+	(GSeekableIface *iface, gpointer iface_data);
+static goffset dictzip_input_stream_tell (GSeekable *seekable);
+static gboolean dictzip_input_stream_seek (GSeekable *seekable, goffset offset,
+	GSeekType type, GCancellable *cancellable, GError **error);
+
+static gssize dictzip_input_stream_read (GInputStream *stream, void *buffer,
+	gsize count, GCancellable *cancellable, GError **error);
+static gssize dictzip_input_stream_skip (GInputStream *stream, gsize count,
+	GCancellable *cancellable, GError **error);
+
+struct dictzip_input_stream_private
+{
+	GFileInfo  * file_info;            //!< File information from gzip header
+
+	goffset      first_block_offset;   //!< Offset to the first block/chunk
+	gsize        chunk_length;         //!< Uncompressed chunk length
+	gsize        n_chunks;             //!< Number of chunks in file
+	guint16    * chunks;               //!< Chunk sizes after compression
+
+	z_stream     zs;                   //!< zlib decompression context
+	gpointer     input_buffer;         //!< Holds one compressed chunk
+
+	goffset      offset;               //!< Offset in uncompressed data
+	gpointer   * decompressed;         //!< Inflated chunks, NULL = not loaded
+	gsize        last_chunk_length;    //!< Inflated size of the last chunk
+};
+
+G_DEFINE_TYPE_EXTENDED (DictzipInputStream, dictzip_input_stream,
+	G_TYPE_FILTER_INPUT_STREAM, 0,
+	G_IMPLEMENT_INTERFACE (G_TYPE_SEEKABLE, dictzip_input_stream_seekable_init))
+
+static gboolean seekable_true  (G_GNUC_UNUSED GSeekable *x) { return TRUE;  }
+static gboolean seekable_false (G_GNUC_UNUSED GSeekable *x) { return FALSE; }
+/* Fill in the GSeekable vtable: seeking is supported, truncation is not. */
+static void
+dictzip_input_stream_seekable_init
+	(GSeekableIface *iface, G_GNUC_UNUSED gpointer iface_data)
+{
+	iface->tell            = dictzip_input_stream_tell;
+	iface->can_seek        = seekable_true;
+	iface->seek            = dictzip_input_stream_seek;
+	iface->can_truncate    = seekable_false;
+}
+
+/* Register the private data and override the read, skip and finalize hooks. */
+static void
+dictzip_input_stream_class_init (DictzipInputStreamClass *klass)
+{
+	g_type_class_add_private (klass, sizeof (DictzipInputStreamPrivate));
+	GInputStreamClass *stream_class = G_INPUT_STREAM_CLASS (klass);
+	stream_class->read_fn  = dictzip_input_stream_read;
+	stream_class->skip     = dictzip_input_stream_skip;
+
+	GObjectClass *object_class = G_OBJECT_CLASS (klass);
+	object_class->finalize = dictzip_input_stream_finalize;
+}
+/* Cache the pointer to the instance-private data in self->priv. */
+static void
+dictzip_input_stream_init (DictzipInputStream *self)
+{
+	self->priv = G_TYPE_INSTANCE_GET_PRIVATE (self,
+		DICTZIP_TYPE_INPUT_STREAM, DictzipInputStreamPrivate);
+}
+
+static void
+dictzip_input_stream_finalize (GObject *gobject)
+{
+	DictzipInputStreamPrivate *priv = DICTZIP_INPUT_STREAM (gobject)->priv;
+	if (priv->file_info) g_object_unref (priv->file_info);
+	g_free (priv->chunks);
+	g_free (priv->input_buffer);
+	inflateEnd (&priv->zs);
+
+	// A failed dictzip_input_stream_new() drops a half-built object, which
+	// may already know the chunk count while having no chunk cache yet.
+	for (guint i = 0; priv->decompressed && i < priv->n_chunks; i++)
+		g_free (priv->decompressed[i]);
+	g_free (priv->decompressed);
+	G_OBJECT_CLASS (dictzip_input_stream_parent_class)->finalize (gobject);
+}
+/* GSeekable::tell: report the offset maintained by seek() and read(). */
+static goffset
+dictzip_input_stream_tell (GSeekable *seekable)
+{
+	return DICTZIP_INPUT_STREAM (seekable)->priv->offset;
+}
+
+/* Read the compressed chunk @chunk_id from the base stream and inflate it
+ * into a newly allocated buffer of priv->chunk_length bytes.  The number of
+ * bytes produced is stored in @inflated_length.  Returns NULL on failure. */
+static gpointer
+inflate_chunk (DictzipInputStream *self,
+	guint chunk_id, gsize *inflated_length, GError **error)
+{
+	DictzipInputStreamPrivate *priv = self->priv;
+	g_return_val_if_fail (chunk_id < priv->n_chunks, NULL);
+	GInputStream *base_stream = G_FILTER_INPUT_STREAM (self)->base_stream;
+	// The chunk starts where all of the preceding compressed chunks end.
+	guint i;
+	goffset offset = priv->first_block_offset;
+	for (i = 0; i < chunk_id; i++)
+		offset += priv->chunks[i];
+
+	if (!g_seekable_seek (G_SEEKABLE (base_stream),
+		offset, G_SEEK_SET, NULL, error))
+		return NULL;
+
+	gssize read = g_input_stream_read (base_stream, priv->input_buffer,
+		priv->chunks[chunk_id], NULL, error);
+	if (read == -1)
+		return NULL;
+	if (read != priv->chunks[chunk_id])
+	{
+		g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED,
+			"premature end of file");
+		return NULL;
+	}
+
+	int z_err;
+	gpointer chunk_data = g_malloc (priv->chunk_length);
+
+	priv->zs.next_in   = (Bytef *) priv->input_buffer;
+	priv->zs.avail_in  = read;
+	priv->zs.total_in  = 0;
+	priv->zs.next_out  = (Bytef *) chunk_data;
+	priv->zs.avail_out = priv->chunk_length;
+	priv->zs.total_out = 0;
+
+	z_err = inflateReset (&priv->zs);
+	if (z_err != Z_OK)
+		goto error_zlib;
+	// NOTE(review): Z_BLOCK returns at a block boundary; confirm one suffices.
+	z_err = inflate (&priv->zs, Z_BLOCK);
+	if (z_err != Z_OK)
+		goto error_zlib;
+
+	*inflated_length = priv->zs.total_out;
+	return chunk_data;
+
+error_zlib:
+	g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED,
+		"failed to inflate the requested block: %s", zError (z_err));
+	g_free (chunk_data);
+	return NULL;
+}
+
+/* Return the inflated data of chunk @chunk_id, inflating and caching it on
+ * first use.  The length of the last chunk is recorded when it gets loaded. */
+static gpointer
+get_chunk (DictzipInputStream *self, guint chunk_id, GError **error)
+{
+	DictzipInputStreamPrivate *priv = self->priv;
+	gpointer chunk = priv->decompressed[chunk_id];
+	if (!chunk)
+	{
+		/* Just inflating the file piece by piece as needed. */
+		gsize chunk_size;
+		chunk = inflate_chunk (self, chunk_id, &chunk_size, error);
+		if (!chunk)
+			return NULL;
+		if (chunk_id + 1 == priv->n_chunks)
+			priv->last_chunk_length = chunk_size;
+		else if (chunk_size < priv->chunk_length)
+		{
+			g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED,
+				"inflated dictzip chunk is too short");
+			g_free (chunk);
+			return NULL;
+		}
+		priv->decompressed[chunk_id] = chunk;
+	}
+	return chunk;
+}
+
+/* GSeekable::seek: merely record the new offset, which is not checked against
+ * the end of the data.  Seeking relative to the end is not supported. */
+static gboolean
+dictzip_input_stream_seek (GSeekable *seekable, goffset offset,
+	GSeekType type, GCancellable *cancellable, GError **error)
+{
+	if (g_cancellable_set_error_if_cancelled (cancellable, error))
+		return FALSE;
+	if (type == G_SEEK_END)
+	{
+		/* This could be implemented by retrieving the last chunk
+		 * and deducing the filesize, should the functionality be needed. */
+		g_set_error (error, G_IO_ERROR, G_IO_ERROR_NOT_SUPPORTED,
+			"I don't know where the stream ends, cannot seek there");
+		return FALSE;
+	}
+
+	DictzipInputStream *self = DICTZIP_INPUT_STREAM (seekable);
+	goffset new_offset;
+	if (type == G_SEEK_SET)
+		new_offset = offset;
+	else if (type == G_SEEK_CUR)
+		new_offset = self->priv->offset + offset;
+	else
+		g_assert_not_reached ();
+
+	if (new_offset < 0)
+	{
+		g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED,
+			"cannot seek before the start of data");
+		return FALSE;
+	}
+
+	self->priv->offset = new_offset;
+	return TRUE;
+}
+
+/* GInputStream::read_fn: copy up to @count bytes of uncompressed data from the
+ * current offset into @buffer, inflating chunks as needed.  Returns the number
+ * of bytes copied, zero once past the end of data, or -1 with @error set. */
+static gssize
+dictzip_input_stream_read (GInputStream *stream, void *buffer,
+	gsize count, GCancellable *cancellable, GError **error)
+{
+	if (g_cancellable_set_error_if_cancelled (cancellable, error))
+		return -1;
+
+	DictzipInputStream *self = DICTZIP_INPUT_STREAM (stream);
+	DictzipInputStreamPrivate *priv = self->priv;
+	gssize read = 0;
+
+	// 64 bits, so that a far-away offset can't wrap around to a valid chunk.
+	guint64 chunk_id   = priv->offset / priv->chunk_length;
+	guint chunk_offset = priv->offset % priv->chunk_length;
+	do
+	{
+		if (chunk_id >= priv->n_chunks)
+			return read;
+
+		gpointer chunk = get_chunk (self, chunk_id, error);
+		if (!chunk)
+			return -1;
+
+		glong to_copy;
+		if (chunk_id + 1 == priv->n_chunks)
+			// Set by the call to get_chunk().
+			to_copy = priv->last_chunk_length - chunk_offset;
+		else
+			to_copy = priv->chunk_length - chunk_offset;
+		if (to_copy > (glong) count)
+			to_copy = count;
+
+		if (to_copy > 0)
+		{
+			memcpy (buffer, (guint8 *) chunk + chunk_offset, to_copy);
+			buffer = (guint8 *) buffer + to_copy;
+			priv->offset += to_copy;
+			count -= to_copy;
+			read += to_copy;
+		}
+		chunk_id++;
+		chunk_offset = 0;
+	}
+	while (count);
+	return read;
+}
+
+/* GInputStream::skip: advance the offset by seeking, inflating nothing. */
+static gssize
+dictzip_input_stream_skip (GInputStream *stream, gsize count,
+	GCancellable *cancellable, GError **error)
+{
+	if (!dictzip_input_stream_seek (G_SEEKABLE (stream), count,
+		G_SEEK_CUR, cancellable, error))
+		return -1;
+	return count;
+}
+
+/** Create an input stream for the underlying dictzip file.  @base_stream has
+ *  to be seekable and isn't closed together with the returned stream.
+ *  Returns NULL on failure, setting @error. */
+DictzipInputStream *
+dictzip_input_stream_new (GInputStream *base_stream, GError **error)
+{
+	g_return_val_if_fail (G_IS_INPUT_STREAM (base_stream), NULL);
+	if (!G_IS_SEEKABLE (base_stream)
+	 || !g_seekable_can_seek (G_SEEKABLE (base_stream)))
+	{
+		g_set_error (error, DICTZIP_ERROR, DICTZIP_ERROR_NOT_SEEKABLE,
+			"the underlying stream isn't seekable");
+		return NULL;
+	}
+
+	GError *err = NULL;
+	DictzipInputStream *self = g_object_new (DICTZIP_TYPE_INPUT_STREAM,
+		"base-stream", base_stream, "close-base-stream", FALSE, NULL);
+	DictzipInputStreamPrivate *priv = self->priv;
+
+	/* Decode the header. */
+	gz_header gzh;
+	if (!read_gzip_header (G_INPUT_STREAM (base_stream),
+		&gzh, &priv->first_block_offset, &err))
+	{
+		g_propagate_error (error, err);
+		goto error;
+	}
+
+	priv->chunks = read_random_access_field (&gzh,
+		&priv->chunk_length, &priv->n_chunks, &err);
+	if (err)
+	{
+		g_propagate_error (error, err);
+		goto error;
+	}
+	if (!priv->chunks)
+	{
+		g_set_error (error, DICTZIP_ERROR, DICTZIP_ERROR_INVALID_HEADER,
+			"not a dictzip file");
+		goto error;
+	}
+
+	/* Store file information. */
+	priv->file_info = g_file_info_new ();
+
+	if (gzh.time != 0)
+	{
+		GTimeVal m_time = { gzh.time, 0 };
+		g_file_info_set_modification_time (priv->file_info, &m_time);
+	}
+
+	if (gzh.name && *gzh.name)
+		g_file_info_set_name (priv->file_info, (gchar *) gzh.name);
+
+	/* Initialise zlib. */
+	int z_err;
+	z_err = inflateInit2 (&priv->zs, -15);
+	if (z_err != Z_OK)
+	{
+		g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED,
+			"zlib initialisation failed: %s", zError (z_err));
+		goto error;
+	}
+
+	priv->input_buffer = g_malloc (65536);
+	priv->decompressed = g_new0 (gpointer, priv->n_chunks);
+	priv->last_chunk_length = -1; // We don't know yet.
+
+	free_gzip_header (&gzh);
+	return self;
+
+error:
+	free_gzip_header (&gzh);
+	g_object_unref (self);
+	return NULL;
+}
+
+/** Return file information for the compressed file, as read from the gzip
+ *  header.  The object stays owned by the stream, no reference is added. */
+GFileInfo *
+dictzip_input_stream_get_file_info (DictzipInputStream *self)
+{
+	g_return_val_if_fail (DICTZIP_IS_INPUT_STREAM (self), NULL);
+	DictzipInputStreamPrivate *priv = self->priv;
+	return priv->file_info;
+}
diff --git a/src/dictzip-input-stream.h b/src/dictzip-input-stream.h
new file mode 100644
index 0000000..b9d039c
--- /dev/null
+++ b/src/dictzip-input-stream.h
@@ -0,0 +1,77 @@
+/*
+ * dictzip-input-stream.h: dictzip GIO stream reader
+ *
+ * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef DICTZIP_INPUT_STREAM_H
+#define DICTZIP_INPUT_STREAM_H
+
+/** Random-access dictzip reader. */
+typedef struct dictzip_input_stream          DictzipInputStream;
+typedef struct dictzip_input_stream_class    DictzipInputStreamClass;
+typedef struct dictzip_input_stream_private  DictzipInputStreamPrivate;
+
+/* GObject boilerplate. */
+#define DICTZIP_TYPE_INPUT_STREAM  (dictzip_input_stream_get_type ())
+#define DICTZIP_INPUT_STREAM(obj) \
+	(G_TYPE_CHECK_INSTANCE_CAST ((obj), \
+	DICTZIP_TYPE_INPUT_STREAM, DictzipInputStream))
+#define DICTZIP_IS_INPUT_STREAM(obj) \
+	(G_TYPE_CHECK_INSTANCE_TYPE ((obj), \
+	DICTZIP_TYPE_INPUT_STREAM))
+#define DICTZIP_INPUT_STREAM_CLASS(klass) \
+	(G_TYPE_CHECK_CLASS_CAST ((klass), \
+	DICTZIP_TYPE_INPUT_STREAM, DictzipInputStreamClass))
+#define DICTZIP_IS_INPUT_STREAM_CLASS(klass) \
+	(G_TYPE_CHECK_CLASS_TYPE ((klass), \
+	DICTZIP_TYPE_INPUT_STREAM))
+#define DICTZIP_INPUT_STREAM_GET_CLASS(obj) \
+	(G_TYPE_INSTANCE_GET_CLASS ((obj), \
+	DICTZIP_TYPE_INPUT_STREAM, DictzipInputStreamClass))
+
+// --- Errors ------------------------------------------------------------------
+
+typedef enum {
+	DICTZIP_ERROR_NOT_SEEKABLE,        //!< Underlying stream isn't seekable
+	DICTZIP_ERROR_INVALID_HEADER       //!< Error occurred while parsing header
+} DictzipError;
+
+#define DICTZIP_ERROR  (dictzip_error_quark ())
+
+GQuark dictzip_error_quark (void);
+
+// --- DictzipInputStream ------------------------------------------------------
+
+struct dictzip_input_stream
+{
+	GFilterInputStream parent_instance;
+	DictzipInputStreamPrivate *priv;
+};
+
+struct dictzip_input_stream_class
+{
+	GFilterInputStreamClass parent_class;
+};
+
+GType dictzip_input_stream_get_type (void);
+DictzipInputStream *dictzip_input_stream_new
+	(GInputStream *base_stream, GError **error);
+GFileInfo *dictzip_input_stream_get_file_info (DictzipInputStream *self);
+
+
+#endif /* ! DICTZIP_INPUT_STREAM_H */
diff --git a/src/stardict.c b/src/stardict.c
index 4e3f5bd..9a25b3e 100644
--- a/src/stardict.c
+++ b/src/stardict.c
@@ -29,48 +29,11 @@
 
 #include "stardict.h"
 #include "stardict-private.h"
+#include "utils.h"
 
 
 // --- Utilities ---------------------------------------------------------------
 
-/** Read the whole stream into a byte array. */
-static gboolean
-stream_read_all (GByteArray *ba, GInputStream *is, GError **error)
-{
-	guint8 buffer[1024 * 64];
-	gsize bytes_read;
-
-	while (g_input_stream_read_all (is, buffer, sizeof buffer,
-		&bytes_read, NULL, error))
-	{
-		g_byte_array_append (ba, buffer, bytes_read);
-		if (bytes_read < sizeof buffer)
-			return TRUE;
-	}
-	return FALSE;
-}
-
-/** Read a null-terminated string from a data input stream. */
-static gchar *
-stream_read_string (GDataInputStream *dis, GError **error)
-{
-	gsize length;
-	gchar *s = g_data_input_stream_read_upto (dis, "", 1, &length, NULL, error);
-	if (!s)
-		return NULL;
-
-	GError *err = NULL;
-	g_data_input_stream_read_byte (dis, NULL, &err);
-	if (err)
-	{
-		g_free (s);
-		g_propagate_error (error, err);
-		return NULL;
-	}
-
-	return s;
-}
-
 /** String compare function used for StarDict indexes. */
 static inline gint
 stardict_strcmp (const gchar *s1, const gchar *s2)
@@ -79,23 +42,6 @@ stardict_strcmp (const gchar *s1, const gchar *s2)
 	return a ? a : strcmp (s1, s2);
 }
 
-/** After this statement, the element has been found and its index is stored
- *  in the variable "imid". */
-#define BINARY_SEARCH_BEGIN(max, compare)                                     \
-	gint imin = 0, imax = max, imid;                                          \
-	while (imin <= imax) {                                                    \
-		imid = imin + (imax - imin) / 2;                                      \
-		gint cmp = compare;                                                   \
-		if      (cmp > 0) imin = imid + 1;                                    \
-		else if (cmp < 0) imax = imid - 1;                                    \
-		else {
-
-/** After this statement, the binary search has failed and "imin" stores
- *  the position where the element can be inserted. */
-#define BINARY_SEARCH_END                                                     \
-		}                                                                     \
-	}
-
 // --- Errors ------------------------------------------------------------------
 
 GQuark
diff --git a/src/utils.c b/src/utils.c
new file mode 100644
index 0000000..8636778
--- /dev/null
+++ b/src/utils.c
@@ -0,0 +1,63 @@
+/*
+ * utils.c: miscellaneous utilities
+ *
+ * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include "utils.h"
+
+
+/** Read the whole stream into a byte array.  Returns FALSE with @error set
+ *  on failure, in which case @ba keeps whatever had been appended so far. */
+gboolean
+stream_read_all (GByteArray *ba, GInputStream *is, GError **error)
+{
+	guint8 buffer[1024 * 64];
+	gsize bytes_read;
+	while (g_input_stream_read_all (is, buffer, sizeof buffer,
+		&bytes_read, NULL, error))
+	{
+		g_byte_array_append (ba, buffer, bytes_read);
+		if (bytes_read < sizeof buffer)
+			return TRUE;
+	}
+	return FALSE;
+}
+
+/** Read a null-terminated string from a data input stream, consuming the
+ *  terminator too.  The caller frees the result; NULL if nothing was read. */
+gchar *
+stream_read_string (GDataInputStream *dis, GError **error)
+{
+	gsize length;
+	gchar *s = g_data_input_stream_read_upto (dis, "", 1, &length, NULL, error);
+	if (!s)
+		return NULL;
+	GError *err = NULL;
+	g_data_input_stream_read_byte (dis, NULL, &err);
+	if (err)
+	{
+		g_free (s);
+		g_propagate_error (error, err);
+		return NULL;
+	}
+
+	return s;
+}
diff --git a/src/utils.h b/src/utils.h
new file mode 100644
index 0000000..61c108e
--- /dev/null
+++ b/src/utils.h
@@ -0,0 +1,44 @@
+/*
+ * utils.h: miscellaneous utilities
+ *
+ * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef UTILS_H
+#define UTILS_H
+
+/** After this statement, the element has been found and its index is stored
+ *  in the variable "imid". */
+#define BINARY_SEARCH_BEGIN(max, compare)                                     \
+	gint imin = 0, imax = max, imid;                                          \
+	while (imin <= imax) {                                                    \
+		imid = imin + (imax - imin) / 2;                                      \
+		gint cmp = compare;                                                   \
+		if      (cmp > 0) imin = imid + 1;                                    \
+		else if (cmp < 0) imax = imid - 1;                                    \
+		else {
+
+/** After this statement, the binary search has failed and "imin" stores
+ *  the position where the element can be inserted. */
+#define BINARY_SEARCH_END                                                     \
+		}                                                                     \
+	}
+
+gboolean stream_read_all (GByteArray *ba, GInputStream *is, GError **error);
+gchar *stream_read_string (GDataInputStream *dis, GError **error);
+
+#endif /* ! UTILS_H */