fiv/tools/info.c

287 lines
8.0 KiB
C

//
// info.c: acquire information about JPEG/TIFF/BMFF/WebP files in JSON format
//
// Copyright (c) 2021 - 2023, Přemysl Eric Janouch <p@janouch.name>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
#include "info.h"
#include <jv.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// --- ISO/IEC base media file format ------------------------------------------
// ISO/IEC 14496-12:2015(E), used to be publicly available, now there's only:
// https://mpeg.chiariglione.org/standards/mpeg-4/iso-base-media-file-format/text-isoiec-14496-12-5th-edition
// but people have managed to archive the final version as well:
// https://b.goeswhere.com/ISO_IEC_14496-12_2015.pdf
//
// ISO/IEC 23008-12:2017 Information technology -
// High efficiency coding and media delivery in heterogeneous environments -
// Part 12: Image File Format + Cor 1:2020 Technical Corrigendum 1
// https://standards.iso.org/ittf/PubliclyAvailableStandards/
// Record one BMFF box in the output object.
//
// Only the box type is used so far: it is wrapped in a jv string and handed
// to add_to_subarray() under the "boxes" key. The payload (data, len)
// is accepted but not yet inspected.
static jv
parse_bmff_box(jv o, const char *type, const uint8_t *data, size_t len)
{
	// The payload is intentionally unused until the TODOs below are handled.
	(void) data;
	(void) len;

	// TODO(p): Parse out "uuid"'s uint8_t[16] initial field, present as hex.
	// TODO(p): Parse out "ftyp" contents: 14496-12:2015 4.3
	// TODO(p): Parse out other important boxes: 14496-12:2015 8+
	jv box_type = jv_string(type);
	return add_to_subarray(o, "boxes", box_type);
}
// Tell whether the buffer looks like an ISO BMFF file.
//
// Returns true when at least 8 bytes are available and bytes 4..7 spell
// "ftyp", i.e., when the first box is a file type box.
static bool
detect_bmff(const uint8_t *p, size_t len)
{
	// 4.2 Object Structure--this box need not be present, nor at the beginning
	// TODO(p): What does `aligned(8)` mean? It's probably in bits.
	static const char box_type[4] = {'f', 't', 'y', 'p'};

	// Too short to even hold a box header (size + type).
	if (len < 8)
		return false;
	return memcmp(p + 4, box_type, sizeof box_type) == 0;
}
// Walk the sequence of top-level boxes in an ISO BMFF buffer, passing each
// one to parse_bmff_box().
//
// Each box starts with a 4-byte size (read with u32be()) followed by a 4-byte
// type. A size of 1 means that an 8-byte size (read with u64be()) follows
// the type; a size of 0 means that the box extends to the end of the buffer.
//
// Returns the updated object. An error is added when the buffer is not
// detected as BMFF; a warning is added, and iteration stops, on the first box
// whose framing does not fit into the buffer or is self-contradictory.
static jv
parse_bmff(jv o, const uint8_t *p, size_t len)
{
	if (!detect_bmff(p, len))
		return add_error(o, "not BMFF at all or unsupported");

	const uint8_t *end = p + len;
	while (p < end) {
		if (end - p < 8) {
			o = add_warning(o, "box framing mismatch");
			break;
		}

		// Keep a NUL-terminated copy of the four-character box type.
		char type[5] = "";
		memcpy(type, p + 4, 4);

		uint64_t box_size = u32be(p);
		const uint8_t *data = p + 8;
		if (box_size == 1) {
			if (end - p < 16) {
				o = add_warning(o, "unexpected EOF");
				break;
			}
			box_size = u64be(data);
			data += 8;
		} else if (!box_size)
			box_size = end - p;

		// The declared size includes the header. Anything smaller would make
		// the payload length wrap around, and a zero 64-bit size would
		// prevent the loop from ever advancing.
		uint64_t header_len = data - p;
		if (box_size < header_len) {
			o = add_warning(o, "invalid box size");
			break;
		}
		if (box_size > (uint64_t) (end - p)) {
			o = add_warning(o, "unexpected EOF");
			break;
		}

		size_t data_len = box_size - header_len;
		o = parse_bmff_box(o, type, data, data_len);
		p += box_size;
	}
	return o;
}
// --- WebP --------------------------------------------------------------------
// libwebp won't let us simply iterate over all chunks, so handroll it.
//
// https://github.com/webmproject/libwebp/blob/master/doc/webp-container-spec.txt
// https://github.com/webmproject/libwebp/blob/master/doc/webp-lossless-bitstream-spec.txt
// https://datatracker.ietf.org/doc/html/rfc6386
//
// Pretty versions, hopefully not outdated:
// https://developers.google.com/speed/webp/docs/riff_container
// https://developers.google.com/speed/webp/docs/webp_lossless_bitstream_specification
// Tell whether the buffer looks like a WebP file.
//
// Returns true when at least 12 bytes are available, the first four are
// "RIFF", and bytes 8..11 are "WEBP".
static bool
detect_webp(const uint8_t *p, size_t len)
{
	// The signature occupies the first 12 bytes; don't read a shorter buffer.
	if (len < 12)
		return false;

	bool has_riff_tag = memcmp(p, "RIFF", 4) == 0;
	bool has_webp_tag = memcmp(p + 8, "WEBP", 4) == 0;
	return has_riff_tag && has_webp_tag;
}
// Extract the image dimensions from the payload of a "VP8 " chunk.
//
// The payload must be at least 10 bytes long, have bit 0 of the first byte
// clear (key frame), and carry the bytes 9d 01 2a at offsets 3..5; otherwise
// a warning is added and nothing else is set. On success, "width" and
// "height" are set from the low 14 bits of the u16le() values at offsets
// 6 and 8, respectively.
static jv
parse_webp_vp8(jv o, const uint8_t *p, size_t len)
{
	if (len < 10)
		return add_warning(o, "invalid VP8 chunk");

	bool is_key_frame = (p[0] & 1) == 0;
	bool has_start_code = p[3] == 0x9d && p[4] == 0x01 && p[5] == 0x2a;
	if (!is_key_frame || !has_start_code)
		return add_warning(o, "invalid VP8 chunk");

	// The upper two bits of each field are masked away.
	int width = u16le(p + 6) & 0x3fff;
	o = jv_set(o, jv_string("width"), jv_number(width));
	int height = u16le(p + 8) & 0x3fff;
	o = jv_set(o, jv_string("height"), jv_number(height));
	return o;
}
// Extract the image dimensions and alpha flag from the payload of a "VP8L"
// chunk.
//
// The payload must be at least 5 bytes long and start with the byte 0x2f;
// otherwise a warning is added and nothing else is set. The four bytes that
// follow are read with u32le() and split into: 14 bits of width minus one,
// 14 bits of height minus one, and one bit for "alpha_is_used".
static jv
parse_webp_vp8l(jv o, const uint8_t *p, size_t len)
{
	if (len < 5)
		return add_warning(o, "invalid VP8L chunk");
	if (p[0] != 0x2f)
		return add_warning(o, "invalid VP8L chunk");

	// Reading LSB-first from a little endian value means reading in order.
	uint32_t bits = u32le(p + 1);
	uint32_t width = (bits & 0x3fff) + 1;
	uint32_t height = ((bits >> 14) & 0x3fff) + 1;
	uint32_t alpha = (bits >> 28) & 1;

	o = jv_set(o, jv_string("width"), jv_number(width));
	o = jv_set(o, jv_string("height"), jv_number(height));
	o = jv_set(o, jv_string("alpha_is_used"), jv_bool(alpha));
	return o;
}
// Extract the animation flag from the payload of a "VP8X" chunk.
//
// The payload must be at least 10 bytes long, otherwise a warning is added.
// "animation" is set from bit 1 of the first payload byte.
static jv
parse_webp_vp8x(jv o, const uint8_t *p, size_t len)
{
	if (len < 10)
		return add_warning(o, "invalid VP8X chunk");

	// Most of the fields in this chunk are duplicate or inferrable.
	// Probably not worth decoding or verifying.
	// TODO(p): For animations, we need to use the width and height from here.
	bool animated = (p[0] & 0x02) != 0;
	return jv_set(o, jv_string("animation"), jv_bool(animated));
}
// Iterate over the chunks of a WebP (RIFF) buffer.
//
// The four-character code of every well-framed chunk is appended to an array
// that ends up under the "chunks" key. Payloads of "VP8 ", "VP8L", "VP8X",
// "EXIF" and "ICCP" chunks are passed on to their respective parsers.
//
// Returns the updated object. An error is added, and nothing is iterated,
// when the buffer is not detected as WebP, or when the RIFF header declares
// more data than the buffer holds. A warning is added, and iteration stops,
// on the first chunk that does not fit within the declared RIFF size.
static jv
parse_webp(jv o, const uint8_t *p, size_t len)
{
	if (!detect_webp(p, len))
		return add_error(o, "not a WEBP file");

	// TODO(p): This can still be parseable.
	// TODO(p): Warn on trailing data.
	// detect_webp() has ensured len >= 12, so the subtraction can't wrap,
	// and comparing this way avoids overflowing 8 + size in 32 bits.
	// The declared size must not reach beyond the buffer, or the loop below
	// would read out of bounds.
	uint32_t size = u32le(p + 4);
	if (size > len - 8)
		return add_error(o, "truncated file");

	const uint8_t *end = p + 8 + size;
	p += 12;

	jv chunks = jv_array();
	while (p < end) {
		size_t remaining = end - p;
		if (remaining < 8) {
			o = add_warning(o, "framing mismatch");
			break;
		}

		// Payloads are padded to an even length. Compute the padded length
		// in 64 bits, so that a chunk size of 0xFFFFFFFF does not wrap to 0
		// and slip past the bounds check, and compare lengths rather than
		// pointers, so that no out-of-range pointer is ever formed.
		uint32_t chunk_size = u32le(p + 4);
		uint64_t chunk_advance = ((uint64_t) chunk_size + 1) & ~(uint64_t) 1;
		if (chunk_advance > remaining - 8) {
			o = add_warning(o, "runaway chunk payload");
			break;
		}

		// Keep a NUL-terminated copy of the chunk's four-character code.
		char fourcc[5] = "";
		memcpy(fourcc, p, 4);
		chunks = jv_array_append(chunks, jv_string(fourcc));
		p += 8;

		// TODO(p): Decode more chunks.
		if (!strcmp(fourcc, "VP8 "))
			o = parse_webp_vp8(o, p, chunk_size);
		if (!strcmp(fourcc, "VP8L"))
			o = parse_webp_vp8l(o, p, chunk_size);
		if (!strcmp(fourcc, "VP8X"))
			o = parse_webp_vp8x(o, p, chunk_size);
		if (!strcmp(fourcc, "EXIF"))
			o = parse_exif(o, p, chunk_size);
		if (!strcmp(fourcc, "ICCP"))
			o = parse_icc(o, p, chunk_size);

		p += chunk_advance;
	}
	return jv_set(o, jv_string("chunks"), chunks);
}
// --- I/O ---------------------------------------------------------------------
static struct {
const char *name;
bool (*detect) (const uint8_t *, size_t);
jv (*parse) (jv, const uint8_t *, size_t);
} formats[] = {
{"JPEG", detect_jpeg, parse_jpeg},
{"TIFF", detect_tiff, parse_tiff},
{"BMFF", detect_bmff, parse_bmff},
{"WebP", detect_webp, parse_webp},
};
// Parse the buffer with the first entry of the formats table whose detection
// predicate accepts it.
//
// When the INFO_IDENTIFY environment variable is set, the name of the matched
// format is additionally stored under the "format" key. When no entry
// matches, an error is added instead.
static jv
parse_any(jv o, const uint8_t *p, size_t len)
{
	// TODO(p): Also see if the file extension is appropriate.
	const size_t n_formats = sizeof formats / sizeof formats[0];

	size_t match = 0;
	while (match < n_formats && !formats[match].detect(p, len))
		match++;
	if (match == n_formats)
		return add_error(o, "unsupported file format");

	if (getenv("INFO_IDENTIFY") != NULL) {
		jv name = jv_string(formats[match].name);
		o = jv_set(o, jv_string("format"), name);
	}
	return formats[match].parse(o, p, len);
}
// Read the named file fully into memory and run parse_any() over it.
//
// Returns the updated object. When the file cannot be opened, read,
// or buffered in memory, the corresponding strerror() message is added
// through add_error(), and no parsing takes place.
static jv
do_file(const char *filename, jv o)
{
	const char *err = NULL;
	FILE *fp = fopen(filename, "rb");
	if (!fp) {
		err = strerror(errno);
		goto error;
	}

	uint8_t *data = NULL, buf[256 << 10];
	size_t n, len = 0;
	while ((n = fread(buf, sizeof *buf, sizeof buf / sizeof *buf, fp))) {
		// Go through a temporary, so that a failed reallocation neither
		// leaks the old buffer nor leads to copying into a null pointer.
		uint8_t *grown = realloc(data, len + n);
		if (!grown) {
			err = strerror(ENOMEM);
			goto error_read;
		}

		data = grown;
		memcpy(data + len, buf, n);
		len += n;
	}
	if (ferror(fp)) {
		err = strerror(errno);
		goto error_read;
	}

#if 0
	// Not sure if I want to ensure their existence...
	o = jv_object_set(o, jv_string("info"), jv_array());
	o = jv_object_set(o, jv_string("warnings"), jv_array());
#endif

	o = parse_any(o, data, len);

error_read:
	fclose(fp);
	free(data);
error:
	if (err)
		o = add_error(o, err);
	return o;
}
// Process every command line argument as a file name, writing one JSON
// object per file to standard output, each followed by a newline.
// Always returns 0; per-file failures are reported within the objects.
int
main(int argc, char *argv[])
{
	// XXX: Can't use `xargs -P0`, there's a risk of non-atomic writes.
	// Usage: find . -print0 | xargs -0 ./info
	int index = 1;
	while (index < argc) {
		const char *path = argv[index];

		jv record = jv_object();
		record = jv_object_set(record, jv_string("filename"), jv_string(path));
		record = do_file(path, record);

		// JV_PRINT_SORTED would discard information, so pass no flags.
		jv_dumpf(record, stdout, 0);
		putchar('\n');

		index++;
	}
	return 0;
}