Add a deep tagger in C++

2024-01-07 23:26:05 +01:00
parent 054078908a
commit b4f28814b7
7 changed files with 927 additions and 1 deletions
--- a/2
+++ b/2
@@ -1,4 +1,4 @@
-Copyright (c) 2023, Přemysl Eric Janouch <p@janouch.name>
+Copyright (c) 2023 - 2024, Přemysl Eric Janouch <p@janouch.name>
 Permission to use, copy, modify, and/or distribute this software for any
 purpose with or without fee is hereby granted.
--- a/deeptagger/CMakeLists.txt
+++ b/deeptagger/CMakeLists.txt
@@ -0,0 +1,20 @@
 # Build script for the deeptagger executable.
 # Ubuntu 20.04 LTS
 cmake_minimum_required (VERSION 3.16)
 project (deeptagger VERSION 0.0.1 LANGUAGES CXX)
 # Hint: set ONNXRuntime_ROOT to a directory with a pre-built GitHub release.
 # (Useful for development, otherwise you may need to adjust the rpath.)
 # The module path points at this directory so that FindONNXRuntime.cmake,
 # which lives next to this file, is picked up by find_package().
 set (CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}")
 find_package (ONNXRuntime REQUIRED)
 # GraphicsMagick++ is located through pkg-config, results use the GM_ prefix.
 find_package (PkgConfig REQUIRED)
 pkg_check_modules (GM REQUIRED GraphicsMagick++)
 # A single translation unit, compiled as C++17 or newer.
 add_executable (deeptagger deeptagger.cpp)
 target_compile_features (deeptagger PRIVATE cxx_std_17)
 target_include_directories (deeptagger PRIVATE
 	${GM_INCLUDE_DIRS} ${ONNXRuntime_INCLUDE_DIRS})
 target_link_directories (deeptagger PRIVATE
 	${GM_LIBRARY_DIRS})
 target_link_libraries (deeptagger PRIVATE
 	${GM_LIBRARIES} ${ONNXRuntime_LIBRARIES})
--- a/deeptagger/FindONNXRuntime.cmake
+++ b/deeptagger/FindONNXRuntime.cmake
@@ -0,0 +1,11 @@
 # Public Domain
 #
 # Find module for ONNX Runtime.  On success, it sets:
 #   ONNXRuntime_FOUND         - set by find_package_handle_standard_args()
 #   ONNXRuntime_INCLUDE_DIRS  - directory containing onnxruntime_c_api.h
 #   ONNXRuntime_LIBRARIES     - path to the onnxruntime library
 #
 # The header is also searched for in an "onnxruntime" subdirectory
 # of the usual include locations.
 find_path (ONNXRuntime_INCLUDE_DIRS onnxruntime_c_api.h
 	PATH_SUFFIXES onnxruntime)
 find_library (ONNXRuntime_LIBRARIES NAMES onnxruntime)
 # Both variables must have been found for the package to count as found.
 include (FindPackageHandleStandardArgs)
 FIND_PACKAGE_HANDLE_STANDARD_ARGS (ONNXRuntime DEFAULT_MSG
 	ONNXRuntime_INCLUDE_DIRS ONNXRuntime_LIBRARIES)
 # Keep the cache entries out of the default view of CMake GUIs.
 mark_as_advanced (ONNXRuntime_LIBRARIES ONNXRuntime_INCLUDE_DIRS)
--- a/deeptagger/README.adoc
+++ b/deeptagger/README.adoc
@@ -0,0 +1,25 @@
 deeptagger
 ==========
 This is an automatic image tagger/classifier written in C++,
 without using any Python, and primarily targets various anime models.
 Unfortunately, you will still need Python and some luck to prepare the models,
 which is done by running download.sh.  You will need about 20 gigabytes of space.
 Very little effort is made to make this work on non-Unix systems.
 Getting this to work
 --------------------
 To build the evaluator, install a C++ compiler, CMake, and development packages
 of GraphicsMagick and ONNX Runtime.
 Prebuilt ONNX Runtime can be most conveniently downloaded from
 https://github.com/microsoft/onnxruntime/releases[GitHub releases].
 Remember to install CUDA packages, such as _nvidia-cudnn_ on Debian,
 if you plan on using the GPU-enabled options.
 $ cmake -DONNXRuntime_ROOT=/path/to/onnxruntime -B build
 $ cmake --build build
 $ ./download.sh
 $ build/deeptagger models/deepdanbooru-v3-20211112-sgd-e28.model image.jpg
--- a/deeptagger/bench.sh
+++ b/deeptagger/bench.sh
@@ -0,0 +1,38 @@
 #!/bin/sh -e
 # Benchmarks the deeptagger binary over all models found in models/,
 # appending tab-separated timing records to bench.out.
 #
 # At least two arguments are needed: an executable and one or more files.
 if [ $# -lt 2 ] || ! [ -x "$1" ]
 then
 	echo "Usage: $0 DEEPTAGGER FILE..."
 	echo "Run this after using download.sh, from the same directory."
 	exit 1
 fi
 # The remaining positional parameters are the files to tag.
 runner=$1
 shift
 # Start with an empty log on each invocation.
 log=bench.out
 : >$log
 # run OPTS BATCH MODEL FILE...
 #
 # Times three consecutive runs of $runner over the given files,
 # with whole-second resolution, and records each of them in $log as:
 # model name, model path, options, batch size, elapsed seconds.
 # $name is expected to have been set by the caller.
 run() {
 	opts=$1
 	batch=$2
 	model=$3
 	shift 3
 	for i in $(seq 1 3)
 	do
 		start=$(date +%s)
 		# $opts is intentionally unquoted, so that an empty value vanishes.
 		# Failures of the runner must not abort the benchmark (sh -e).
 		if ! "$runner" $opts -b "$batch" -t 0.75 "$model" "$@" >/dev/null
 		then :
 		fi
 		end=$(date +%s)
 		printf '%s\t%s\t%s\t%s\t%s\n' \
 			"$name" "$model" "$opts" "$batch" "$((end - start))" | tee -a $log
 	done
 }
 # Measure every model with batch sizes of 1, 4, and 16,
 # first with default options, then with --cpu.
 for model in models/*.model
 do
 	# The human-readable name comes from the "name=" line of the model file.
 	name=$(sed -n 's/^name=//p' "$model")
 	run ""     1 "$model" "$@"
 	run ""     4 "$model" "$@"
 	run ""    16 "$model" "$@"
 	run --cpu  1 "$model" "$@"
 	run --cpu  4 "$model" "$@"
 	run --cpu 16 "$model" "$@"
 done
--- a/deeptagger/deeptagger.cpp
+++ b/deeptagger/deeptagger.cpp
@@ -0,0 +1,671 @@
 #include <getopt.h>
 #include <Magick++.h>
 #include <onnxruntime_cxx_api.h>
 #ifdef __APPLE__
 #include <coreml_provider_factory.h>
 #endif
 #include <algorithm>
 #include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <regex>
 #include <set>
 #include <stdexcept>
 #include <string>
 #include <tuple>
 #include <vector>
 #include <cerrno>
 #include <climits>
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 // Global program settings, filled in from the command line by main().
 static struct {
 	// Set by --cpu: add_providers() then registers no execution providers.
 	bool cpu = false;
 	// Incremented by each -d: 1 enables diagnostics, 2 verbose ONNX Runtime
 	// logging, 3 additionally displays each preprocessed image.
 	int debug = 0;
 	// Set by -b: the number of images passed to the model at once.
 	long batch = 1;
 	// Set by -t: only tags scoring strictly above this value are printed.
 	float threshold = 0.1;
 	// Execution provider name → Key → Value
 	std::map<std::string, std::map<std::string, std::string>> options;
 } g;
 // --- Configuration -----------------------------------------------------------
 // Arguably, input normalization could be incorporated into models instead.
 // Describes a model and the preprocessing that it requires,
 // as read from a .model file by read_config().
 struct Config {
 	// Value of the "name" key; it is not otherwise used by this program.
 	std::string name;
 	// Layout of the input tensor, selecting image_to_nhwc()/image_to_nchw().
 	enum class Shape {NHWC, NCHW} shape = Shape::NHWC;
 	// Order in which colour components are written into the tensor.
 	enum class Channels {RGB, BGR} channels = Channels::RGB;
 	// Whether run() divides the 0..255 input values by 255.
 	bool normalize = false;
 	// How load() fits images into the model's input dimensions.
 	enum class Pad {WHITE, EDGE, STRETCH} pad = Pad::WHITE;
 	// Substitute for a variable input height/width; -1 means unset.
 	int size = -1;
 	// Whether run() passes model outputs through the logistic function.
 	bool sigmoid = false;
 	// Tag names, one per element of the model's output for a single image.
 	std::vector<std::string> tags;
 };
 // Appends all lines of the file at @a path to @a tags, one tag per line,
 // stripping a trailing carriage return from each, if present.
 // Throws std::runtime_error when the file cannot be opened,
 // and std::ios_base::failure on irrecoverable stream errors.
 static void
 read_tags(const std::string &path, std::vector<std::string> &tags)
 {
 	std::ifstream input(path);
 	input.exceptions(std::ifstream::badbit);
 	if (input.fail())
 		throw std::runtime_error("cannot read tags");
 	for (std::string tag; std::getline(input, tag); tags.push_back(tag)) {
 		// Tolerate files with CRLF line endings.
 		if (!tag.empty() && tag.back() == '\r')
 			tag.pop_back();
 	}
 }
 static void
 read_field(Config &config, std::string key, std::string value)
 {
 	if (key == "name") {
 		config.name = value;
 	} else if (key == "shape") {
 		if      (value == "nhwc")    config.shape = Config::Shape::NHWC;
 		else if (value == "nchw")    config.shape = Config::Shape::NCHW;
 		else throw std::invalid_argument("bad value for: " + key);
 	} else if (key == "channels") {
 		if      (value == "rgb")     config.channels = Config::Channels::RGB;
 		else if (value == "bgr")     config.channels = Config::Channels::BGR;
 		else throw std::invalid_argument("bad value for: " + key);
 	} else if (key == "normalize") {
 		if      (value == "true")    config.normalize = true;
 		else if (value == "false")   config.normalize = false;
 		else throw std::invalid_argument("bad value for: " + key);
 	} else if (key == "pad") {
 		if      (value == "white")   config.pad = Config::Pad::WHITE;
 		else if (value == "edge")    config.pad = Config::Pad::EDGE;
 		else if (value == "stretch") config.pad = Config::Pad::STRETCH;
 		else throw std::invalid_argument("bad value for: " + key);
 	} else if (key == "size") {
 		config.size = std::stoi(value);
 	} else if (key == "interpret") {
 		if      (value == "false")   config.sigmoid = false;
 		else if (value == "sigmoid") config.sigmoid = true;
 		else throw std::invalid_argument("bad value for: " + key);
 	} else {
 		throw std::invalid_argument("unsupported config key: " + key);
 	}
 }
 // Loads the model description at @a path into @a config,
 // then loads tags from the file of the same name with a "tags" extension.
 //
 // The description consists of "key = value" lines, where anything following
 // a '#' is a comment.  Lines that don't have this form are skipped.
 //
 // Throws std::runtime_error when a file cannot be opened,
 // and whatever read_field() throws for unacceptable fields.
 static void
 read_config(Config &config, const char *path)
 {
 	std::ifstream f(path);
 	f.exceptions(std::ifstream::badbit);
 	if (!f)
 		throw std::runtime_error("cannot read configuration");
 	std::regex re(R"(^\s*([^#=]+?)\s*=\s*([^#]*?)\s*(?:#|$))",
 		std::regex::optimize);
 	std::smatch m;
 	std::string line;
 	while (std::getline(f, line)) {
 		// The pattern ends at the first '#', so it has to be searched for:
 		// std::regex_match() demands that the whole line be consumed,
 		// which would silently drop any field followed by a comment.
 		if (std::regex_search(line, m, re))
 			read_field(config, m[1].str(), m[2].str());
 	}
 	read_tags(
 		std::filesystem::path(path).replace_extension("tags"), config.tags);
 }
 // --- Data preparation --------------------------------------------------------
 // Writes all pixels of @a image to @a data in row-major order,
 // with the three colour components of each pixel interleaved
 // in the order given by @a channels, as 8-bit values converted to float.
 // Returns a pointer just past the last written element.
 static float *
 image_to_nhwc(float *data, Magick::Image &image, Config::Channels channels)
 {
 	const unsigned int columns = image.columns();
 	const unsigned int rows = image.rows();
 	const auto *source = image.getConstPixels(0, 0, columns, rows);
 	const size_t count = size_t(columns) * rows;
 	switch (channels) {
 	case Config::Channels::RGB:
 		for (size_t i = 0; i < count; i++) {
 			const auto &pixel = source[i];
 			data[0] = ScaleQuantumToChar(pixel.red);
 			data[1] = ScaleQuantumToChar(pixel.green);
 			data[2] = ScaleQuantumToChar(pixel.blue);
 			data += 3;
 		}
 		break;
 	case Config::Channels::BGR:
 		for (size_t i = 0; i < count; i++) {
 			const auto &pixel = source[i];
 			data[0] = ScaleQuantumToChar(pixel.blue);
 			data[1] = ScaleQuantumToChar(pixel.green);
 			data[2] = ScaleQuantumToChar(pixel.red);
 			data += 3;
 		}
 	}
 	return data;
 }
 static float *
 image_to_nchw(float *data, Magick::Image &image, Config::Channels channels)
 {
 	unsigned int width = image.columns();
 	unsigned int height = image.rows();
 	auto pixels = image.getConstPixels(0, 0, width, height), pp = pixels;
 	switch (channels) {
 	case Config::Channels::RGB:
 		for (unsigned int y = 0; y < height; y++)
 			for (unsigned int x = 0; x < width; x++)
 				*data++ = ScaleQuantumToChar((*pp++).red);
 		pp = pixels;
 		for (unsigned int y = 0; y < height; y++)
 			for (unsigned int x = 0; x < width; x++)
 				*data++ = ScaleQuantumToChar((*pp++).green);
 		pp = pixels;
 		for (unsigned int y = 0; y < height; y++)
 			for (unsigned int x = 0; x < width; x++)
 				*data++ = ScaleQuantumToChar((*pp++).blue);
 		break;
 	case Config::Channels::BGR:
 		for (unsigned int y = 0; y < height; y++)
 			for (unsigned int x = 0; x < width; x++)
 				*data++ = ScaleQuantumToChar((*pp++).blue);
 		pp = pixels;
 		for (unsigned int y = 0; y < height; y++)
 			for (unsigned int x = 0; x < width; x++)
 				*data++ = ScaleQuantumToChar((*pp++).green);
 		pp = pixels;
 		for (unsigned int y = 0; y < height; y++)
 			for (unsigned int x = 0; x < width; x++)
 				*data++ = ScaleQuantumToChar((*pp++).red);
 	}
 	return data;
 }
 // Reads the image at @a filename and fits it into @a width x @a height
 // pixels, in the manner requested by @a config.pad:
 //  - WHITE: larger images are shrunk, keeping their aspect ratio,
 //    and the result is centered on a white background,
 //  - EDGE: as above, but the surroundings repeat the image's edge pixels,
 //  - STRETCH: the image is resized to the exact dimensions,
 //    disregarding its aspect ratio.
 //
 // Any transparency is flattened against white.  The returned image
 // has its file name set to @a filename.
 //
 // Warnings from reading are only printed when debugging,
 // other Magick++ exceptions propagate to the caller.
 static Magick::Image
 load(const std::string &filename,
 	const Config &config, int64_t width, int64_t height)
 {
 	Magick::Image image;
 	try {
 		image.read(filename);
 	} catch (const Magick::Warning &warning) {
 		if (g.debug)
 			fprintf(stderr, "%s: %s\n", filename.c_str(), warning.what());
 	}
 	image.autoOrient();
 	Magick::Geometry adjusted(width, height);
 	switch (config.pad) {
 	case Config::Pad::EDGE:
 	case Config::Pad::WHITE:
 		// The ">" geometry flag: only ever shrink.
 		adjusted.greater(true);
 		break;
 	case Config::Pad::STRETCH:
 		// The "!" geometry flag: ignore the aspect ratio.
 		// Passing false here merely restates the default,
 		// which is to preserve it.
 		adjusted.aspect(true);
 	}
 	image.resize(adjusted, Magick::LanczosFilter);
 	// The GraphicsMagick API doesn't offer any good options.
 	if (config.pad == Config::Pad::EDGE) {
 		// Reading outside of the image yields its nearest edge pixels.
 		MagickLib::SetImageVirtualPixelMethod(
 			image.image(), MagickLib::EdgeVirtualPixelMethod);
 		auto x = (int64_t(image.columns()) - width) / 2;
 		auto y = (int64_t(image.rows()) - height) / 2;
 		auto source = image.getConstPixels(x, y, width, height);
 		std::vector<MagickLib::PixelPacket>
 			pixels(source, source + width * height);
 		Magick::Image edged(Magick::Geometry(width, height), "black");
 		edged.classType(Magick::DirectClass);
 		auto target = edged.setPixels(0, 0, width, height);
 		std::copy(pixels.begin(), pixels.end(), target);
 		edged.syncPixels();
 		image = edged;
 	}
 	// Center it in a square patch of white, removing any transparency.
 	// image.extent() could probably be used to do the same thing.
 	Magick::Image white(Magick::Geometry(width, height), "white");
 	auto x = (white.columns() - image.columns()) / 2;
 	auto y = (white.rows() - image.rows()) / 2;
 	white.composite(image, x, y, Magick::OverCompositeOp);
 	white.fileName(filename);
 	if (g.debug > 2)
 		white.display();
 	return white;
 }
 // --- Inference ---------------------------------------------------------------
 // Runs the model over a batch of preprocessed @a images,
 // and prints one "FILENAME<TAB>SCORE<TAB>TAG" line to standard output
 // for each tag scoring above g.threshold.
 //
 // @a shape is the shape of the input tensor; its first dimension
 // is overwritten with the number of images, the remaining dimensions
 // are assumed to agree with the dimensions of the images.
 //
 // Unexpected model outputs are reported on standard error,
 // and nothing is printed for the batch then.
 static void
 run(std::vector<Magick::Image> &images, const Config &config,
 	Ort::Session &session, std::vector<int64_t> shape)
 {
 	auto batch = shape[0] = images.size();
 	Ort::AllocatorWithDefaultOptions allocator;
 	auto tensor = Ort::Value::CreateTensor<float>(
 		allocator, shape.data(), shape.size());
 	auto input_len = tensor.GetTensorTypeAndShapeInfo().GetElementCount();
 	auto input_data = tensor.GetTensorMutableData<float>(), pi = input_data;
 	// Images are laid out one after another, in the configured layout.
 	for (int64_t i = 0; i < batch; i++) {
 		switch (config.shape) {
 		case Config::Shape::NCHW:
 			pi = image_to_nchw(pi, images.at(i), config.channels);
 			break;
 		case Config::Shape::NHWC:
 			pi = image_to_nhwc(pi, images.at(i), config.channels);
 		}
 	}
 	// Bring the 0..255 values down to 0..1, if the model wants that.
 	if (config.normalize) {
 		pi = input_data;
 		for (size_t i = 0; i < input_len; i++)
 			*pi++ /= 255.0;
 	}
 	// The names are copied, since the allocated strings are temporaries.
 	std::string input_name =
 		session.GetInputNameAllocated(0, allocator).get();
 	std::string output_name =
 		session.GetOutputNameAllocated(0, allocator).get();
 	std::vector<const char *> input_names = {input_name.c_str()};
 	std::vector<const char *> output_names = {output_name.c_str()};
 	auto outputs = session.Run(Ort::RunOptions{},
 		input_names.data(), &tensor, input_names.size(),
 		output_names.data(), output_names.size());
 	if (outputs.size() != 1 || !outputs[0].IsTensor()) {
 		fprintf(stderr, "Wrong output\n");
 		return;
 	}
 	auto output_len = outputs[0].GetTensorTypeAndShapeInfo().GetElementCount();
 	auto output_data = outputs.front().GetTensorData<float>(), po = output_data;
 	// There must be exactly one output value per image and tag.
 	if (output_len != batch * config.tags.size()) {
 		fprintf(stderr, "Tags don't match the output\n");
 		return;
 	}
 	for (size_t i = 0; i < batch; i++) {
 		for (size_t t = 0; t < config.tags.size(); t++) {
 			float value = *po++;
 			if (config.sigmoid)
 				value = 1 / (1 + std::exp(-value));
 			if (value > g.threshold) {
 				printf("%s\t%.2f\t%s\n", images.at(i).fileName().c_str(),
 					value, config.tags.at(t).c_str());
 			}
 		}
 	}
 }
 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 // Parses an "EP;KEY=VALUE;KEY=VALUE..." command line argument,
 // and stores the key/value pairs in g.options under the name
 // of the execution provider, replacing any previous options for it.
 //
 // A bare "EP" without any options is accepted,
 // and results in an empty set of options.
 static void
 parse_options(const std::string &options)
 {
 	const auto semicolon = options.find(';');
 	auto name = options.substr(0, semicolon);
 	// Without a semicolon, find() returns npos,
 	// and substr(npos) would throw std::out_of_range.
 	const auto sequence = semicolon == std::string::npos
 		? std::string()
 		: options.substr(semicolon);
 	std::map<std::string, std::string> kv;
 	std::regex re(R"(;*([^;=]+)=([^;=]+))", std::regex::optimize);
 	std::sregex_iterator it(sequence.begin(), sequence.end(), re), end;
 	for (; it != end; ++it)
 		kv[it->str(1)] = it->str(2);
 	g.options.insert_or_assign(std::move(name), std::move(kv));
 }
 // Returns parallel arrays of option keys and values for @a provider,
 // in the form expected by ONNX Runtime's Update*ProviderOptions() functions.
 // Both arrays are empty when no options were given for the provider.
 //
 // The pointers refer to strings owned by g.options,
 // and remain valid only for as long as those stay unmodified.
 static std::tuple<std::vector<const char *>, std::vector<const char *>>
 unpack_options(const std::string &provider)
 {
 	std::vector<const char *> keys, values;
 	const auto found = g.options.find(provider);
 	if (found != g.options.end()) {
 		for (const auto &[key, value] : found->second) {
 			keys.push_back(key.c_str());
 			values.push_back(value.c_str());
 		}
 	}
 	return {keys, values};
 }
 // Registers accelerated execution providers with @a options,
 // unless g.cpu is set.  Of those reported as available, this tries,
 // in order: CoreML (on Apple systems), TensorRT (only when compiled
 // with a non-zero TENSORRT macro), CUDA, and ROCm.
 //
 // User-supplied options from g.options are applied to TensorRT, CUDA,
 // and ROCm.  Failures to append a provider are reported on standard error
 // and otherwise ignored, whereas failures to create or update
 // provider options propagate as exceptions.
 static void
 add_providers(Ort::SessionOptions &options)
 {
 	// Bind by reference: a plain "auto" would copy the entire API table.
 	const auto &api = Ort::GetApi();
 	auto v_providers = Ort::GetAvailableProviders();
 	std::set<std::string> providers(v_providers.begin(), v_providers.end());
 	if (g.debug) {
 		printf("Providers:");
 		for (const auto &it : providers)
 			printf(" %s", it.c_str());
 		printf("\n");
 	}
 	// There is a string-based AppendExecutionProvider() method,
 	// but it cannot be used with all providers.
 	// TODO: Make it possible to disable providers.
 	// TODO: Providers will deserve some performance tuning.
 	if (g.cpu)
 		return;
 #ifdef __APPLE__
 	if (providers.count("CoreMLExecutionProvider")) {
 		try {
 			Ort::ThrowOnError(
 				OrtSessionOptionsAppendExecutionProvider_CoreML(options, 0));
 		} catch (const std::exception &e) {
 			fprintf(stderr, "CoreML unavailable: %s\n", e.what());
 		}
 	}
 #endif
 #if TENSORRT
 	// TensorRT should be the more performant execution provider, however:
 	//  - it is difficult to set up (needs logging in to download),
 	//  - with WD v1.4 ONNX models, one gets "Your ONNX model has been generated
 	//    with INT64 weights, while TensorRT does not natively support INT64.
 	//    Attempting to cast down to INT32." and that's not nice.
 	if (providers.count("TensorrtExecutionProvider")) {
 		OrtTensorRTProviderOptionsV2* tensorrt_options = nullptr;
 		Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options));
 		auto [keys, values] = unpack_options("TensorrtExecutionProvider");
 		if (!keys.empty()) {
 			Ort::ThrowOnError(api.UpdateTensorRTProviderOptions(
 				tensorrt_options, keys.data(), values.data(), keys.size()));
 		}
 		try {
 			options.AppendExecutionProvider_TensorRT_V2(*tensorrt_options);
 		} catch (const std::exception &e) {
 			fprintf(stderr, "TensorRT unavailable: %s\n", e.what());
 		}
 		api.ReleaseTensorRTProviderOptions(tensorrt_options);
 	}
 #endif
 	// See CUDA-ExecutionProvider.html for documentation.
 	if (providers.count("CUDAExecutionProvider")) {
 		OrtCUDAProviderOptionsV2* cuda_options = nullptr;
 		Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));
 		auto [keys, values] = unpack_options("CUDAExecutionProvider");
 		if (!keys.empty()) {
 			Ort::ThrowOnError(api.UpdateCUDAProviderOptions(
 				cuda_options, keys.data(), values.data(), keys.size()));
 		}
 		try {
 			options.AppendExecutionProvider_CUDA_V2(*cuda_options);
 		} catch (const std::exception &e) {
 			fprintf(stderr, "CUDA unavailable: %s\n", e.what());
 		}
 		api.ReleaseCUDAProviderOptions(cuda_options);
 	}
 	if (providers.count("ROCMExecutionProvider")) {
 		OrtROCMProviderOptions rocm_options = {};
 		auto [keys, values] = unpack_options("ROCMExecutionProvider");
 		if (!keys.empty()) {
 			Ort::ThrowOnError(api.UpdateROCMProviderOptions(
 				&rocm_options, keys.data(), values.data(), keys.size()));
 		}
 		try {
 			options.AppendExecutionProvider_ROCM(rocm_options);
 		} catch (const std::exception &e) {
 			fprintf(stderr, "ROCM unavailable: %s\n", e.what());
 		}
 	}
 	// The CPU provider is the default fallback, if everything else fails.
 }
 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 // Formats a tensor shape as dimensions separated by " x ".
 // Dimensions with a negative size are represented
 // by their symbolic names instead of numbers.
 static std::string
 print_shape(const Ort::ConstTensorTypeAndShapeInfo &info)
 {
 	std::vector<const char *> symbols(info.GetDimensionsCount());
 	info.GetSymbolicDimensions(symbols.data(), symbols.size());
 	const auto dimensions = info.GetShape();
 	std::string text;
 	for (size_t i = 0; i < dimensions.size(); i++) {
 		if (i > 0)
 			text += " x ";
 		if (dimensions[i] < 0)
 			text += symbols.at(i);
 		else
 			text += std::to_string(dimensions[i]);
 	}
 	return text;
 }
 // Prints the names and shapes of all inputs and outputs of the model
 // loaded in @a session to standard output, one per line.
 static void
 print_shapes(const Ort::Session &session)
 {
 	Ort::AllocatorWithDefaultOptions allocator;
 	const size_t inputs = session.GetInputCount();
 	for (size_t i = 0; i < inputs; i++) {
 		auto name = session.GetInputNameAllocated(i, allocator);
 		auto info = session.GetInputTypeInfo(i);
 		const std::string shape = print_shape(info.GetTensorTypeAndShapeInfo());
 		printf("Input: %s: %s\n", name.get(), shape.c_str());
 	}
 	const size_t outputs = session.GetOutputCount();
 	for (size_t i = 0; i < outputs; i++) {
 		auto name = session.GetOutputNameAllocated(i, allocator);
 		auto info = session.GetOutputTypeInfo(i);
 		const std::string shape = print_shape(info.GetTensorTypeAndShapeInfo());
 		printf("Output: %s: %s\n", name.get(), shape.c_str());
 	}
 }
 // Tags all @a images using the model described by the file at @a path.
 //
 // The model itself is loaded from the file of the same name
 // with an "onnx" extension.  It needs to have exactly one input,
 // a four-dimensional tensor with three channels and a batch size
 // that is either 1 or variable, and exactly one output.
 // Violations of these requirements terminate the program with a failure.
 //
 // Images that fail to load are reported on standard error and skipped.
 // The remaining ones are passed to run() in groups of up to g.batch.
 //
 // Configuration and ONNX Runtime errors propagate as exceptions.
 static void
 infer(Ort::Env &env, const char *path, const std::vector<std::string> &images)
 {
 	Config config;
 	read_config(config, path);
 	Ort::SessionOptions session_options;
 	add_providers(session_options);
 	Ort::Session session = Ort::Session(env,
 		std::filesystem::path(path).replace_extension("onnx").c_str(),
 		session_options);
 	if (g.debug)
 		print_shapes(session);
 	if (session.GetInputCount() != 1 || session.GetOutputCount() != 1) {
 		fprintf(stderr, "Invalid input or output shape\n");
 		exit(EXIT_FAILURE);
 	}
 	auto input_info = session.GetInputTypeInfo(0);
 	auto shape = input_info.GetTensorTypeAndShapeInfo().GetShape();
 	if (shape.size() != 4) {
 		fprintf(stderr, "Incompatible input tensor format\n");
 		exit(EXIT_FAILURE);
 	}
 	// Negative dimensions are variable.
 	if (shape.at(0) > 1) {
 		fprintf(stderr, "Fixed batching not supported\n");
 		exit(EXIT_FAILURE);
 	}
 	if (shape.at(0) >= 0 && g.batch > 1) {
 		fprintf(stderr, "Requested batching for a non-batching model\n");
 		exit(EXIT_FAILURE);
 	}
 	// Locate the individual dimensions within the shape, by tensor layout.
 	int64_t *height = {}, *width = {}, *channels = {};
 	switch (config.shape) {
 	case Config::Shape::NCHW:
 		channels = &shape[1];
 		height = &shape[2];
 		width = &shape[3];
 		break;
 	case Config::Shape::NHWC:
 		height = &shape[1];
 		width = &shape[2];
 		channels = &shape[3];
 		break;
 	}
 	// Variable dimensions don't combine well with batches.
 	if (*height < 0)
 		*height = config.size;
 	if (*width < 0)
 		*width = config.size;
 	if (*channels != 3 || *height < 1 || *width < 1) {
 		// Fail like all the checks above, rather than return successfully.
 		fprintf(stderr, "Incompatible input tensor format\n");
 		exit(EXIT_FAILURE);
 	}
 	// TODO: Image loading is heavily parallelizable. In theory.
 	std::vector<Magick::Image> batch;
 	for (const auto &filename : images) {
 		Magick::Image image;
 		try {
 			image = load(filename, config, *width, *height);
 		} catch (const std::exception &e) {
 			fprintf(stderr, "%s: %s\n", filename.c_str(), e.what());
 			continue;
 		}
 		if (*height != image.rows() || *width != image.columns()) {
 			fprintf(stderr, "%s: %s\n", filename.c_str(), "tensor mismatch");
 			continue;
 		}
 		batch.push_back(image);
 		if (batch.size() == g.batch) {
 			run(batch, config, session, shape);
 			batch.clear();
 		}
 	}
 	// Process what didn't add up to a full batch.
 	if (!batch.empty())
 		run(batch, config, session, shape);
 }
 // Parses the command line into the global settings,
 // collects image paths either from the remaining arguments
 // or, with --pipe, from lines of the standard input,
 // and tags them all using the model given as the first non-option argument.
 //
 // Returns zero on success, or when asked for help, and non-zero on failure.
 int
 main(int argc, char *argv[])
 {
 	auto invocation_name = argv[0];
 	auto print_usage = [=] {
 		fprintf(stderr,
 			"Usage: %s [-b BATCH] [--cpu] [-d] [-o EP;KEY=VALUE...] "
 			"[-t THRESHOLD] MODEL { --pipe | [IMAGE...] }\n", invocation_name);
 	};
 	static option opts[] = {
 		{"batch", required_argument, 0, 'b'},
 		{"cpu", no_argument, 0, 'c'},
 		{"debug", no_argument, 0, 'd'},
 		{"help", no_argument, 0, 'h'},
 		{"options", required_argument, 0, 'o'},
 		{"pipe", no_argument, 0, 'p'},
 		{"threshold", required_argument, 0, 't'},
 		{nullptr, 0, 0, 0},
 	};
 	bool pipe = false;
 	while (1) {
 		int option_index = 0;
 		auto c = getopt_long(argc, const_cast<char *const *>(argv),
 			"b:cdho:pt:", opts, &option_index);
 		if (c == -1)
 			break;
 		char *end = nullptr;
 		switch (c) {
 		case 'b':
 			// An unmoved end pointer means there were no digits at all.
 			errno = 0, g.batch = strtol(optarg, &end, 10);
 			if (errno || end == optarg || *end ||
 				g.batch < 1 || g.batch > SHRT_MAX) {
 				fprintf(stderr, "Batch size must be a positive number\n");
 				exit(EXIT_FAILURE);
 			}
 			break;
 		case 'c':
 			g.cpu = true;
 			break;
 		case 'd':
 			g.debug++;
 			break;
 		case 'h':
 			print_usage();
 			return 0;
 		case 'o':
 			parse_options(optarg);
 			break;
 		case 'p':
 			pipe = true;
 			break;
 		case 't':
 			// Likewise, don't take an empty argument for a zero.
 			errno = 0, g.threshold = strtod(optarg, &end);
 			if (errno || end == optarg || *end ||
 				!std::isfinite(g.threshold) ||
 				g.threshold < 0 || g.threshold > 1) {
 				fprintf(stderr, "Threshold must be a number within 0..1\n");
 				exit(EXIT_FAILURE);
 			}
 			break;
 		default:
 			print_usage();
 			return 1;
 		}
 	}
 	// From here on, argv[0] is the model, followed by any images.
 	argv += optind;
 	argc -= optind;
 	// TODO: There's actually no need to slurp all the lines up front.
 	std::vector<std::string> paths;
 	if (pipe) {
 		if (argc != 1) {
 			print_usage();
 			return 1;
 		}
 		std::string line;
 		while (std::getline(std::cin, line))
 			paths.push_back(line);
 	} else {
 		if (argc < 1) {
 			print_usage();
 			return 1;
 		}
 		paths.assign(argv + 1, argv + argc);
 	}
 	// XXX: GraphicsMagick initializes signal handlers here,
 	// one needs to use MagickLib::InitializeMagickEx()
 	// with MAGICK_OPT_NO_SIGNAL_HANDER to prevent that.
 	//
 	// ImageMagick conveniently has the opposite default.
 	//
 	// Once processing images in parallel, consider presetting
 	// OMP_NUM_THREADS=1 (GM) and/or MAGICK_THREAD_LIMIT=1 (IM).
 	Magick::InitializeMagick(nullptr);
 	OrtLoggingLevel logging = g.debug > 1
 		? ORT_LOGGING_LEVEL_VERBOSE
 		: ORT_LOGGING_LEVEL_WARNING;
 	// Unreadable configuration, broken models, and so on, are signalled
 	// through exceptions: report them, rather than abort the process.
 	try {
 		// Creating an environment before initializing providers
 		// in order to avoid:
 		// "Attempt to use DefaultLogger but none has been registered."
 		Ort::Env env(logging, invocation_name);
 		infer(env, argv[0], paths);
 	} catch (const std::exception &e) {
 		fprintf(stderr, "%s: %s\n", invocation_name, e.what());
 		return 1;
 	}
 	return 0;
 }
--- a/deeptagger/download.sh
+++ b/deeptagger/download.sh
@@ -0,0 +1,161 @@
 #!/bin/sh -e
 # Requirements: Python ~ 3.11, curl, unzip, git-lfs, awk
 #
 # This script downloads a bunch of models into the models/ directory,
 # after any necessary transformations to run them using the deeptagger binary.
 #
 # Once it succeeds, feel free to remove everything but *.{model,tags,onnx}
 git lfs install
 # Everything below happens within the models/ directory.
 mkdir -p models
 cd models
 # Create a virtual environment for model conversion.
 #
 # If any of the Python stuff fails,
 # retry from within a Conda environment with a different version of Python.
 export VIRTUAL_ENV=$(pwd)/venv
 export TF_ENABLE_ONEDNN_OPTS=0
 # The "ready" marker file is only created once all packages have installed,
 # so that an interrupted set-up gets retried on the next run.
 if ! [ -f "$VIRTUAL_ENV/ready" ]
 then
 	python3 -m venv "$VIRTUAL_ENV"
 	#"$VIRTUAL_ENV/bin/pip3" install tensorflow[and-cuda]
 	"$VIRTUAL_ENV/bin/pip3" install tf2onnx 'deepdanbooru[tensorflow]'
 	touch "$VIRTUAL_ENV/ready"
 fi
 # status TEXT...
 # Prints a progress message in bold, prefixed with two dashes.
 status() {
 	echo "$(tput bold)-- $*$(tput sgr0)"
 }
 # Using the deepdanbooru package makes it possible to use other models
 # trained with the project.
 #
 # deepdanbooru NAME URL
 # Downloads a DeepDanbooru project archive from URL, converts its model
 # to ONNX, and writes the accompanying .tags and .model files,
 # all named after the archive.  Steps whose results already exist
 # are skipped, except that the .model file is always rewritten.
 deepdanbooru() {
 	local name=$1 url=$2
 	status "$name"
 	local basename=$(basename "$url")
 	if ! [ -e "$basename" ]
 	then curl -LO "$url"
 	fi
 	# Strip everything starting with the first period in the archive name.
 	local modelname=${basename%%.*}
 	if ! [ -d "$modelname" ]
 	then unzip -d "$modelname" "$basename"
 	fi
 	if ! [ -e "$modelname.tags" ]
 	then ln "$modelname/tags.txt" "$modelname.tags"
 	fi
 	# Export the project's model in a form that tf2onnx can convert.
 	if ! [ -d "$modelname.saved" ]
 	then "$VIRTUAL_ENV/bin/python3" - "$modelname" "$modelname.saved" <<-'END'
 		import sys
 		import deepdanbooru.project as ddp
 		model = ddp.load_model_from_project(
 			project_path=sys.argv[1], compile_model=False)
 		model.export(sys.argv[2])
 	END
 	fi
 	if ! [ -e "$modelname.onnx" ]
 	then "$VIRTUAL_ENV/bin/python3" -m tf2onnx.convert \
 		--saved-model "$modelname.saved" --output "$modelname.onnx"
 	fi
 	# Describe the preprocessing for the deeptagger binary.
 	cat > "$modelname.model" <<-END
 		name=$name
 		shape=nhwc
 		channels=rgb
 		normalize=true
 		pad=edge
 	END
 }
 # ONNX preconversions don't have a symbolic first dimension, thus doing our own.
 #
 # wd14 NAME REPOSITORY
 # Clones a Hugging Face repository, and makes its model available twice:
 # as the original ONNX export, and as our own conversion, prefixed "batch-".
 # Both variants share the same .tags and .model files through hard links.
 wd14() {
 	local name=$1 repository=$2
 	status "$name"
 	local modelname=$(basename "$repository")
 	if ! [ -d "$modelname" ]
 	then git clone "https://huggingface.co/$repository"
 	fi
 	# Though link the original export as well.
 	if ! [ -e "$modelname.onnx" ]
 	then ln "$modelname/model.onnx" "$modelname.onnx"
 	fi
 	# Tags are in the second column of the CSV, following a header line.
 	if ! [ -e "$modelname.tags" ]
 	then awk -F, 'NR > 1 { print $2 }' "$modelname/selected_tags.csv" \
 		> "$modelname.tags"
 	fi
 	# Describe the preprocessing for the deeptagger binary.
 	cat > "$modelname.model" <<-END
 		name=$name
 		shape=nhwc
 		channels=bgr
 		normalize=false
 		pad=white
 	END
 	# The cloned repository directory is passed to tf2onnx as a saved model.
 	if ! [ -e "batch-$modelname.onnx" ]
 	then "$VIRTUAL_ENV/bin/python3" -m tf2onnx.convert \
 		--saved-model "$modelname" --output "batch-$modelname.onnx"
 	fi
 	if ! [ -e "batch-$modelname.tags" ]
 	then ln "$modelname.tags" "batch-$modelname.tags"
 	fi
 	if ! [ -e "batch-$modelname.model" ]
 	then ln "$modelname.model" "batch-$modelname.model"
 	fi
 }
 # These models are an undocumented mess, thus using ONNX preconversions.
 #
 # mldanbooru NAME BASENAME
 # Clones the shared repository of ONNX preconversions, unless present,
 # hard-links the model file BASENAME from it into the current directory,
 # and writes the accompanying .tags and .model files.
 mldanbooru() {
 	local name=$1 basename=$2
 	status "$name"
 	if ! [ -d ml-danbooru-onnx ]
 	then git clone https://huggingface.co/deepghs/ml-danbooru-onnx
 	fi
 	# Strip everything starting with the first period in the file name.
 	local modelname=${basename%%.*}
 	if ! [ -e "$basename" ]
 	then ln "ml-danbooru-onnx/$basename"
 	fi
 	# Tags are in the first column of the CSV, following a header line.
 	if ! [ -e "$modelname.tags" ]
 	then awk -F, 'NR > 1 { print $1 }' ml-danbooru-onnx/tags.csv \
 		> "$modelname.tags"
 	fi
 	# Describe the preprocessing for the deeptagger binary.
 	cat > "$modelname.model" <<-END
 		name=$name
 		shape=nchw
 		channels=rgb
 		normalize=true
 		pad=stretch
 		size=640
 		interpret=sigmoid
 	END
 }
 # The list of models to prepare; the commented-out ones are not downloaded.
 status "Downloading models, beware that git-lfs doesn't indicate progress"
 deepdanbooru DeepDanbooru \
 	'https://github.com/KichangKim/DeepDanbooru/releases/download/v3-20211112-sgd-e28/deepdanbooru-v3-20211112-sgd-e28.zip'
 #wd14 'WD v1.4 ViT v1'        'SmilingWolf/wd-v1-4-vit-tagger'
 wd14 'WD v1.4 ViT v2'        'SmilingWolf/wd-v1-4-vit-tagger-v2'
 #wd14 'WD v1.4 ConvNeXT v1'   'SmilingWolf/wd-v1-4-convnext-tagger'
 wd14 'WD v1.4 ConvNeXT v2'   'SmilingWolf/wd-v1-4-convnext-tagger-v2'
 wd14 'WD v1.4 ConvNeXTV2 v2' 'SmilingWolf/wd-v1-4-convnextv2-tagger-v2'
 wd14 'WD v1.4 SwinV2 v2'     'SmilingWolf/wd-v1-4-swinv2-tagger-v2'
 wd14 'WD v1.4 MOAT v2'       'SmilingWolf/wd-v1-4-moat-tagger-v2'
 # As suggested by author https://github.com/IrisRainbowNeko/ML-Danbooru-webui
 mldanbooru 'ML-Danbooru Caformer dec-5-97527' 'ml_caformer_m36_dec-5-97527.onnx'
 mldanbooru 'ML-Danbooru TResNet-D 6-30000' 'TResnet-D-FLq_ema_6-30000.onnx'