Compare commits
No commits in common. "77e988365d1c3a5ee67301c99c8992fbed7ce7bd" and "054078908a1e4c7429ea0f5a3a0605addfccc46c" have entirely different histories.
77e988365d
...
054078908a
2
LICENSE
2
LICENSE
@ -1,4 +1,4 @@
|
|||||||
Copyright (c) 2023 - 2024, Přemysl Eric Janouch <p@janouch.name>
|
Copyright (c) 2023, Přemysl Eric Janouch <p@janouch.name>
|
||||||
|
|
||||||
Permission to use, copy, modify, and/or distribute this software for any
|
Permission to use, copy, modify, and/or distribute this software for any
|
||||||
purpose with or without fee is hereby granted.
|
purpose with or without fee is hereby granted.
|
||||||
|
@ -1,20 +0,0 @@
|
|||||||
# Ubuntu 20.04 LTS
|
|
||||||
cmake_minimum_required (VERSION 3.16)
|
|
||||||
project (deeptagger VERSION 0.0.1 LANGUAGES CXX)
|
|
||||||
|
|
||||||
# Hint: set ONNXRuntime_ROOT to a directory with a pre-built GitHub release.
|
|
||||||
# (Useful for development, otherwise you may need to adjust the rpath.)
|
|
||||||
set (CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}")
|
|
||||||
|
|
||||||
find_package (ONNXRuntime REQUIRED)
|
|
||||||
find_package (PkgConfig REQUIRED)
|
|
||||||
pkg_check_modules (GM REQUIRED GraphicsMagick++)
|
|
||||||
|
|
||||||
add_executable (deeptagger deeptagger.cpp)
|
|
||||||
target_compile_features (deeptagger PRIVATE cxx_std_17)
|
|
||||||
target_include_directories (deeptagger PRIVATE
|
|
||||||
${GM_INCLUDE_DIRS} ${ONNXRuntime_INCLUDE_DIRS})
|
|
||||||
target_link_directories (deeptagger PRIVATE
|
|
||||||
${GM_LIBRARY_DIRS})
|
|
||||||
target_link_libraries (deeptagger PRIVATE
|
|
||||||
${GM_LIBRARIES} ${ONNXRuntime_LIBRARIES})
|
|
@ -1,11 +0,0 @@
|
|||||||
# Public Domain
|
|
||||||
|
|
||||||
find_path (ONNXRuntime_INCLUDE_DIRS onnxruntime_c_api.h
|
|
||||||
PATH_SUFFIXES onnxruntime)
|
|
||||||
find_library (ONNXRuntime_LIBRARIES NAMES onnxruntime)
|
|
||||||
|
|
||||||
include (FindPackageHandleStandardArgs)
|
|
||||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS (ONNXRuntime DEFAULT_MSG
|
|
||||||
ONNXRuntime_INCLUDE_DIRS ONNXRuntime_LIBRARIES)
|
|
||||||
|
|
||||||
mark_as_advanced (ONNXRuntime_LIBRARIES ONNXRuntime_INCLUDE_DIRS)
|
|
@ -1,130 +0,0 @@
|
|||||||
deeptagger
|
|
||||||
==========
|
|
||||||
|
|
||||||
This is an automatic image tagger/classifier written in C++,
|
|
||||||
primarily targeting various anime models.
|
|
||||||
|
|
||||||
Unfortunately, you will still need Python 3, as well as some luck, to prepare
|
|
||||||
the models, achieved by running download.sh. You will need about 20 gigabytes
|
|
||||||
of space for this operation.
|
|
||||||
|
|
||||||
"WaifuDiffusion v1.4" models are officially distributed with ONNX model exports
|
|
||||||
that do not support symbolic batch sizes. The script attempts to fix this
|
|
||||||
by running custom exports.
|
|
||||||
|
|
||||||
You're invited to change things to suit your particular needs.
|
|
||||||
|
|
||||||
Getting it to work
|
|
||||||
------------------
|
|
||||||
To build the evaluator, install a C++ compiler, CMake, and development packages
|
|
||||||
of GraphicsMagick and ONNX Runtime.
|
|
||||||
|
|
||||||
Prebuilt ONNX Runtime can be most conveniently downloaded from
|
|
||||||
https://github.com/microsoft/onnxruntime/releases[GitHub releases].
|
|
||||||
Remember to also install CUDA packages, such as _nvidia-cudnn_ on Debian,
|
|
||||||
if you plan on using the GPU-enabled options.
|
|
||||||
|
|
||||||
$ cmake -DONNXRuntime_ROOT=/path/to/onnxruntime -B build
|
|
||||||
$ cmake --build build
|
|
||||||
$ ./download.sh
|
|
||||||
$ build/deeptagger models/deepdanbooru-v3-20211112-sgd-e28.model image.jpg
|
|
||||||
|
|
||||||
Very little effort is made to make the project compatible with non-POSIX
|
|
||||||
systems.
|
|
||||||
|
|
||||||
Options
|
|
||||||
-------
|
|
||||||
--batch 1::
|
|
||||||
This program makes use of batches by decoding and preparing multiple images
|
|
||||||
in parallel before sending them off to models.
|
|
||||||
Batching requires appropriate models.
|
|
||||||
--cpu::
|
|
||||||
Force CPU inference, which is usually extremely slow.
|
|
||||||
--debug::
|
|
||||||
Increase verbosity.
|
|
||||||
--options "CUDAExecutionProvider;device_id=0"::
|
|
||||||
Set various ONNX Runtime execution provider options.
|
|
||||||
--pipe::
|
|
||||||
Take input filenames from the standard input.
|
|
||||||
--threshold 0.1::
|
|
||||||
Output weight threshold. Needs to be set very high on ML-Danbooru models.
|
|
||||||
|
|
||||||
Model benchmarks
|
|
||||||
----------------
|
|
||||||
These were measured on a machine with GeForce RTX 4090 (24G),
|
|
||||||
and Ryzen 9 7950X3D (32 threads), on a sample of 704 images,
|
|
||||||
which took over eight hours.
|
|
||||||
|
|
||||||
There is room for further performance tuning.
|
|
||||||
|
|
||||||
GPU inference
|
|
||||||
~~~~~~~~~~~~~
|
|
||||||
[cols="<,>,>", options=header]
|
|
||||||
|===
|
|
||||||
|Model|Batch size|Time
|
|
||||||
|ML-Danbooru Caformer dec-5-97527|16|OOM
|
|
||||||
|WD v1.4 ViT v2 (batch)|16|19 s
|
|
||||||
|DeepDanbooru|16|21 s
|
|
||||||
|WD v1.4 SwinV2 v2 (batch)|16|21 s
|
|
||||||
|WD v1.4 ViT v2 (batch)|4|27 s
|
|
||||||
|WD v1.4 SwinV2 v2 (batch)|4|30 s
|
|
||||||
|DeepDanbooru|4|31 s
|
|
||||||
|ML-Danbooru TResNet-D 6-30000|16|31 s
|
|
||||||
|WD v1.4 MOAT v2 (batch)|16|31 s
|
|
||||||
|WD v1.4 ConvNeXT v2 (batch)|16|32 s
|
|
||||||
|WD v1.4 ConvNeXTV2 v2 (batch)|16|36 s
|
|
||||||
|ML-Danbooru TResNet-D 6-30000|4|39 s
|
|
||||||
|WD v1.4 ConvNeXT v2 (batch)|4|39 s
|
|
||||||
|WD v1.4 MOAT v2 (batch)|4|39 s
|
|
||||||
|WD v1.4 ConvNeXTV2 v2 (batch)|4|43 s
|
|
||||||
|WD v1.4 ViT v2|1|43 s
|
|
||||||
|WD v1.4 ViT v2 (batch)|1|43 s
|
|
||||||
|ML-Danbooru Caformer dec-5-97527|4|48 s
|
|
||||||
|DeepDanbooru|1|53 s
|
|
||||||
|WD v1.4 MOAT v2|1|53 s
|
|
||||||
|WD v1.4 ConvNeXT v2|1|54 s
|
|
||||||
|WD v1.4 MOAT v2 (batch)|1|54 s
|
|
||||||
|WD v1.4 SwinV2 v2|1|54 s
|
|
||||||
|WD v1.4 SwinV2 v2 (batch)|1|54 s
|
|
||||||
|WD v1.4 ConvNeXT v2 (batch)|1|56 s
|
|
||||||
|WD v1.4 ConvNeXTV2 v2|1|56 s
|
|
||||||
|ML-Danbooru TResNet-D 6-30000|1|58 s
|
|
||||||
|WD v1.4 ConvNeXTV2 v2 (batch)|1|58 s
|
|
||||||
|ML-Danbooru Caformer dec-5-97527|1|73 s
|
|
||||||
|===
|
|
||||||
|
|
||||||
CPU inference
|
|
||||||
~~~~~~~~~~~~~
|
|
||||||
[cols="<,>,>", options=header]
|
|
||||||
|===
|
|
||||||
|Model|Batch size|Time
|
|
||||||
|DeepDanbooru|16|45 s
|
|
||||||
|DeepDanbooru|4|54 s
|
|
||||||
|DeepDanbooru|1|88 s
|
|
||||||
|ML-Danbooru TResNet-D 6-30000|4|139 s
|
|
||||||
|ML-Danbooru TResNet-D 6-30000|16|162 s
|
|
||||||
|ML-Danbooru TResNet-D 6-30000|1|167 s
|
|
||||||
|WD v1.4 ConvNeXT v2|1|208 s
|
|
||||||
|WD v1.4 ConvNeXT v2 (batch)|4|226 s
|
|
||||||
|WD v1.4 ConvNeXT v2 (batch)|16|238 s
|
|
||||||
|WD v1.4 ConvNeXTV2 v2|1|245 s
|
|
||||||
|WD v1.4 ConvNeXTV2 v2 (batch)|4|268 s
|
|
||||||
|WD v1.4 ViT v2 (batch)|16|270 s
|
|
||||||
|WD v1.4 ConvNeXT v2 (batch)|1|272 s
|
|
||||||
|WD v1.4 SwinV2 v2 (batch)|4|277 s
|
|
||||||
|WD v1.4 ViT v2 (batch)|4|277 s
|
|
||||||
|WD v1.4 ConvNeXTV2 v2 (batch)|16|294 s
|
|
||||||
|WD v1.4 SwinV2 v2 (batch)|1|300 s
|
|
||||||
|WD v1.4 SwinV2 v2|1|302 s
|
|
||||||
|WD v1.4 SwinV2 v2 (batch)|16|305 s
|
|
||||||
|WD v1.4 MOAT v2 (batch)|4|307 s
|
|
||||||
|WD v1.4 ViT v2|1|308 s
|
|
||||||
|WD v1.4 ViT v2 (batch)|1|311 s
|
|
||||||
|WD v1.4 ConvNeXTV2 v2 (batch)|1|312 s
|
|
||||||
|WD v1.4 MOAT v2|1|332 s
|
|
||||||
|WD v1.4 MOAT v2 (batch)|16|335 s
|
|
||||||
|WD v1.4 MOAT v2 (batch)|1|339 s
|
|
||||||
|ML-Danbooru Caformer dec-5-97527|4|637 s
|
|
||||||
|ML-Danbooru Caformer dec-5-97527|16|689 s
|
|
||||||
|ML-Danbooru Caformer dec-5-97527|1|829 s
|
|
||||||
|===
|
|
@ -1,51 +0,0 @@
|
|||||||
#!/bin/sh -e
|
|
||||||
parse() {
|
|
||||||
awk 'BEGIN {
|
|
||||||
OFS = FS = "\t"
|
|
||||||
} {
|
|
||||||
name = $1
|
|
||||||
path = $2
|
|
||||||
cpu = $3 != ""
|
|
||||||
batch = $4
|
|
||||||
time = $5
|
|
||||||
|
|
||||||
if (path ~ "/batch-")
|
|
||||||
name = name " (batch)"
|
|
||||||
else if (name ~ /^WD / && batch > 1)
|
|
||||||
next
|
|
||||||
} {
|
|
||||||
group = name FS cpu FS batch
|
|
||||||
if (lastgroup != group) {
|
|
||||||
if (lastgroup)
|
|
||||||
print lastgroup, mintime
|
|
||||||
|
|
||||||
lastgroup = group
|
|
||||||
mintime = time
|
|
||||||
} else {
|
|
||||||
if (mintime > time)
|
|
||||||
mintime = time
|
|
||||||
}
|
|
||||||
} END {
|
|
||||||
print lastgroup, mintime
|
|
||||||
}' "${BENCH_LOG:-bench.out}"
|
|
||||||
}
|
|
||||||
|
|
||||||
cat <<END
|
|
||||||
GPU inference
|
|
||||||
~~~~~~~~~~~~~
|
|
||||||
[cols="<,>,>", options=header]
|
|
||||||
|===
|
|
||||||
|Model|Batch size|Time
|
|
||||||
$(parse | awk -F'\t' 'BEGIN { OFS = "|" }
|
|
||||||
!$2 { print "", $1, $3, $4 " s" }' | sort -t'|' -nk4)
|
|
||||||
|===
|
|
||||||
|
|
||||||
CPU inference
|
|
||||||
~~~~~~~~~~~~~
|
|
||||||
[cols="<,>,>", options=header]
|
|
||||||
|===
|
|
||||||
|Model|Batch size|Time
|
|
||||||
$(parse | awk -F'\t' 'BEGIN { OFS = "|" }
|
|
||||||
$2 { print "", $1, $3, $4 " s" }' | sort -t'|' -nk4)
|
|
||||||
|===
|
|
||||||
END
|
|
@ -1,38 +0,0 @@
|
|||||||
#!/bin/sh -e
|
|
||||||
if [ $# -lt 2 ] || ! [ -x "$1" ]
|
|
||||||
then
|
|
||||||
echo "Usage: $0 DEEPTAGGER FILE..."
|
|
||||||
echo "Run this after using download.sh, from the same directory."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
runner=$1
|
|
||||||
shift
|
|
||||||
log=bench.out
|
|
||||||
: >$log
|
|
||||||
|
|
||||||
run() {
|
|
||||||
opts=$1 batch=$2 model=$3
|
|
||||||
shift 3
|
|
||||||
|
|
||||||
for i in $(seq 1 3)
|
|
||||||
do
|
|
||||||
start=$(date +%s)
|
|
||||||
"$runner" $opts -b "$batch" -t 0.75 "$model" "$@" >/dev/null || :
|
|
||||||
end=$(date +%s)
|
|
||||||
printf '%s\t%s\t%s\t%s\t%s\n' \
|
|
||||||
"$name" "$model" "$opts" "$batch" "$((end - start))" | tee -a $log
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
for model in models/*.model
|
|
||||||
do
|
|
||||||
name=$(sed -n 's/^name=//p' "$model")
|
|
||||||
run "" 1 "$model" "$@"
|
|
||||||
run "" 4 "$model" "$@"
|
|
||||||
run "" 16 "$model" "$@"
|
|
||||||
|
|
||||||
run --cpu 1 "$model" "$@"
|
|
||||||
run --cpu 4 "$model" "$@"
|
|
||||||
run --cpu 16 "$model" "$@"
|
|
||||||
done
|
|
@ -1,744 +0,0 @@
|
|||||||
#include <getopt.h>
|
|
||||||
#include <Magick++.h>
|
|
||||||
#include <onnxruntime_cxx_api.h>
|
|
||||||
#ifdef __APPLE__
|
|
||||||
#include <coreml_provider_factory.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <condition_variable>
|
|
||||||
#include <filesystem>
|
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <mutex>
|
|
||||||
#include <queue>
|
|
||||||
#include <regex>
|
|
||||||
#include <set>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <string>
|
|
||||||
#include <thread>
|
|
||||||
#include <tuple>
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <climits>
|
|
||||||
|
|
||||||
static struct {
|
|
||||||
bool cpu = false;
|
|
||||||
int debug = 0;
|
|
||||||
long batch = 1;
|
|
||||||
float threshold = 0.1;
|
|
||||||
|
|
||||||
// Execution provider name → Key → Value
|
|
||||||
std::map<std::string, std::map<std::string, std::string>> options;
|
|
||||||
} g;
|
|
||||||
|
|
||||||
// --- Configuration -----------------------------------------------------------
|
|
||||||
|
|
||||||
// Arguably, input normalization could be incorporated into models instead.
|
|
||||||
struct Config {
|
|
||||||
std::string name;
|
|
||||||
enum class Shape {NHWC, NCHW} shape = Shape::NHWC;
|
|
||||||
enum class Channels {RGB, BGR} channels = Channels::RGB;
|
|
||||||
bool normalize = false;
|
|
||||||
enum class Pad {WHITE, EDGE, STRETCH} pad = Pad::WHITE;
|
|
||||||
int size = -1;
|
|
||||||
bool sigmoid = false;
|
|
||||||
|
|
||||||
std::vector<std::string> tags;
|
|
||||||
};
|
|
||||||
|
|
||||||
static void
|
|
||||||
read_tags(const std::string &path, std::vector<std::string> &tags)
|
|
||||||
{
|
|
||||||
std::ifstream f(path);
|
|
||||||
f.exceptions(std::ifstream::badbit);
|
|
||||||
if (!f)
|
|
||||||
throw std::runtime_error("cannot read tags");
|
|
||||||
|
|
||||||
std::string line;
|
|
||||||
while (std::getline(f, line)) {
|
|
||||||
if (!line.empty() && line.back() == '\r')
|
|
||||||
line.erase(line.size() - 1);
|
|
||||||
tags.push_back(line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
read_field(Config &config, std::string key, std::string value)
|
|
||||||
{
|
|
||||||
if (key == "name") {
|
|
||||||
config.name = value;
|
|
||||||
} else if (key == "shape") {
|
|
||||||
if (value == "nhwc") config.shape = Config::Shape::NHWC;
|
|
||||||
else if (value == "nchw") config.shape = Config::Shape::NCHW;
|
|
||||||
else throw std::invalid_argument("bad value for: " + key);
|
|
||||||
} else if (key == "channels") {
|
|
||||||
if (value == "rgb") config.channels = Config::Channels::RGB;
|
|
||||||
else if (value == "bgr") config.channels = Config::Channels::BGR;
|
|
||||||
else throw std::invalid_argument("bad value for: " + key);
|
|
||||||
} else if (key == "normalize") {
|
|
||||||
if (value == "true") config.normalize = true;
|
|
||||||
else if (value == "false") config.normalize = false;
|
|
||||||
else throw std::invalid_argument("bad value for: " + key);
|
|
||||||
} else if (key == "pad") {
|
|
||||||
if (value == "white") config.pad = Config::Pad::WHITE;
|
|
||||||
else if (value == "edge") config.pad = Config::Pad::EDGE;
|
|
||||||
else if (value == "stretch") config.pad = Config::Pad::STRETCH;
|
|
||||||
else throw std::invalid_argument("bad value for: " + key);
|
|
||||||
} else if (key == "size") {
|
|
||||||
config.size = std::stoi(value);
|
|
||||||
} else if (key == "interpret") {
|
|
||||||
if (value == "false") config.sigmoid = false;
|
|
||||||
else if (value == "sigmoid") config.sigmoid = true;
|
|
||||||
else throw std::invalid_argument("bad value for: " + key);
|
|
||||||
} else {
|
|
||||||
throw std::invalid_argument("unsupported config key: " + key);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
read_config(Config &config, const char *path)
|
|
||||||
{
|
|
||||||
std::ifstream f(path);
|
|
||||||
f.exceptions(std::ifstream::badbit);
|
|
||||||
if (!f)
|
|
||||||
throw std::runtime_error("cannot read configuration");
|
|
||||||
|
|
||||||
std::regex re(R"(^\s*([^#=]+?)\s*=\s*([^#]*?)\s*(?:#|$))",
|
|
||||||
std::regex::optimize);
|
|
||||||
std::smatch m;
|
|
||||||
|
|
||||||
std::string line;
|
|
||||||
while (std::getline(f, line)) {
|
|
||||||
if (std::regex_match(line, m, re))
|
|
||||||
read_field(config, m[1].str(), m[2].str());
|
|
||||||
}
|
|
||||||
|
|
||||||
read_tags(
|
|
||||||
std::filesystem::path(path).replace_extension("tags"), config.tags);
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Data preparation --------------------------------------------------------
|
|
||||||
|
|
||||||
static float *
|
|
||||||
image_to_nhwc(float *data, Magick::Image &image, Config::Channels channels)
|
|
||||||
{
|
|
||||||
unsigned int width = image.columns();
|
|
||||||
unsigned int height = image.rows();
|
|
||||||
|
|
||||||
auto pixels = image.getConstPixels(0, 0, width, height);
|
|
||||||
switch (channels) {
|
|
||||||
case Config::Channels::RGB:
|
|
||||||
for (unsigned int y = 0; y < height; y++) {
|
|
||||||
for (unsigned int x = 0; x < width; x++) {
|
|
||||||
auto pixel = *pixels++;
|
|
||||||
*data++ = ScaleQuantumToChar(pixel.red);
|
|
||||||
*data++ = ScaleQuantumToChar(pixel.green);
|
|
||||||
*data++ = ScaleQuantumToChar(pixel.blue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case Config::Channels::BGR:
|
|
||||||
for (unsigned int y = 0; y < height; y++) {
|
|
||||||
for (unsigned int x = 0; x < width; x++) {
|
|
||||||
auto pixel = *pixels++;
|
|
||||||
*data++ = ScaleQuantumToChar(pixel.blue);
|
|
||||||
*data++ = ScaleQuantumToChar(pixel.green);
|
|
||||||
*data++ = ScaleQuantumToChar(pixel.red);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
|
|
||||||
static float *
|
|
||||||
image_to_nchw(float *data, Magick::Image &image, Config::Channels channels)
|
|
||||||
{
|
|
||||||
unsigned int width = image.columns();
|
|
||||||
unsigned int height = image.rows();
|
|
||||||
|
|
||||||
auto pixels = image.getConstPixels(0, 0, width, height), pp = pixels;
|
|
||||||
switch (channels) {
|
|
||||||
case Config::Channels::RGB:
|
|
||||||
for (unsigned int y = 0; y < height; y++)
|
|
||||||
for (unsigned int x = 0; x < width; x++)
|
|
||||||
*data++ = ScaleQuantumToChar((*pp++).red);
|
|
||||||
pp = pixels;
|
|
||||||
for (unsigned int y = 0; y < height; y++)
|
|
||||||
for (unsigned int x = 0; x < width; x++)
|
|
||||||
*data++ = ScaleQuantumToChar((*pp++).green);
|
|
||||||
pp = pixels;
|
|
||||||
for (unsigned int y = 0; y < height; y++)
|
|
||||||
for (unsigned int x = 0; x < width; x++)
|
|
||||||
*data++ = ScaleQuantumToChar((*pp++).blue);
|
|
||||||
break;
|
|
||||||
case Config::Channels::BGR:
|
|
||||||
for (unsigned int y = 0; y < height; y++)
|
|
||||||
for (unsigned int x = 0; x < width; x++)
|
|
||||||
*data++ = ScaleQuantumToChar((*pp++).blue);
|
|
||||||
pp = pixels;
|
|
||||||
for (unsigned int y = 0; y < height; y++)
|
|
||||||
for (unsigned int x = 0; x < width; x++)
|
|
||||||
*data++ = ScaleQuantumToChar((*pp++).green);
|
|
||||||
pp = pixels;
|
|
||||||
for (unsigned int y = 0; y < height; y++)
|
|
||||||
for (unsigned int x = 0; x < width; x++)
|
|
||||||
*data++ = ScaleQuantumToChar((*pp++).red);
|
|
||||||
}
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
|
|
||||||
static Magick::Image
|
|
||||||
load(const std::string filename,
|
|
||||||
const Config &config, int64_t width, int64_t height)
|
|
||||||
{
|
|
||||||
Magick::Image image;
|
|
||||||
try {
|
|
||||||
image.read(filename);
|
|
||||||
} catch (const Magick::Warning &warning) {
|
|
||||||
if (g.debug)
|
|
||||||
fprintf(stderr, "%s: %s\n", filename.c_str(), warning.what());
|
|
||||||
}
|
|
||||||
|
|
||||||
image.autoOrient();
|
|
||||||
|
|
||||||
Magick::Geometry adjusted(width, height);
|
|
||||||
switch (config.pad) {
|
|
||||||
case Config::Pad::EDGE:
|
|
||||||
case Config::Pad::WHITE:
|
|
||||||
adjusted.greater(true);
|
|
||||||
break;
|
|
||||||
case Config::Pad::STRETCH:
|
|
||||||
adjusted.aspect(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
image.resize(adjusted, Magick::LanczosFilter);
|
|
||||||
|
|
||||||
// The GraphicsMagick API doesn't offer any good options.
|
|
||||||
if (config.pad == Config::Pad::EDGE) {
|
|
||||||
MagickLib::SetImageVirtualPixelMethod(
|
|
||||||
image.image(), MagickLib::EdgeVirtualPixelMethod);
|
|
||||||
|
|
||||||
auto x = (int64_t(image.columns()) - width) / 2;
|
|
||||||
auto y = (int64_t(image.rows()) - height) / 2;
|
|
||||||
auto source = image.getConstPixels(x, y, width, height);
|
|
||||||
std::vector<MagickLib::PixelPacket>
|
|
||||||
pixels(source, source + width * height);
|
|
||||||
|
|
||||||
Magick::Image edged(Magick::Geometry(width, height), "black");
|
|
||||||
edged.classType(Magick::DirectClass);
|
|
||||||
auto target = edged.setPixels(0, 0, width, height);
|
|
||||||
memcpy(target, pixels.data(), pixels.size() * sizeof pixels[0]);
|
|
||||||
edged.syncPixels();
|
|
||||||
|
|
||||||
image = edged;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Center it in a square patch of white, removing any transparency.
|
|
||||||
// image.extent() could probably be used to do the same thing.
|
|
||||||
Magick::Image white(Magick::Geometry(width, height), "white");
|
|
||||||
auto x = (white.columns() - image.columns()) / 2;
|
|
||||||
auto y = (white.rows() - image.rows()) / 2;
|
|
||||||
white.composite(image, x, y, Magick::OverCompositeOp);
|
|
||||||
white.fileName(filename);
|
|
||||||
|
|
||||||
if (g.debug > 2)
|
|
||||||
white.display();
|
|
||||||
|
|
||||||
return white;
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Inference ---------------------------------------------------------------
|
|
||||||
|
|
||||||
static void
|
|
||||||
run(std::vector<Magick::Image> &images, const Config &config,
|
|
||||||
Ort::Session &session, std::vector<int64_t> shape)
|
|
||||||
{
|
|
||||||
// For consistency, this value may be bumped to always be g.batch,
|
|
||||||
// but it does not seem to have an effect on anything.
|
|
||||||
shape[0] = images.size();
|
|
||||||
|
|
||||||
Ort::AllocatorWithDefaultOptions allocator;
|
|
||||||
auto tensor = Ort::Value::CreateTensor<float>(
|
|
||||||
allocator, shape.data(), shape.size());
|
|
||||||
|
|
||||||
auto input_len = tensor.GetTensorTypeAndShapeInfo().GetElementCount();
|
|
||||||
auto input_data = tensor.GetTensorMutableData<float>(), pi = input_data;
|
|
||||||
for (int64_t i = 0; i < images.size(); i++) {
|
|
||||||
switch (config.shape) {
|
|
||||||
case Config::Shape::NCHW:
|
|
||||||
pi = image_to_nchw(pi, images.at(i), config.channels);
|
|
||||||
break;
|
|
||||||
case Config::Shape::NHWC:
|
|
||||||
pi = image_to_nhwc(pi, images.at(i), config.channels);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (config.normalize) {
|
|
||||||
pi = input_data;
|
|
||||||
for (size_t i = 0; i < input_len; i++)
|
|
||||||
*pi++ /= 255.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string input_name =
|
|
||||||
session.GetInputNameAllocated(0, allocator).get();
|
|
||||||
std::string output_name =
|
|
||||||
session.GetOutputNameAllocated(0, allocator).get();
|
|
||||||
|
|
||||||
std::vector<const char *> input_names = {input_name.c_str()};
|
|
||||||
std::vector<const char *> output_names = {output_name.c_str()};
|
|
||||||
|
|
||||||
auto outputs = session.Run(Ort::RunOptions{},
|
|
||||||
input_names.data(), &tensor, input_names.size(),
|
|
||||||
output_names.data(), output_names.size());
|
|
||||||
if (outputs.size() != 1 || !outputs[0].IsTensor()) {
|
|
||||||
fprintf(stderr, "Wrong output\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto output_len = outputs[0].GetTensorTypeAndShapeInfo().GetElementCount();
|
|
||||||
auto output_data = outputs.front().GetTensorData<float>(), po = output_data;
|
|
||||||
if (output_len != shape[0] * config.tags.size()) {
|
|
||||||
fprintf(stderr, "Tags don't match the output\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 0; i < images.size(); i++) {
|
|
||||||
for (size_t t = 0; t < config.tags.size(); t++) {
|
|
||||||
float value = *po++;
|
|
||||||
if (config.sigmoid)
|
|
||||||
value = 1 / (1 + std::exp(-value));
|
|
||||||
if (value > g.threshold) {
|
|
||||||
printf("%s\t%.2f\t%s\n", images.at(i).fileName().c_str(),
|
|
||||||
value, config.tags.at(t).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
||||||
|
|
||||||
static void
|
|
||||||
parse_options(const std::string &options)
|
|
||||||
{
|
|
||||||
auto semicolon = options.find(";");
|
|
||||||
auto name = options.substr(0, semicolon);
|
|
||||||
auto sequence = options.substr(semicolon);
|
|
||||||
|
|
||||||
std::map<std::string, std::string> kv;
|
|
||||||
std::regex re(R"(;*([^;=]+)=([^;=]+))", std::regex::optimize);
|
|
||||||
std::sregex_iterator it(sequence.begin(), sequence.end(), re), end;
|
|
||||||
for (; it != end; ++it)
|
|
||||||
kv[it->str(1)] = it->str(2);
|
|
||||||
g.options.insert_or_assign(name, std::move(kv));
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::tuple<std::vector<const char *>, std::vector<const char *>>
|
|
||||||
unpack_options(const std::string &provider)
|
|
||||||
{
|
|
||||||
std::vector<const char *> keys, values;
|
|
||||||
if (g.options.count(provider)) {
|
|
||||||
for (const auto &kv : g.options.at(provider)) {
|
|
||||||
keys.push_back(kv.first.c_str());
|
|
||||||
values.push_back(kv.second.c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return {keys, values};
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
add_providers(Ort::SessionOptions &options)
|
|
||||||
{
|
|
||||||
auto api = Ort::GetApi();
|
|
||||||
auto v_providers = Ort::GetAvailableProviders();
|
|
||||||
std::set<std::string> providers(v_providers.begin(), v_providers.end());
|
|
||||||
|
|
||||||
if (g.debug) {
|
|
||||||
printf("Providers:");
|
|
||||||
for (const auto &it : providers)
|
|
||||||
printf(" %s", it.c_str());
|
|
||||||
printf("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
// There is a string-based AppendExecutionProvider() method,
|
|
||||||
// but it cannot be used with all providers.
|
|
||||||
// TODO: Make it possible to disable providers.
|
|
||||||
// TODO: Providers will deserve some performance tuning.
|
|
||||||
|
|
||||||
if (g.cpu)
|
|
||||||
return;
|
|
||||||
|
|
||||||
#ifdef __APPLE__
|
|
||||||
if (providers.count("CoreMLExecutionProvider")) {
|
|
||||||
try {
|
|
||||||
Ort::ThrowOnError(
|
|
||||||
OrtSessionOptionsAppendExecutionProvider_CoreML(options, 0));
|
|
||||||
} catch (const std::exception &e) {
|
|
||||||
fprintf(stderr, "CoreML unavailable: %s\n", e.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if TENSORRT
|
|
||||||
// TensorRT should be the more performant execution provider, however:
|
|
||||||
// - it is difficult to set up (needs logging in to download),
|
|
||||||
// - with WD v1.4 ONNX models, one gets "Your ONNX model has been generated
|
|
||||||
// with INT64 weights, while TensorRT does not natively support INT64.
|
|
||||||
// Attempting to cast down to INT32." and that's not nice.
|
|
||||||
if (providers.count("TensorrtExecutionProvider")) {
|
|
||||||
OrtTensorRTProviderOptionsV2* tensorrt_options = nullptr;
|
|
||||||
Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options));
|
|
||||||
auto [keys, values] = unpack_options("TensorrtExecutionProvider");
|
|
||||||
if (!keys.empty()) {
|
|
||||||
Ort::ThrowOnError(api.UpdateTensorRTProviderOptions(
|
|
||||||
tensorrt_options, keys.data(), values.data(), keys.size()));
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
options.AppendExecutionProvider_TensorRT_V2(*tensorrt_options);
|
|
||||||
} catch (const std::exception &e) {
|
|
||||||
fprintf(stderr, "TensorRT unavailable: %s\n", e.what());
|
|
||||||
}
|
|
||||||
api.ReleaseTensorRTProviderOptions(tensorrt_options);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// See CUDA-ExecutionProvider.html for documentation.
|
|
||||||
if (providers.count("CUDAExecutionProvider")) {
|
|
||||||
OrtCUDAProviderOptionsV2* cuda_options = nullptr;
|
|
||||||
Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));
|
|
||||||
auto [keys, values] = unpack_options("CUDAExecutionProvider");
|
|
||||||
if (!keys.empty()) {
|
|
||||||
Ort::ThrowOnError(api.UpdateCUDAProviderOptions(
|
|
||||||
cuda_options, keys.data(), values.data(), keys.size()));
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
options.AppendExecutionProvider_CUDA_V2(*cuda_options);
|
|
||||||
} catch (const std::exception &e) {
|
|
||||||
fprintf(stderr, "CUDA unavailable: %s\n", e.what());
|
|
||||||
}
|
|
||||||
api.ReleaseCUDAProviderOptions(cuda_options);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (providers.count("ROCMExecutionProvider")) {
|
|
||||||
OrtROCMProviderOptions rocm_options = {};
|
|
||||||
auto [keys, values] = unpack_options("ROCMExecutionProvider");
|
|
||||||
if (!keys.empty()) {
|
|
||||||
Ort::ThrowOnError(api.UpdateROCMProviderOptions(
|
|
||||||
&rocm_options, keys.data(), values.data(), keys.size()));
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
options.AppendExecutionProvider_ROCM(rocm_options);
|
|
||||||
} catch (const std::exception &e) {
|
|
||||||
fprintf(stderr, "ROCM unavailable: %s\n", e.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// The CPU provider is the default fallback, if everything else fails.
|
|
||||||
}
|
|
||||||
|
|
||||||
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
||||||
|
|
||||||
struct Thumbnailing {
|
|
||||||
std::mutex input_mutex;
|
|
||||||
std::condition_variable input_cv;
|
|
||||||
std::queue<std::string> input; // All input paths
|
|
||||||
int work = 0; // Number of images requested
|
|
||||||
|
|
||||||
std::mutex output_mutex;
|
|
||||||
std::condition_variable output_cv;
|
|
||||||
std::vector<Magick::Image> output; // Processed images
|
|
||||||
int done = 0; // Finished worker threads
|
|
||||||
};
|
|
||||||
|
|
||||||
static void
|
|
||||||
thumbnail(const Config &config, int64_t width, int64_t height,
|
|
||||||
Thumbnailing &ctx)
|
|
||||||
{
|
|
||||||
while (true) {
|
|
||||||
std::unique_lock<std::mutex> input_lock(ctx.input_mutex);
|
|
||||||
ctx.input_cv.wait(input_lock,
|
|
||||||
[&]{ return ctx.input.empty() || ctx.work; });
|
|
||||||
if (ctx.input.empty())
|
|
||||||
break;
|
|
||||||
|
|
||||||
auto path = ctx.input.front();
|
|
||||||
ctx.input.pop();
|
|
||||||
ctx.work--;
|
|
||||||
input_lock.unlock();
|
|
||||||
|
|
||||||
Magick::Image image;
|
|
||||||
try {
|
|
||||||
image = load(path, config, width, height);
|
|
||||||
if (height != image.rows() || width != image.columns())
|
|
||||||
throw std::runtime_error("tensor mismatch");
|
|
||||||
|
|
||||||
std::unique_lock<std::mutex> output_lock(ctx.output_mutex);
|
|
||||||
ctx.output.push_back(image);
|
|
||||||
output_lock.unlock();
|
|
||||||
ctx.output_cv.notify_all();
|
|
||||||
} catch (const std::exception &e) {
|
|
||||||
fprintf(stderr, "%s: %s\n", path.c_str(), e.what());
|
|
||||||
|
|
||||||
std::unique_lock<std::mutex> input_lock(ctx.input_mutex);
|
|
||||||
ctx.work++;
|
|
||||||
input_lock.unlock();
|
|
||||||
ctx.input_cv.notify_all();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unique_lock<std::mutex> output_lock(ctx.output_mutex);
|
|
||||||
ctx.done++;
|
|
||||||
output_lock.unlock();
|
|
||||||
ctx.output_cv.notify_all();
|
|
||||||
}
|
|
||||||
|
|
||||||
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
||||||
|
|
||||||
static std::string
|
|
||||||
print_shape(const Ort::ConstTensorTypeAndShapeInfo &info)
|
|
||||||
{
|
|
||||||
std::vector<const char *> names(info.GetDimensionsCount());
|
|
||||||
info.GetSymbolicDimensions(names.data(), names.size());
|
|
||||||
|
|
||||||
auto shape = info.GetShape();
|
|
||||||
std::string result;
|
|
||||||
for (size_t i = 0; i < shape.size(); i++) {
|
|
||||||
if (shape[i] < 0)
|
|
||||||
result.append(names.at(i));
|
|
||||||
else
|
|
||||||
result.append(std::to_string(shape[i]));
|
|
||||||
result.append(" x ");
|
|
||||||
}
|
|
||||||
if (!result.empty())
|
|
||||||
result.erase(result.size() - 3);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
print_shapes(const Ort::Session &session)
|
|
||||||
{
|
|
||||||
Ort::AllocatorWithDefaultOptions allocator;
|
|
||||||
for (size_t i = 0; i < session.GetInputCount(); i++) {
|
|
||||||
std::string name = session.GetInputNameAllocated(i, allocator).get();
|
|
||||||
auto info = session.GetInputTypeInfo(i);
|
|
||||||
auto shape = print_shape(info.GetTensorTypeAndShapeInfo());
|
|
||||||
printf("Input: %s: %s\n", name.c_str(), shape.c_str());
|
|
||||||
}
|
|
||||||
for (size_t i = 0; i < session.GetOutputCount(); i++) {
|
|
||||||
std::string name = session.GetOutputNameAllocated(i, allocator).get();
|
|
||||||
auto info = session.GetOutputTypeInfo(i);
|
|
||||||
auto shape = print_shape(info.GetTensorTypeAndShapeInfo());
|
|
||||||
printf("Output: %s: %s\n", name.c_str(), shape.c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Run inference over `images` using the model described by `path`
// (a *.model configuration file; the network itself is the sibling *.onnx).
// Exits the process on unsupported model layouts or configuration errors.
static void
infer(Ort::Env &env, const char *path, const std::vector<std::string> &images)
{
	Config config;
	read_config(config, path);

	Ort::SessionOptions session_options;
	add_providers(session_options);

	Ort::Session session = Ort::Session(env,
		std::filesystem::path(path).replace_extension("onnx").c_str(),
		session_options);

	if (g.debug)
		print_shapes(session);

	// Only single-input/single-output classifiers are supported.
	if (session.GetInputCount() != 1 || session.GetOutputCount() != 1) {
		fprintf(stderr, "Invalid input or output shape\n");
		exit(EXIT_FAILURE);
	}

	// The input must be a 4-dimensional image tensor:
	// a batch dimension plus channels/height/width in some order.
	auto input_info = session.GetInputTypeInfo(0);
	auto shape = input_info.GetTensorTypeAndShapeInfo().GetShape();
	if (shape.size() != 4) {
		fprintf(stderr, "Incompatible input tensor format\n");
		exit(EXIT_FAILURE);
	}
	// A negative dimension is symbolic/variable.  A fixed batch size
	// above one would force exact-size batches, which isn't implemented,
	// and a fixed batch size means -b above one cannot be honoured.
	if (shape.at(0) > 1) {
		fprintf(stderr, "Fixed batching not supported\n");
		exit(EXIT_FAILURE);
	}
	if (shape.at(0) >= 0 && g.batch > 1) {
		fprintf(stderr, "Requested batching for a non-batching model\n");
		exit(EXIT_FAILURE);
	}

	// Locate the C/H/W dimensions within `shape`
	// according to the configured tensor layout.
	int64_t *height = {}, *width = {}, *channels = {};
	switch (config.shape) {
	case Config::Shape::NCHW:
		channels = &shape[1];
		height = &shape[2];
		width = &shape[3];
		break;
	case Config::Shape::NHWC:
		height = &shape[1];
		width = &shape[2];
		channels = &shape[3];
		break;
	}

	// Variable dimensions don't combine well with batches.
	if (*height < 0)
		*height = config.size;
	if (*width < 0)
		*width = config.size;
	if (*channels != 3 || *height < 1 || *width < 1) {
		fprintf(stderr, "Incompatible input tensor format\n");
		return;
	}

	// By only parallelizing image loads here during batching,
	// they never compete for CPU time with inference.
	Thumbnailing ctx;
	for (const auto &path : images)
		ctx.input.push(path);

	// Never spawn more thumbnailing workers than hardware threads
	// (hardware_concurrency() may return 0, meaning unknown).
	auto workers = g.batch;
	if (auto threads = std::thread::hardware_concurrency())
		workers = std::min(workers, long(threads));
	for (auto i = workers; i--; )
		std::thread(thumbnail, std::ref(config), *width, *height,
			std::ref(ctx)).detach();

	// Release up to one batch worth of work to the thumbnailers,
	// wait until either a full batch is ready or all workers finished,
	// then run inference on whatever was produced.  Repeat until done.
	while (true) {
		std::unique_lock<std::mutex> input_lock(ctx.input_mutex);
		ctx.work = g.batch;
		input_lock.unlock();
		ctx.input_cv.notify_all();

		std::unique_lock<std::mutex> output_lock(ctx.output_mutex);
		ctx.output_cv.wait(output_lock,
			[&]{ return ctx.output.size() == g.batch || ctx.done == workers; });

		if (!ctx.output.empty()) {
			run(ctx.output, config, session, shape);
			ctx.output.clear();
		}
		if (ctx.done == workers)
			break;
	}
}
|
|
||||||
|
|
||||||
// Parse command line options, collect image paths either from positional
// arguments or from standard input (--pipe), and run inference on them.
int
main(int argc, char *argv[])
{
	auto invocation_name = argv[0];
	auto print_usage = [=] {
		fprintf(stderr,
			"Usage: %s [-b BATCH] [--cpu] [-d] [-o EP;KEY=VALUE...] "
			"[-t THRESHOLD] MODEL { --pipe | [IMAGE...] }\n", invocation_name);
	};

	static option opts[] = {
		{"batch", required_argument, 0, 'b'},
		{"cpu", no_argument, 0, 'c'},
		{"debug", no_argument, 0, 'd'},
		{"help", no_argument, 0, 'h'},
		{"options", required_argument, 0, 'o'},
		{"pipe", no_argument, 0, 'p'},
		{"threshold", required_argument, 0, 't'},
		{nullptr, 0, 0, 0},
	};

	bool pipe = false;
	while (1) {
		int option_index = 0;
		auto c = getopt_long(argc, const_cast<char *const *>(argv),
			"b:cdho:pt:", opts, &option_index);
		if (c == -1)
			break;

		char *end = nullptr;
		switch (c) {
		case 'b':
			// strtol() signals failure via errno and/or unconsumed input.
			errno = 0, g.batch = strtol(optarg, &end, 10);
			if (errno || *end || g.batch < 1 || g.batch > SHRT_MAX) {
				fprintf(stderr, "Batch size must be a positive number\n");
				exit(EXIT_FAILURE);
			}
			break;
		case 'c':
			g.cpu = true;
			break;
		case 'd':
			// Repeated -d raises verbosity further (see `logging` below).
			g.debug++;
			break;
		case 'h':
			print_usage();
			return 0;
		case 'o':
			parse_options(optarg);
			break;
		case 'p':
			pipe = true;
			break;
		case 't':
			errno = 0, g.threshold = strtod(optarg, &end);
			if (errno || *end || !std::isfinite(g.threshold) ||
				g.threshold < 0 || g.threshold > 1) {
				fprintf(stderr, "Threshold must be a number within 0..1\n");
				exit(EXIT_FAILURE);
			}
			break;
		default:
			print_usage();
			return 1;
		}
	}

	argv += optind;
	argc -= optind;

	// TODO: There's actually no need to slurp all the lines up front.
	std::vector<std::string> paths;
	if (pipe) {
		// With --pipe, MODEL is the sole positional argument,
		// and image paths are read from standard input, one per line.
		if (argc != 1) {
			print_usage();
			return 1;
		}

		std::string line;
		while (std::getline(std::cin, line))
			paths.push_back(line);
	} else {
		if (argc < 1) {
			print_usage();
			return 1;
		}

		// argv[0] is MODEL, anything after it is an image path.
		paths.assign(argv + 1, argv + argc);
	}

	// Load batched images in parallel (the first is for GM, the other for IM).
	if (g.batch > 1) {
		auto value = std::to_string(
			std::max(std::thread::hardware_concurrency() / g.batch, 1L));
		setenv("OMP_NUM_THREADS", value.c_str(), true);
		setenv("MAGICK_THREAD_LIMIT", value.c_str(), true);
	}

	// XXX: GraphicsMagick initializes signal handlers here,
	// one needs to use MagickLib::InitializeMagickEx()
	// with MAGICK_OPT_NO_SIGNAL_HANDER to prevent that.
	//
	// ImageMagick conveniently has the opposite default.
	Magick::InitializeMagick(nullptr);

	OrtLoggingLevel logging = g.debug > 1
		? ORT_LOGGING_LEVEL_VERBOSE
		: ORT_LOGGING_LEVEL_WARNING;

	// Creating an environment before initializing providers in order to avoid:
	// "Attempt to use DefaultLogger but none has been registered."
	Ort::Env env(logging, invocation_name);
	infer(env, argv[0], paths);
	return 0;
}
|
|
@ -1,161 +0,0 @@
|
|||||||
#!/bin/sh -e
# Requirements: Python ~ 3.11, curl, unzip, git-lfs, awk
#
# This script downloads a bunch of models into the models/ directory,
# after any necessary transformations to run them using the deeptagger binary.
#
# Once it succeeds, feel free to remove everything but *.{model,tags,onnx}
git lfs install
mkdir -p models
cd models

# Create a virtual environment for model conversion.
#
# If any of the Python stuff fails,
# retry from within a Conda environment with a different version of Python.
export VIRTUAL_ENV=$(pwd)/venv
# NOTE(review): presumably set to avoid oneDNN-related warnings or numeric
# differences during TensorFlow conversion -- confirm before removing.
export TF_ENABLE_ONEDNN_OPTS=0
# The "ready" marker file makes this setup step idempotent across reruns.
if ! [ -f "$VIRTUAL_ENV/ready" ]
then
	python3 -m venv "$VIRTUAL_ENV"
	#"$VIRTUAL_ENV/bin/pip3" install tensorflow[and-cuda]
	"$VIRTUAL_ENV/bin/pip3" install tf2onnx 'deepdanbooru[tensorflow]'
	touch "$VIRTUAL_ENV/ready"
fi
|
|
||||||
|
|
||||||
# status MESSAGE... -- print a bold "-- MESSAGE" heading on standard output,
# so that progress reports stand out from the tools' own output.
status() {
	local bold=$(tput bold) normal=$(tput sgr0)
	echo "$bold-- $*$normal"
}
|
|
||||||
|
|
||||||
# Using the deepdanbooru package makes it possible to use other models
|
|
||||||
# trained with the project.
|
|
||||||
# deepdanbooru NAME URL -- download a DeepDanbooru release archive,
# convert it to ONNX, and write the deeptagger *.model description.
# Each step is skipped when its result already exists.
deepdanbooru() {
	local name=$1 url=$2
	status "$name"

	# Download the release archive unless it is already present.
	local basename=$(basename "$url")
	if ! [ -e "$basename" ]
	then curl -LO "$url"
	fi

	local modelname=${basename%%.*}
	if ! [ -d "$modelname" ]
	then unzip -d "$modelname" "$basename"
	fi

	# The project's tag list can be used by deeptagger as-is.
	if ! [ -e "$modelname.tags" ]
	then ln "$modelname/tags.txt" "$modelname.tags"
	fi

	# Export the project's model into TensorFlow's SavedModel format first.
	if ! [ -d "$modelname.saved" ]
	then "$VIRTUAL_ENV/bin/python3" - "$modelname" "$modelname.saved" <<-'END'
		import sys
		import deepdanbooru.project as ddp
		model = ddp.load_model_from_project(
		    project_path=sys.argv[1], compile_model=False)
		model.export(sys.argv[2])
	END
	fi

	# Then convert the SavedModel to ONNX for onnxruntime.
	if ! [ -e "$modelname.onnx" ]
	then "$VIRTUAL_ENV/bin/python3" -m tf2onnx.convert \
		--saved-model "$modelname.saved" --output "$modelname.onnx"
	fi

	# Describe the model's input expectations for deeptagger.
	cat > "$modelname.model" <<-END
		name=$name
		shape=nhwc
		channels=rgb
		normalize=true
		pad=edge
	END
}
|
|
||||||
|
|
||||||
# ONNX preconversions don't have a symbolic first dimension, thus doing our own.
|
|
||||||
# wd14 NAME HF_REPOSITORY -- clone a WD v1.4 tagger from Hugging Face,
# link its bundled ONNX export, and also produce a batch-capable
# "batch-*" re-export.  Each step is skipped when its result exists.
wd14() {
	local name=$1 repository=$2
	status "$name"

	local modelname=$(basename "$repository")
	if ! [ -d "$modelname" ]
	then git clone "https://huggingface.co/$repository"
	fi

	# Though link the original export as well.
	if ! [ -e "$modelname.onnx" ]
	then ln "$modelname/model.onnx" "$modelname.onnx"
	fi

	# The second CSV column holds the tag names; NR > 1 skips the header row.
	if ! [ -e "$modelname.tags" ]
	then awk -F, 'NR > 1 { print $2 }' "$modelname/selected_tags.csv" \
		> "$modelname.tags"
	fi

	# Describe the model's input expectations for deeptagger.
	cat > "$modelname.model" <<-END
		name=$name
		shape=nhwc
		channels=bgr
		normalize=false
		pad=white
	END

	# Re-export from the repository's SavedModel so that the first
	# (batch) dimension becomes symbolic.
	if ! [ -e "batch-$modelname.onnx" ]
	then "$VIRTUAL_ENV/bin/python3" -m tf2onnx.convert \
		--saved-model "$modelname" --output "batch-$modelname.onnx"
	fi

	if ! [ -e "batch-$modelname.tags" ]
	then ln "$modelname.tags" "batch-$modelname.tags"
	fi

	if ! [ -e "batch-$modelname.model" ]
	then ln "$modelname.model" "batch-$modelname.model"
	fi
}
|
|
||||||
|
|
||||||
# These models are an undocumented mess, thus using ONNX preconversions.
|
|
||||||
# mldanbooru NAME ONNX_BASENAME -- fetch an ML-Danbooru ONNX preconversion
# from the deepghs/ml-danbooru-onnx repository and describe it for deeptagger.
mldanbooru() {
	local name=$1 basename=$2
	status "$name"

	# All preconversions live in one shared repository; clone it once.
	if ! [ -d ml-danbooru-onnx ]
	then git clone https://huggingface.co/deepghs/ml-danbooru-onnx
	fi

	# Hard-link the chosen export into the current directory.
	local modelname=${basename%%.*}
	if ! [ -e "$basename" ]
	then ln "ml-danbooru-onnx/$basename"
	fi

	# The first CSV column holds the tag names; NR > 1 skips the header row.
	if ! [ -e "$modelname.tags" ]
	then awk -F, 'NR > 1 { print $1 }' ml-danbooru-onnx/tags.csv \
		> "$modelname.tags"
	fi

	# Describe the model's input expectations for deeptagger.
	cat > "$modelname.model" <<-END
		name=$name
		shape=nchw
		channels=rgb
		normalize=true
		pad=stretch
		size=640
		interpret=sigmoid
	END
}
|
|
||||||
|
|
||||||
status "Downloading models, beware that git-lfs doesn't indicate progress"

deepdanbooru DeepDanbooru \
	'https://github.com/KichangKim/DeepDanbooru/releases/download/v3-20211112-sgd-e28/deepdanbooru-v3-20211112-sgd-e28.zip'

# Only the v2 taggers are fetched by default;
# the v1 lines are left commented out for reference.
#wd14 'WD v1.4 ViT v1' 'SmilingWolf/wd-v1-4-vit-tagger'
wd14 'WD v1.4 ViT v2' 'SmilingWolf/wd-v1-4-vit-tagger-v2'
#wd14 'WD v1.4 ConvNeXT v1' 'SmilingWolf/wd-v1-4-convnext-tagger'
wd14 'WD v1.4 ConvNeXT v2' 'SmilingWolf/wd-v1-4-convnext-tagger-v2'
wd14 'WD v1.4 ConvNeXTV2 v2' 'SmilingWolf/wd-v1-4-convnextv2-tagger-v2'
wd14 'WD v1.4 SwinV2 v2' 'SmilingWolf/wd-v1-4-swinv2-tagger-v2'
wd14 'WD v1.4 MOAT v2' 'SmilingWolf/wd-v1-4-moat-tagger-v2'

# As suggested by author https://github.com/IrisRainbowNeko/ML-Danbooru-webui
mldanbooru 'ML-Danbooru Caformer dec-5-97527' 'ml_caformer_m36_dec-5-97527.onnx'
mldanbooru 'ML-Danbooru TResNet-D 6-30000' 'TResnet-D-FLq_ema_6-30000.onnx'
|
|
Loading…
x
Reference in New Issue
Block a user