Load images in multiple threads

This worsens CPU-only times by some five percent, but can also make GPU-accelerated runtime twice as fast.
2024-01-18 00:54:40 +01:00
parent b4f28814b7
commit 36f6612603
1 changed files with 92 additions and 23 deletions
--- a/deeptagger/deeptagger.cpp
+++ b/deeptagger/deeptagger.cpp
@@ -6,13 +6,17 @@
 #endif
 #include <algorithm>
 #include <condition_variable>
 #include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <mutex>
 #include <queue>
 #include <regex>
 #include <set>
 #include <stdexcept>
 #include <string>
 #include <thread>
 #include <tuple>
 #include <cstdio>
@@ -435,6 +439,62 @@ add_providers(Ort::SessionOptions &options)
 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 struct Thumbnailing {
 	std::mutex input_mutex;
 	std::condition_variable input_cv;
 	std::queue<std::string> input;      // All input paths
 	int work = 0;                       // Number of images requested
 	std::mutex output_mutex;
 	std::condition_variable output_cv;
 	std::vector<Magick::Image> output;  // Processed images
 	int done = 0;                       // Finished worker threads
 };
 static void
 thumbnail(const Config &config, int64_t width, int64_t height,
 	Thumbnailing &ctx)
 {
 	while (true) {
 		std::unique_lock<std::mutex> input_lock(ctx.input_mutex);
 		ctx.input_cv.wait(input_lock,
 			[&]{ return ctx.input.empty() || ctx.work; });
 		if (ctx.input.empty())
 			break;
 		auto path = ctx.input.front();
 		ctx.input.pop();
 		ctx.work--;
 		input_lock.unlock();
 		Magick::Image image;
 		try {
 			image = load(path, config, width, height);
 			if (height != image.rows() || width != image.columns())
 				throw std::runtime_error("tensor mismatch");
 			std::unique_lock<std::mutex> output_lock(ctx.output_mutex);
 			ctx.output.push_back(image);
 			output_lock.unlock();
 			ctx.output_cv.notify_all();
 		} catch (const std::exception &e) {
 			fprintf(stderr, "%s: %s\n", path.c_str(), e.what());
 			std::unique_lock<std::mutex> input_lock(ctx.input_mutex);
 			ctx.work++;
 			input_lock.unlock();
 			ctx.input_cv.notify_all();
 		}
 	}
 	std::unique_lock<std::mutex> output_lock(ctx.output_mutex);
 	ctx.done++;
 	output_lock.unlock();
 	ctx.output_cv.notify_all();
 }
 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 static std::string
 print_shape(const Ort::ConstTensorTypeAndShapeInfo &info)
 {
@@ -533,30 +593,34 @@ infer(Ort::Env &env, const char *path, const std::vector<std::string> &images)
 		return;
 	}
-	// TODO: Image loading is heavily parallelizable. In theory.
+	// By only parallelizing image loads here during batching,
-	std::vector<Magick::Image> batch;
+	// they never compete for CPU time with inference.
-	for (const auto &filename : images) {
+	Thumbnailing ctx;
-		Magick::Image image;
+	for (const auto &path : images)
-		try {
+		ctx.input.push(path);
-			image = load(filename, config, *width, *height);
+	for (auto i = g.batch; i--; )
-		} catch (const std::exception &e) {
+		std::thread(thumbnail, std::ref(config), *width, *height,
-			fprintf(stderr, "%s: %s\n", filename.c_str(), e.what());
+			std::ref(ctx)).detach();
 			continue;
 		}
-		if (*height != image.rows() || *width != image.columns()) {
+	while (true) {
-			fprintf(stderr, "%s: %s\n", filename.c_str(), "tensor mismatch");
+		std::unique_lock<std::mutex> input_lock(ctx.input_mutex);
-			continue;
+		ctx.work = g.batch;
-		}
+		input_lock.unlock();
 		ctx.input_cv.notify_all();
-		batch.push_back(image);
+		std::unique_lock<std::mutex> output_lock(ctx.output_mutex);
-		if (batch.size() == g.batch) {
+		ctx.output_cv.wait(output_lock,
-			run(batch, config, session, shape);
+			[&]{ return ctx.output.size() == g.batch || ctx.done == g.batch; });
-			batch.clear();
+
 		// It would be possible to add dummy entries to the batch,
 		// so that the model doesn't need to be rebuilt.
 		if (!ctx.output.empty()) {
 			run(ctx.output, config, session, shape);
 			ctx.output.clear();
 		}
 		if (ctx.done == g.batch)
 			break;
 	}
 	if (!batch.empty())
 		run(batch, config, session, shape);
 }
 int
@@ -649,14 +713,19 @@ main(int argc, char *argv[])
 		paths.assign(argv + 1, argv + argc);
 	}
 	// Load batched images in parallel (the first is for GM, the other for IM).
 	if (g.batch > 1) {
 		auto value = std::to_string(
 			std::max(std::thread::hardware_concurrency() / g.batch, 1L));
 		setenv("OMP_NUM_THREADS", value.c_str(), true);
 		setenv("MAGICK_THREAD_LIMIT", value.c_str(), true);
 	}
 	// XXX: GraphicsMagick initializes signal handlers here,
 	// one needs to use MagickLib::InitializeMagickEx()
 	// with MAGICK_OPT_NO_SIGNAL_HANDER to prevent that.
 	//
 	// ImageMagick conveniently has the opposite default.
 	//
 	// Once processing images in parallel, consider presetting
 	// OMP_NUM_THREADS=1 (GM) and/or MAGICK_THREAD_LIMIT=1 (IM).
 	Magick::InitializeMagick(nullptr);
 	OrtLoggingLevel logging = g.debug > 1