Add a rudimentary CMake script parser

2022-09-27 17:13:45 +02:00
parent 688c458095
commit af2756ee01
3 changed files with 278 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,3 +52,7 @@ foreach (name ${tests})
 	target_link_libraries (test-${name} ${common_libraries})
 	add_test (NAME test-${name} COMMAND test-${name})
 endforeach ()
+
+# Smoke-test the awk-based CMake parser by making it parse and dump this very
+# file; the parser exits with a non-zero status on any fatal error.
+add_test (NAME test-cmake-parser
+	COMMAND env LC_ALL=C awk -f "${PROJECT_SOURCE_DIR}/tools/cmake-parser.awk"
+	-f "${PROJECT_SOURCE_DIR}/tools/cmake-dump.awk" "${CMAKE_CURRENT_LIST_FILE}")
--- a/tools/cmake-dump.awk
+++ b/tools/cmake-dump.awk
@@ -0,0 +1,24 @@
+# cmake-dump.awk: dump parsed CMake scripts as tables
+#
+# Copyright (c) 2022, Přemysl Eric Janouch <p@janouch.name>
+# SPDX-License-Identifier: 0BSD
+#
+# Parsed scripts are output in a table, with commands separated using ASCII
+# Record Separators, and arguments using Unit Separators.
+#
+# Example usage: awk -f cmake-parser.awk -f cmake-dump.awk CMakeLists.txt \
+#  | sed 'y/\x1F\x1E\t\n/\t\n  /' \
+#  | sed -n '/^project\t\([^\t]*\).*\tVERSION\t\([^\t]*\).*/{s//\1 \2/p;q;}'
+
+# Return s unchanged, aborting through fatal() when it contains either of
+# the ASCII control characters used as delimiters in the dump: the Record
+# Separator (036) or the Unit Separator (037).  Octal escapes are used because
+# hexadecimal ones are not part of POSIX awk.
+function sanitize(s) {
+	if (s ~ /[\036\037]/)
+		fatal("conflicting ASCII control characters found in source")
+	return s
+}
+
+# For every record that left a parsed command behind, print the command name
+# followed by its arguments in source order, each argument preceded by a Unit
+# Separator, and terminate the whole entry with a Record Separator.
+Command {
+	# The parser stores arguments under consecutive indices starting at 1,
+	# but "for (i in Args)" visits keys in an unspecified order.  Count them
+	# first, then walk the indices numerically to keep the original order.
+	count = 0
+	for (i in Args)
+		count++
+
+	out = sanitize(Command)
+	for (i = 1; i <= count; i++)
+		out = out "\037" sanitize(Args[i])
+	printf "%s\036", out
+}
--- a/tools/cmake-parser.awk
+++ b/tools/cmake-parser.awk
@@ -0,0 +1,250 @@
+# cmake-parser.awk: rudimentary CMake script parser
+#
+# Copyright (c) 2022, Přemysl Eric Janouch <p@janouch.name>
+# SPDX-License-Identifier: 0BSD
+#
+# Implemented roughly according to the grammar described in cmake-language(7),
+# which is self-conflicting, and not an accurate description.
+#
+# The result of parsing is stored in the case-normalized Command variable,
+# and the Args array.  These can be used by subsequent scripts.
+
+# Print a non-fatal diagnostic to standard error, prefixed with the name of
+# the current input file and the line number within it.
+function warning(message,    where) {
+	where = FILENAME ":" FNR
+	print where ": warning: " message > "/dev/stderr"
+}
+
+# Print a diagnostic to standard error, prefixed with the name of the current
+# input file and the line number within it, then terminate with exit status 1.
+function fatal(message,    where) {
+	where = FILENAME ":" FNR
+	print where ": fatal error: " message > "/dev/stderr"
+	exit 1
+}
+
+# Assert that a parsing function has succeeded.  Parsing functions signal
+# failure with a numeric zero, whereas an empty string is a valid result,
+# so only the former aborts.  The value is passed through on success.
+function expect(v) {
+	if (v || v != 0)
+		return v
+	fatal("broken expectations at `" $0 "'")
+	return v
+}
+
+# If $0 starts with the string v, strip that prefix from $0 and return 1.
+# Otherwise leave $0 alone and return 0.
+function literal(v,    n) {
+	n = length(v)
+	if (substr($0, 1, n) == v) {
+		$0 = substr($0, n + 1)
+		return 1
+	}
+	return 0
+}
+
+# If the regular expression re matches at the very start of $0, strip
+# the matched text from $0 and return 1.  Otherwise leave $0 alone and
+# return 0.  RLENGTH is left as set by match(), so that callers can find out
+# how much has been consumed.
+function regexp(re) {
+	if (match($0, "^" re)) {
+		$0 = substr($0, RLENGTH + 1)
+		return 1
+	}
+	return 0
+}
+
+# Strip a run of spaces and tabs from the start of $0.
+# Returns 1 if there was any, 0 otherwise.
+function space() {
+	if (!match($0, /^[ \t]+/))
+		return 0
+	$0 = substr($0, RLENGTH + 1)
+	return 1
+}
+
+# Consume input up to and including the closing bracket, which consists of
+# "]", len equal signs, and another "]", reading further lines as necessary.
+# On entry, $0 holds whatever followed the opening bracket; on return, it holds
+# whatever follows the closing one.  Returns the text in between, with lines
+# joined using RS.  Running out of input is a fatal error.
+#
+# The closing bracket is searched for as a fixed string rather than with
+# an "={n}" interval expression, which not all awk implementations support
+# in regular expressions.  In accordance with cmake-language(7), one newline
+# immediately following the opening bracket is not part of the content.
+function unbracket(len,    v, closing, at, i) {
+	closing = "]"
+	for (i = 0; i < len; i++)
+		closing = closing "="
+	closing = closing "]"
+
+	# Nothing left on the opening line: skip the ignored newline.
+	if ($0 == "") {
+		if (getline <= 0)
+			fatal("unterminated bracket")
+	}
+
+	do {
+		if ((at = index($0, closing))) {
+			v = v substr($0, 1, at - 1)
+			$0 = substr($0, at + length(closing))
+			return v
+		}
+		v = v $0 RS
+	} while (getline > 0)
+	fatal("unterminated bracket")
+}
+
+# Skip a bracket comment, i.e., "#" directly followed by a bracket opener,
+# at the start of $0, possibly spanning several lines.
+# Returns 1 if there was one, 0 otherwise.
+function bracket_comment(    opener) {
+	if (match($0, /^#\[=*\[/)) {
+		# The opener is "#[", a run of equal signs, and "[",
+		# thus the run is three characters shorter than the match.
+		opener = RLENGTH
+		$0 = substr($0, RSTART + opener)
+		unbracket(opener - 3)
+		return 1
+	}
+	return 0
+}
+
+# Skip any whitespace and bracket comments at the start of $0, as well as
+# a trailing line comment.  Returns 1 if nothing else remains on the line,
+# 0 otherwise.
+function line_ending() {
+	while (space() || bracket_comment()) {}
+	if (/^#/)
+		$0 = ""
+
+	# Compare against the empty string explicitly: $0 may be a numeric string,
+	# and a remainder such as "0" would otherwise be mistaken for nothing.
+	return $0 == ""
+}
+
+# ------------------------------------------------------------------------------
+
+# While elementary expansion of previously set variables is implementable,
+# it doesn't seem to be worth the effort.
+#
+# Look for unescaped ${...}, $ENV{...} and $CACHE{...} references in s,
+# and issue a warning upon finding the first one.  The argument is always
+# returned unchanged.
+function expand(s,    v) {
+	v = s
+	# An empty alternative, as in "(|ENV|CACHE)", is undefined in POSIX
+	# extended regular expressions, hence the optional group.
+	while (match(v, /\\*[$](ENV|CACHE)?[{]/)) {
+		# The match starts with the backslashes preceding the dollar sign.
+		# The sign landing on an odd position means an even number of them,
+		# in which case the sign itself is not escaped.
+		if (index(substr(v, RSTART), "$") % 2 != 0) {
+			warning("variable expansion is not supported: " s)
+			return s
+		}
+		v = substr(v, RSTART + RLENGTH)
+	}
+	return s
+}
+
+# Parse a backslash escape sequence at the start of $0.  Returns:
+#  - 0 if $0 doesn't start with a backslash,
+#  - a tab, carriage return or newline for \t, \r and \n respectively,
+#  - the escaped character itself for non-alphanumeric characters,
+#  - an empty string for a backslash at the end of the line,
+#    in which case the following line is read into $0.
+# Any other alphanumeric escapes, and running out of input, are fatal errors.
+function escape_sequence(    v) {
+	if (!literal("\\"))
+		return 0
+
+	if (literal("t")) return "\t"
+	if (literal("r")) return "\r"
+	if (literal("n")) return "\n"
+
+	# escape_semicolon isn't treated any specially here.
+	if (regexp("[A-Za-z0-9]"))
+		fatal("unsupported escape sequence")
+
+	# Compare against the empty string explicitly: $0 may be a numeric string,
+	# and a remainder such as " 0" would otherwise be mistaken for nothing.
+	if ($0 != "") {
+		v = substr($0, 1, 1)
+		$0 = substr($0, 2)
+		return v
+	}
+	if (getline > 0)
+		return ""
+	fatal("premature end of file")
+}
+
+# Parse a quoted argument at the start of $0, reading further lines
+# as necessary.  Returns 0 if $0 doesn't start with a double quote,
+# otherwise the contents of the argument, which may be an empty string.
+#
+# Escape sequences are resolved, except that escaped backslashes and dollar
+# signs keep a backslash in front of them, and escaped semicolons end up
+# preceded by two, leaving these for the caller to reprocess.
+# Running out of input before the closing quote is a fatal error.
+function quoted_argument(    v, unescaped) {
+	if (!literal("\""))
+		return 0
+
+	v = ""
+	while (!literal("\"")) {
+		# Compare against the empty string explicitly: $0 may be a numeric
+		# string, and a line such as "0" within a multi-line argument
+		# would otherwise be mistaken for an empty one and dropped.
+		if ($0 == "") {
+			if (getline <= 0)
+				fatal("premature end of file")
+			v = v RS
+		} else if ((unescaped = escape_sequence())) {
+			if (unescaped == "\\" || unescaped == "$")
+				v = v "\\"
+			else if (unescaped == ";")
+				v = v "\\\\"
+			v = v unescaped
+		} else if (unescaped == "") {
+			# quoted_continuation
+		} else {
+			v = v substr($0, 1, 1)
+			$0 = substr($0, 2)
+		}
+	}
+	return v
+}
+
+# Parse an unquoted argument at the start of $0, and return its text, which is
+# left unset when there is nothing to parse.  Escape sequences are resolved,
+# except that escaped backslashes, dollar signs and semicolons keep their
+# backslash.  A backslash at the end of a line is a fatal error.
+function unquoted_argument(    v, unescaped) {
+	while (1) {
+		# A run of characters with no special meaning is taken verbatim.
+		if (match($0, /^[^[:space:]()#"\\]+/)) {
+			v = v substr($0, 1, RLENGTH)
+			$0 = substr($0, RLENGTH + 1)
+			continue
+		}
+
+		unescaped = escape_sequence()
+		if (unescaped) {
+			if (unescaped == "\\" || unescaped == "$" || unescaped == ";")
+				v = v "\\"
+			v = v unescaped
+			continue
+		}
+
+		# A numeric zero stands for no escape sequence at all,
+		# in which case the argument ends here.
+		# unquoted_legacy is not supported.
+		if (unescaped != "")
+			return v
+		fatal("unexpected backslash in an unquoted argument")
+	}
+}
+
+# Parse one argument of any kind at the start of $0, and append what it yields
+# to Args, incrementing N.  Returns 1 if an argument was consumed, 0 otherwise.
+#
+# Note that quoted and unquoted arguments arrive with some escape sequences
+# still in place, which are only resolved in here.
+function argument(    arg, expanded, v, head) {
+	# bracket_argument: the contents are taken verbatim.
+	if (regexp("\\[=*\\[")) {
+		Args[++N] = unbracket(RLENGTH - 2)
+		return 1
+	}
+
+	# quoted_argument: always exactly one element, possibly an empty one.
+	arg = quoted_argument()
+	if (arg || arg == "") {
+		expanded = expand(arg)
+		while (match(expanded, /\\./)) {
+			v = v substr(expanded, 1, RSTART - 1)
+			v = v substr(expanded, RSTART + 1, 1)
+			expanded = substr(expanded, RSTART + RLENGTH)
+		}
+		Args[++N] = v expanded
+		return 1
+	}
+
+	# unquoted_argument: unescaped semicolons split it into several elements,
+	# of which the empty ones are thrown away.
+	arg = unquoted_argument()
+	if (!arg)
+		return 0
+
+	expanded = expand(arg)
+	while (expanded) {
+		head = substr(expanded, 1, 1)
+		if (head == ";") {
+			if (v)
+				Args[++N] = v
+			v = ""
+			expanded = substr(expanded, 2)
+		} else if (expanded ~ /^\\./) {
+			v = v substr(expanded, 2, 1)
+			expanded = substr(expanded, 3)
+		} else {
+			v = v head
+			expanded = substr(expanded, 2)
+		}
+	}
+	if (v)
+		Args[++N] = v
+	return 1
+}
+
+# ------------------------------------------------------------------------------
+
+# Strip an identifier, i.e., a letter or an underscore followed by any number
+# of letters, digits and underscores, from the start of $0, and return it.
+# Returns 0 if $0 doesn't start with one.
+function identifier(    v) {
+	if (match($0, /^[A-Za-z_][A-Za-z0-9_]*/)) {
+		v = substr($0, 1, RLENGTH)
+		$0 = substr($0, RLENGTH + 1)
+		return v
+	}
+	return 0
+}
+
+# Consume a single unit of separation between arguments: a run of spaces and
+# tabs, a bracket comment, or the end of a line, upon which the next line is
+# read in.  Returns 1 if anything was consumed, 0 otherwise.
+# Running out of input at the end of a line is a fatal error.
+function separation() {
+	if (space())
+		return 1
+	if (bracket_comment())
+		return 1
+
+	if (!line_ending())
+		return 0
+	if (getline <= 0)
+		fatal("premature end of file")
+	return 1
+}
+
+# Parse a complete command invocation starting at $0, reading further lines
+# as necessary.  Returns 0 with Command set to 0 if $0 doesn't start with
+# an identifier, after any leading whitespace has been stripped.  Otherwise
+# returns 1, with the lowercased command name in Command, its arguments
+# in Args[1] through Args[N], and the rest of the last line in $0.
+#
+# Nested parentheses are stored as separate "(" and ")" arguments.
+function command_invocation(    depth) {
+	while (space()) {}
+	if (!(Command = identifier()))
+		return 0
+	Command = tolower(Command)
+	while (space()) {}
+
+	# Splitting an empty string clears out the array.
+	split("", Args)
+	N = 0
+
+	expect(literal("("))
+	while (1) {
+		while (separation()) {}
+		if (literal(")")) {
+			# An unnested closing parenthesis ends the invocation.
+			if (depth == 0)
+				break
+			depth--
+			Args[++N] = ")"
+		} else if (literal("(")) {
+			depth++
+			Args[++N] = "("
+		} else {
+			expect(argument())
+			# Arguments must be separated, unless followed by parentheses.
+			if (!/^[()]/)
+				expect(separation())
+		}
+	}
+	return 1
+}
+
+# Main rule: try to parse a command invocation starting at the current record,
+# which may consume further input lines, and require that nothing other than
+# whitespace and comments follows on the line where parsing has stopped.
+# Any scripts loaded after this one can inspect Command and Args in their
+# own rules.
+{
+	command_invocation()
+	expect(line_ending())
+}