#!/usr/bin/env perl # To speed up processing of large files, GNU parallel can be used: # $ parallel --pipe -k json-format.pl INPUT use strict; use warnings; use Term::ANSIColor; use Getopt::Long; my $reset = color('reset'); my %format = ( FIELD => color('bold'), NULL => color('cyan'), BOOL => color('red'), NUMBER => color('magenta'), STRING => color('blue'), ERROR => color('bold white on_red'), ); my $color = 'auto'; my $help; if (!GetOptions('color=s' => \$color, 'help' => \$help) || $help) { print STDERR "Usage: $0 [OPTION...] [FILE...]\n" . "Pretty-print and colorify JSON\n" . "\n" . " --help print this help\n" . " --color=COLOR 'always', 'never' or 'auto' (the default)\n"; exit 2; } %format = () if $color eq 'never' || $color eq 'auto' && !-t STDOUT; # Hash lookup is the fastest way to qualify tokens, however it cannot be used # for everything and we need to fall back to regular expressions my %lookup = ( '[' => 'LBRACKET', '{' => 'LBRACE', ']' => 'RBRACKET', '}' => 'RBRACE', ':' => 'COLON', ',' => 'COMMA', 'true' => 'BOOL', 'false' => 'BOOL', 'null' => 'NULL', ); my @pats = ( ['"(?:[^\\\\"]*|\\\\(?:u[\da-f]{4}|["\\\\/bfnrt]))*"' => 'STRING'], ['-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?' => 'NUMBER'], ['[ \t\r\n]+' => 'WS'], ); my @tokens = map {[qr/^$_->[0]$/s, $_->[1]]} @pats; # m//g is the fastest way to explode text into tokens in the first place # and we need to construct an all-encompassing regular expression for it my @all_pats = map {$_->[0]} @pats; push @all_pats, quotemeta for keys %lookup; my $any_token = qr/\G(${\join '|', @all_pats})/; # FIXME: this probably shouldn't be a global variable my $indent = 0; sub nexttoken ($) { my $ref = shift; return unless @$ref; my $text = shift @$ref; if (my $s = $lookup{$text}) { return $s, $text; } for my $s (@tokens) { return $s->[1], $text if $text =~ $s->[0]; } return 'ERROR', $text; } sub printindent () { print "\n"; print ' ' x $indent; } sub do_value ($$$); sub do_object ($) { my $json = shift; my $in_field_name = 1; my $first = 1; while (my ($token, $text) = nexttoken $json) { if ($token eq 'COLON') { $in_field_name = 0; } elsif ($token eq 'COMMA') { $in_field_name = 1; } elsif ($token eq 'STRING') { $token = 'FIELD' if $in_field_name; } if ($token eq 'RBRACE') { $indent--; printindent; } elsif ($first) { printindent; $first = 0; } do_value $token, $text, $json; return if $token eq 'RBRACE'; } } sub do_array ($) { my $json = shift; my $first = 1; while (my ($token, $text) = nexttoken $json) { if ($token eq 'RBRACKET') { $indent--; printindent; } elsif ($first) { printindent; $first = 0; } do_value $token, $text, $json; return if $token eq 'RBRACKET'; } } sub do_value ($$$) { my ($token, $text, $json) = @_; if (my $format = $format{$token}) { print $format, $text, $reset; } else { print $text; } if ($token eq 'LBRACE') { $indent++; do_object $json; } elsif ($token eq 'LBRACKET') { $indent++; do_array $json; } elsif ($token eq 'COMMA') { printindent; } elsif ($token eq 'COLON') { print ' '; } } while (<>) { # FIXME: this way it doesn't work with pre-formatted JSON my $json = $_; my @matches = $json =~ /$any_token/gsc; push @matches, substr $json, pos $json if pos $json != length $json; while (my ($token, $text) = nexttoken \@matches) { next if $token eq 'WS'; do_value $token, $text, \@matches; } print "\n"; }