#!/usr/bin/env tclsh
# https://wiki.tcl-lang.org/page/exec+magic

# Run this command to see if any threads have gotten stuck.
#
# Simplest use case, one argument, the name of the log file to check.
#
# This script will read the file backwards, looking for ThreadMonitor.C to give
# us periodic status reports on each thread.  We report the latest occurrence
# for each thread.  You would expect each thread to show up about once every
# five minutes.  If you have to go too far back in the logs, that means the
# thread is locked up and not responding to new requests.
#
# Ideally we'd show you the threads we haven't found, because those are the
# bad ones.  But when we first start we don't know a complete list of what
# we are looking for.  Eventually we will go back to a time before the thread
# got stuck, and we'll see the time of its last report.  Just to be complete,
# we also look for Thread Started messages.  Maybe a thread got stuck before
# its first periodic report.  Maybe we started scanning before some threads
# had a chance to report.
#
# Just to be complete we allow you to list multiple files on the command line.
# list these in the order they were created.  (That's the way ls usually lists
# them.)  We will process them in reverse order.  We will stop as soon as we
# find where the server restarted.
#
# Normally we expect a thread to check in about once every 300 seconds.  That's
# why you see 300 in the log in so many places.  If it has a slow job it might
# not be able to check in right away.  We usually try to group the requests
# so that slow jobs and fast jobs won't have to share a thread.  Roughly the
# 10 items or less line.
#
# Note:  ThreadMonitor does not set a timer!  We only check if it's time to
# report when we do other work.  On a typical customer facing system every
# thread is receiving Socket_closed messages pretty frequently, so the updates
# appear about when they should.  On a test server or a
# generate_alerts/realtime_alerts server you might not see a thread for a while
# because it was sleeping and it had no work.  You can always use telnet to
# connect and immediately disconnect for flush all of the threads to the log.

if {[llength $argv] == 0} {
    puts stderr "List one or more log files to be processed, oldest first.\n- means stdin"
    exit 6
}

# TODO can we make the format match.  TCL version includes the timezone.
# From TCL:  {Wed May 20 13:05:01 PDT 2020}
# From Log files:  {Wed May 20 13:05:01 2020}
puts [list [clock format [clock seconds]] now]

set restart_found 0
set files_processed 0
foreach file_name [lreverse $argv] {

    if {[regexp {[^-_.0-9a-zA-Z/]} $file_name]} {
	# TODO I'm sure we could do a better job of this.  I was too lazy to
	# quote the spaces.  I don't remember the rules for that, but it's
	# always tricky with shell commands, doubly so from TCL.  If nothing
	# else I could make the white list bigger.
	puts stderr "Unable to process file "[list $file_name]".\nInvalid file name.\nAborting."
	exit 9
    }
    
    incr files_processed
    puts ${file_name}:
    # TODO This process is slow.  Consider using grep (in the shell command)
    # or regexp in TCL to get rid of a lot of these lines faster.
    # Alternatively a progress indicator should tell us how far back we've
    # gotten in the logs.
    set handle [open "|tac -- $file_name" r]
    set nothing_printed_yet 1
    unset -nocomplain found({})
    while {![eof $handle]} {
	set line [gets $handle]

	try {
	    lindex $line 0
	} on error {} {
	    if {$nothing_printed_yet} {
		# Often the last line of the log file is incomplete and invalid
		# if the server crashed.  That's the first line we'd review!
		# Can't stop there, but a warning is appropriate.
		puts stderr "Ignoring invalid line"
		puts stderr [list $line]
		continue
	    } else {
		# We would have stopped anyway.  Make a better error message.
		# If this a just an invalid file, we'll probably have lots of
		# errors, so that's why I abort.
		puts stderr "Aborting after reading an invalid line"
		puts stderr [list $line]
		exit 13
	    }
	}
	
	if {[lindex $line 2] == {ThreadMonitor.C}} {
	    set thread_name [lindex $line 3]
	    set reason ThreadMonitor.C
	} elseif {([lindex $line 2] == {Thread})
		  && ([lindex $line 3] == {Started})} {
	    set thread_name [lindex $line 4]
	    set reason {Thread Started}
	    set restart_found 1
	} elseif {$nothing_printed_yet} {
	    set thread_name {}
	    set reason {Oldest log entry}
	} else {
	    continue
	}
	# TODO This logic has a flaw.  What if the first line we see in a
	# file is ThreadMonitor.C, but it was not the first for that thread?
	# Seems like the result will be close enough for a debug tool.
	if {![info exists found($thread_name)]} {
	    set found($thread_name) 1
	    puts [list [lindex $line 0] $reason $thread_name]
	    set nothing_printed_yet 0
	}
    }
    close $handle
    if {$restart_found} {
	puts {Restart found! ☻}
	break
    }
}
if {$files_processed != [llength $argv]} {
    puts "Processed $files_processed of [llength $argv] files."
}