#!/usr/bin/env tclsh # https://wiki.tcl-lang.org/page/exec+magic # Run this command to see if any threads have gotten stuck. # # Simplest use case, one argument, the name of the log file to check. # # This script will read the file backwards, looking for ThreadMonitor.C to give # us periodic status reports on each thread. We report the latest occurrence # for each thread. You would expect each thread to show up about once every # five minutes. If you have to go too far back in the logs, that means the # thread is locked up and not responding to new requests. # # Ideally we'd show you the threads we haven't found, because those are the # bad ones. But when we first start we don't know a complete list of what # we are looking for. Eventually we will go back to a time before the thread # got stuck, and we'll see the time of its last report. Just to be complete, # we also look for Thread Started messages. Maybe a thread got stuck before # its first periodic report. Maybe we started scanning before some threads # had a chance to report. # # Just to be complete we allow you to list multiple files on the command line. # list these in the order they were created. (That's the way ls usually lists # them.) We will process them in reverse order. We will stop as soon as we # find where the server restarted. # # Normally we expect a thread to check in about once every 300 seconds. That's # why you see 300 in the log in so many places. If it has a slow job it might # not be able to check in right away. We usually try to group the requests # so that slow jobs and fast jobs won't have to share a thread. Roughly the # 10 items or less line. # # Note: ThreadMonitor does not set a timer! We only check if it's time to # report when we do other work. On a typical customer facing system every # thread is receiving Socket_closed messages pretty frequently, so the updates # appear about when they should. On a test server or a # generate_alerts/realtime_alerts server you might not see a thread for a while # because it was sleeping and it had no work. You can always use telnet to # connect and immediately disconnect for flush all of the threads to the log. if {[llength $argv] == 0} { puts stderr "List one or more log files to be processed, oldest first.\n- means stdin" exit 6 } # TODO can we make the format match. TCL version includes the timezone. # From TCL: {Wed May 20 13:05:01 PDT 2020} # From Log files: {Wed May 20 13:05:01 2020} puts [list [clock format [clock seconds]] now] set restart_found 0 set files_processed 0 foreach file_name [lreverse $argv] { if {[regexp {[^-_.0-9a-zA-Z/]} $file_name]} { # TODO I'm sure we could do a better job of this. I was too lazy to # quote the spaces. I don't remember the rules for that, but it's # always tricky with shell commands, doubly so from TCL. If nothing # else I could make the white list bigger. puts stderr "Unable to process file "[list $file_name]".\nInvalid file name.\nAborting." exit 9 } incr files_processed puts ${file_name}: # TODO This process is slow. Consider using grep (in the shell command) # or regexp in TCL to get rid of a lot of these lines faster. # Alternatively a progress indicator should tell us how far back we've # gotten in the logs. set handle [open "|tac -- $file_name" r] set nothing_printed_yet 1 unset -nocomplain found({}) while {![eof $handle]} { set line [gets $handle] try { lindex $line 0 } on error {} { if {$nothing_printed_yet} { # Often the last line of the log file is incomplete and invalid # if the server crashed. That's the first line we'd review! # Can't stop there, but a warning is appropriate. puts stderr "Ignoring invalid line" puts stderr [list $line] continue } else { # We would have stopped anyway. Make a better error message. # If this a just an invalid file, we'll probably have lots of # errors, so that's why I abort. puts stderr "Aborting after reading an invalid line" puts stderr [list $line] exit 13 } } if {[lindex $line 2] == {ThreadMonitor.C}} { set thread_name [lindex $line 3] set reason ThreadMonitor.C } elseif {([lindex $line 2] == {Thread}) && ([lindex $line 3] == {Started})} { set thread_name [lindex $line 4] set reason {Thread Started} set restart_found 1 } elseif {$nothing_printed_yet} { set thread_name {} set reason {Oldest log entry} } else { continue } # TODO This logic has a flaw. What if the first line we see in a # file is ThreadMonitor.C, but it was not the first for that thread? # Seems like the result will be close enough for a debug tool. if {![info exists found($thread_name)]} { set found($thread_name) 1 puts [list [lindex $line 0] $reason $thread_name] set nothing_printed_yet 0 } } close $handle if {$restart_found} { puts {Restart found! ☻} break } } if {$files_processed != [llength $argv]} { puts "Processed $files_processed of [llength $argv] files." }