#include #include "../shared/ThreadMonitor.h" #include "OneTimeTimeout.h" ///////////////////////////////////////////////////////////////////// // OneTimeTimeoutRequest ///////////////////////////////////////////////////////////////////// void OneTimeTimeoutRequest::send() { callbackId = _finalId; _finalDestination->newRequest(this); } ///////////////////////////////////////////////////////////////////// // OneTimeTimeoutThread ///////////////////////////////////////////////////////////////////// // The request must exist. We assert that. That didn't have to be // a requirement. But that's how it's used. And I'm hunting a core // dump. So I assert what I expect to be true. void OneTimeTimeoutThread::remove(OneTimeTimeoutRequest *request) { int removed = 0; removed += _byTime.erase(ByTimeItem(request->getTimeout(), request)); removed += _bySocket.erase(BySocketItem(request->getSocketInfo(), request)); removed += _byRequest.erase(request); assert(removed == 3); } static const std::string s_removed_by_socket = "removed_by_socket"; void OneTimeTimeoutThread::remove(SocketInfo *socket) { while (true) { const BySocket::iterator it = _bySocket.lower_bound(BySocketItem(socket, NULL)); if ((it == _bySocket.end()) || (it->first != socket)) return; OneTimeTimeoutRequest *const request = it->second; remove(request); // The previous line will invalidate our iterator, so don't try ++ on it. delete request; ThreadMonitor::find().increment(s_removed_by_socket); } } void OneTimeTimeoutThread::add(OneTimeTimeoutRequest *request) { // Please look at the changes in revision 1.21 of Messages.C. There are // a lot of notes in this file about different types of mysterious failures. // I'm pretty sure there was no problem in this file, and the bug was // actually in Messages.C // // The following assertion failed on me on 8/5/2016 at 11:00am. // // That wasn't a complete surprise. We had a couple of similar // failures right after pulling an object out of these data // structures. The new failure suggests that the object was // bad here, before we inserted it, rather than expiring while // it was in this structure. // // But the recent core dump didn't completely make sense. // There were two strange things going on. // // 1) The core dump made it clear that the assertion on the // next line failed. But I used the debugger to check on // the details. objectIsValid() is a trivial function // which I could emulate in the debugger. According to my // tests, objectIsValid() should have returned true. // // 2) If you walk backwards from here you can see that we just // received the request from a normal event queue. So // it should have been a good request with a good socket. // Look at "case mtSubmit" below. It's trivial. I don't // see how it could have failed. // // One more detail. I don't know if it's important, but it // seemed interesting. I looked at the SocketInfo object in // the debugger. All of the fields looked reasonable, like // it wasn't just random memory. But the interesting part is // that the going down flag was set to true. I don't see how // that helps, but it does seem unusual. // // I'm looking at another core dump. This one from August 10, // 2016, 8:17am. This one is different from what I described // above. This time the socket serial number was 0, the file // handle was 0, eof was false, going down was false. Using // _validator I can see that the socket's serial number was // 11150862 and _lastSerialNumber was exactly 1 higher than // that. assert(request->getSocketInfo()->objectIsValid()); _byTime.insert(ByTimeItem(request->getTimeout(), request)); _bySocket.insert(BySocketItem(request->getSocketInfo(), request)); _byRequest.insert(request); } static const std::string s_fired = "fired"; void OneTimeTimeoutThread::threadFunction() { const std::string s_SUBMIT = "mtSubmit"; ThreadMonitor &tm = ThreadMonitor::find(); while (true) { _incoming.resetWaitHandle(); while (Request *current = _incoming.getRequest()) { if (SocketInfo *socket = current->getSocketInfo()) // I've seen this fail twice recently on dice. (Oct 19 10:11 & // Oct 17 11:04) In one case it looked like the SocketInfo had // been destroyed and partially overwritten. In the other case // the socket looked perfect. It wasn't clear why the assertion // failed. Best guess, someone deleted the socket, then we called // objectIsValid(), which failed, then another thread reused that // memory and put another socket there, before assert() finished. assert(socket->objectIsValid()); switch (current->callbackId) { case mtSubmit: { tm.increment(s_SUBMIT); OneTimeTimeoutRequest *request = dynamic_cast(current); add(request); current = NULL; break; } case mtQuit: assert(false && "not implemented"); break; case DeleteSocketThread::callbackId: { SocketInfo *socket = current->getSocketInfo(); remove(socket); break; } } delete current; } while (true) { if (_byTime.empty()) { _incoming.waitForRequest(); break; } ByTimeItem const &next = *_byTime.begin(); timeval waitTime = next.first.waitTime(); if (waitTime.tv_sec || waitTime.tv_usec) { _incoming.waitForRequest(&waitTime); break; } OneTimeTimeoutRequest *const request = next.second; remove(request); // The next line caused a core dump a couple of times. The socket // associated with the request was already closed. request->send(); // Still having trouble. I added some debug stuff to the code. The // last build and install was on 8/17/2016. It is now 9/11/2016. // I've seen two core dumps since that time. They were both on dice. // One was at 4:05 am on 8/31/2016. The second was exactly 22 // minutes later. // // In both cases the OneTimeTimeoutRequest looked good, aside from // pointing to a bad SocketInfo. In one case the entire SocketInfo // object had been overwritten. In the other case the first part of // the object was clearly overwritten. The second part showed that // it was a valid object which had been properly destroyed. // // (In particular, the id was overwritten with 0, as is done in the // destructor. But the OneTimeTimeoutRequest contains a copy of // that which doesn't get overwriten, so I could check the // SocketInfo::_validator field. That 64 bit field was perfect.) // // I also checked the logs. We get here a lot. s_fired happens // constantly. Approximately 10% of all OneTimeTimeoutRequest // objects end this way. (The ratio is slightly higher during market // hours but slightly lower in the middle of the night.) tm.increment(s_fired); } } } OneTimeTimeoutThread::OneTimeTimeoutThread() : ThreadClass("OneTimeTimeoutThread"), _incoming("OneTimeTimeoutThread") { startThread(); } OneTimeTimeoutThread::~OneTimeTimeoutThread() { // This should never be called. We could add a standard shutdown routine, // but it would be pointless. assert(false && "Not Implemented"); } void OneTimeTimeoutThread::submit(OneTimeTimeoutRequest *request) { request->callbackId = mtSubmit; _incoming.newRequest(request); } OneTimeTimeoutThread *OneTimeTimeoutThread::_instance = NULL; void OneTimeTimeoutThread::initialize() { // For simplicity we require the main thread to initialize this thread. // That avoids the need for locks, and it's not inconsistent with our // normal initialization procedures. assert(!_instance); _instance = new OneTimeTimeoutThread(); }