#include <stdlib.h>
#include <string.h>
#include <iconv.h>

#include "XmlSupport.h"

/* We were spending A LOT of time converting XML from the internal format to
 * a string.  This was especially true when trying to send real-time alerts
 * to people immediately after the open.  So I optimized this operation.
 *
 * For my test data the optimized version of asString takes about 1/6 of the
 * time as the un-optimized version.  There were two large components of this
 * change.  By reusing one string, rather than creating a lot of strings, we
 * reduced the time significantly.  And I got a surprisingly big improvement by
 * using a char * to iterate in htmlspecialchars rather than string::iterator.
 *
 * A previous version of this code tried to use reserve to set the strings to
 * the correct size in advance.  Unfortunately we needed to do a lot of work
 * to get this value correct.  Although it was an improvement over the original
 * code, this version is more than twice as fast.  And this version is much
 * simpler!
 *
 * I also tried switching from a string to a rope.  (I did this before I did
 * any optimizations.)  That change actually made asString take 3 times as
 * long.  That was a big surprise.  Even when I completely disabled
 * htmlspecialchars, the rope version was slower than the original (complete)
 * version.  Based on the implementation described on the SGI web site, I
 * thought this would have been an ideal case for ropes.
 */

#include <stdio.h>

// Reports whether every NUL-delimited piece of s converts cleanly with
// mbstowcs().  The answer follows the current locale, so it only means
// "valid UTF-8" when the program is running in a UTF-8 locale.
static bool isValidUtf8(std::string const &s)
{
  const char *const limit = s.c_str() + s.size();
  // mbstowcs() stops at the first NUL, so a string with embedded NULs is
  // checked one piece at a time.  Passing a NULL destination only asks for
  // the converted length; used in this way mbstowcs() is surprisingly fast.
  for (const char *segment = s.c_str();
       segment < limit;
       segment += strlen(segment) + 1)
    {
      const size_t converted = mbstowcs(NULL, segment, 0);
      if (converted == (size_t)-1)
        // One illegal sequence is enough; there is no need to look further.
        return false;
    }
  return true;
}

// XML messages can only contain valid strings.  Delphi was pretty lax here,
// taking any character it didn't know as itself.  But Java definitely would
// bomb on bad sequences, and presumably C# will as well.
//
// There are three separate issues here.
//
// 1)  XML doesn't like most control codes, i.e. characters less than 32.  We
//     solve that by completely removing those codes.  These values *are* valid
//     UTF-8 characters.  I'm not sure how these get in there, but we did
//     have a problem once, so we added this step.  I've never seen this step
//     cause a problem.
//
//     I'm not sure if this step is complete.  There are other characters
//     which are listed as invalid or discouraged in an XML document.  I
//     haven't seen a problem, but maybe it's worth testing.  To handle all of
//     these, we'd have to change the algorithm.  I'm treating 0x7f as a bad
//     character and throwing it out because it is easy and I don't think that
//     will hurt.
//
// 2) There are a few characters like < and > which would confuse XML. We quote
//    those more or less like PHP's htmlspecialchars() function.  This always
//    works perfectly.
//
// 3) The database might contain strings which are not valid UTF-8.  Typically
//     stuff from the web site is UTF-8, where stuff from (the Delphi version
//     of) TI Pro is in an encoding which varies from one computer to the next.
//     That encoding is never shared with the server; the server just copies
//     the values from the client, stores them, and sends them back, verbatim.
//
//     Since we are not 100% sure what's in the database, we have to make some
//     assumptions.  First we check if the string could be interpreted as a
//     valid UTF-8 string.  In that case we send the string as is.
//
//     If the string is not valid as UTF-8, then we quote every byte that is
//     greater than 127.  We quote it as a number, not a name.  A parser that
//     works with unicode will convert those into latin-1 characters, which is
//     a reasonable guess at what they are.  The Delphi parser will convert
//     these back into the original number, so the copy will look just like
//     the original.
//
//     This strategy will always generate XML that is legal.  It will usually
//     generate values which are correct.  I suppose it could fail because
//     the special characters just happen to look like valid UTF-8 when in
//     fact they were created by another encoding, but that doesn't seem
//     likely.  Also, if someone was using an encoding other than latin-1 (like
//     hebrew) on the Delphi client, and saved values, then tried to load his
//     settings from Java or C#, he'll find his settings have been changed into
//     random Latin-1 characters.  That last case is bound to happen, but not
//     a lot.  It's the best we can do.
//
//     Of course, if something was stored in UTF-8 by a client that knows
//     unicode, and then the user switches to the Delphi client, his special
//     characters will all turn into random Latin-1.  
//
// Note:  The quoting described here only works if your locale is set properly.
// Normally it is enough to say setlocale(LC_ALL, "") in main(), assuming your
// environment variables are not too crazy.  If you forget that step, however,
// things won't be so bad.  By default, C++ uses a locale with only characters
// between 0 and 127.  The Delphi clients will see pretty much the same thing
// as always.  The unicode clients trying to read valid unicode stuff from the
// database will see the problem with random Latin-1 characters.  Even if they
// were only using valid Latin-1 characters, the message will still not be
// correct.  Of course, if the message only contains characters between 0
// and 127, everything will work fine.

static void xmlQuote(std::string &output, std::string const &input)
{
  const char * const inputStart = input.data();
  const char * const inputEnd = inputStart + input.size();
  const bool validUtf8 = isValidUtf8(input);
  for (char const *it = inputStart; it != inputEnd; it++)
    {
      switch(*it)
	{
	case '&' :
	  output.append("&amp;", 5);
	  break;
	case '"' :
	  output.append("&quot;", 6);
	  break;
	case '<' :
	  output.append("&lt;", 4);
	  break;
	case '>' :
	  output.append("&gt;", 4);
	  break;
	case '\t' :
	  // These seem the be the only valid control characters in XML.
	  // The others will cause a problem, even if they are quoted.
	  // http://www.w3.org/TR/xml11/#charsets
	  // http://www.w3.org/TR/REC-xml/#charsets
	  output += *it;
	  break;
	case '\n' :
	  // Some XML readers seem to turn \r and \n into " ".
	  // Mike found this problem with GWT.
	  // I think I saw some similar problems where \r\n were being changed
	  // into \n, or something similar, but only on certain browsers.
	  output.append("&#10;", 5);
	  break;
	case '\r' :
	  output.append("&#13;", 5);
	  break;
	default :
	  {
	    int ch = (unsigned char)(*it);
	    if ((ch < 32) || (ch == 127))
	      { // Ignore all invalid control characters.  Do not try to
		// quote them.  It will change the error message, but the
		// Java code still cannot deal with them.
		// http://bytes.com/forum/thread86931.html
		// The Java code definately bombed on charactar 0x0b.  I got
		// a different error message when I quoted it, but it still
		// failed.
	      }
	    else if (validUtf8 || (ch < 127))
	      { // Normal printable ASCII or UTF-8 characters. 
		output += *it;
	      }
	    else
	      { // The Delphi client will convert these back to the bytes that
		// we see here.  For clients that understand unicode, this
		// translation will be correct as long as the source was in
		// Latin-1.
		char buffer[30];
		// According to
		// http://www.htmlhelp.com/reference/html40/entities/
                // browsers offer better support for decimal notation than hex.
		int count = sprintf(buffer, "&#%d;", ch);
		output.append(buffer, count);
		// I assume that this option will not add a huge performance
		// penalty, since this only applies to occasional messages,
		// like the contents of the config window.  It will not apply
		// to the alert messages which make up the bulk of our data.
	      }
	  }
	}
    }
}

// cdataQuote is an alternative to xmlQuote.  The output is designed to be
// used in XML.  In some cases the output might be more readable and/or
// shorter than from xmlQuote.
//
// For the most part, cdataQuote just wraps the input between <![CDATA[
// and ]]> tags.  But there are some special cases.  Most obviously, if
// we have ]]> in the input, we have to do some special logic to that.
//
// As with xmlQuote, we ensure that the output only contains valid characters.
// If we see something that is not valid in the input, we try to fix it.
// As with xmlQuote, invalid control characters are deleted.  If we see 
// invalid bytes above the valid ASCII range, we assume they are LATIN-1 and
// convert them into unicode.  Unlike xmlQuote, this code will try to convert
// the LATIN-1 codes into equivalent UTF-8 codes.  There are some codes that
// are valid but meaningless, such as character 127.  That is to say that they
// don't hurt the XML parser (like a null would) but they don't convey any
// meaning, either.  cdataQuote removes all of these when translating,
// not just the ones it has to.
//
// The result is that cdataQuote will correctly encode valid LATIN-1 data,
// in case some is still hiding in the database.  And it will always spit out
// valid XML.  But it doesn't deal with some of the special cases dealing with
// Delphi and other character sets.  It would be better to call xmlQuote if you
// think you are talking with the Delphi code.  cdataQuote is initially aimed
// at our javascript proxy.  Note that the main program has to choose to
// use cdata quoting.  We don't try to pick it automatically.

// Table-driven CDATA quoting.  The constructor fills in two 256-entry
// tables that map each input byte to the text written for it; quote() then
// translates a string one byte at a time.
class CdataQuoter
{
public:
  CdataQuoter();
  // Appends input to output, wrapped in CDATA markers.
  void quote(std::string &output, std::string const &input) const;

private:
  // Replacement text per byte.  quote() uses utf8Convert when the input
  // passes isValidUtf8() and latin1Convert otherwise.  An entry that was
  // never assigned is an empty string, which makes the byte disappear from
  // the output.
  std::string utf8Convert[256];
  std::string latin1Convert[256];

  // Opens a CDATA section.
  static const std::string start;
  // Written in place of a "]]>" found in the data: it ends the current
  // section after "]]" and opens a new one that begins with ">".
  static const std::string restart;
  // Closes a CDATA section.
  static const std::string end;
};

const std::string CdataQuoter::start("<![CDATA[");
const std::string CdataQuoter::restart("]]]]><![CDATA[>");
const std::string CdataQuoter::end("]]>");

// Appends input to output as one or more CDATA sections.
//
// Every input byte is replaced by its entry in utf8Convert (when the whole
// input passes isValidUtf8()) or in latin1Convert (otherwise).  Bytes whose
// entry is empty are dropped.
//
// A "]]>" in the data would end the section early, so the section is closed
// and reopened between the "]]" and the ">".  The terminator is looked for in
// the text that is actually written, not in the raw input, because dropped
// bytes can hide it: the input "]]\x0b>" contains no "]]>", but once the
// invalid 0x0b is removed the output would.
void CdataQuoter::quote(std::string &output, std::string const &input) const
{
  output += start;
  const bool validUtf8 = isValidUtf8(input);
  std::string const * const convert = validUtf8?&utf8Convert[0]:&latin1Convert[0];
  char const *current = input.data();
  char const *const stop = current + input.size();
  for (; current < stop; current++)
    {
      if (*current == '>')
        {
          // The section marker written above ends in '[', so two trailing
          // ']' can only be data that belongs to the current section.
          const std::string::size_type written = output.size();
          if ((written >= 2)
              && (output[written - 1] == ']')
              && (output[written - 2] == ']'))
            {
              // The "]]" is already in the output.  Append the rest of the
              // restart sequence, "]]><![CDATA[>", which also supplies the
              // '>' itself.
              output.append(restart, 2, std::string::npos);
              continue;
            }
        }
      output += convert[(unsigned char)*current];
    }
  output += end;
}

// Fills in the two byte-to-text tables used by quote().  Any entry that is
// not assigned here stays an empty string, so quote() removes that byte.
CdataQuoter::CdataQuoter()
{
  // I believe these are the only valid control characters in XML.
  utf8Convert[(int)'\t'] = latin1Convert[(int)'\t'] = "\t";
  utf8Convert[(int)'\r'] = latin1Convert[(int)'\r'] = "\r";
  utf8Convert[(int)'\n'] = latin1Convert[(int)'\n'] = "\n";
  // Normal ASCII characters.  127 is deliberately left out.
  for (int i = 32; i < 127; i++)
    utf8Convert[i] = latin1Convert[i] = std::string(1, (char)i);
  // If we know that the input was valid utf-8, we copy these byte for
  // byte.
  for (int i = 128; i < 256; i++)
    utf8Convert[i] = std::string(1, (char)i);
  // If we don't have UTF-8, we assume we have latin-1 and convert that to
  // UTF-8.  Bytes 128 to 159 are left empty and therefore dropped.
  //
  // A Latin-1 byte has the same value as its Unicode code point, and every
  // code point from U+00A0 to U+00FF is exactly two bytes in UTF-8:
  // 110000xx 10xxxxxx.  Computing that directly cannot fail, unlike opening
  // an iconv descriptor, whose failure went unchecked before.
  for (int i = 160; i < 256; i++)
    {
      char encoded[2];
      encoded[0] = (char)(0xC0 | (i >> 6));
      encoded[1] = (char)(0x80 | (i & 0x3F));
      latin1Convert[i] = std::string(encoded, 2);
    }
}

// The single CdataQuoter used by this file.  Its constructor builds the
// conversion tables when this object is initialized.
// NOTE(review): anything that serializes an XmlNode with useCdata set before
// this object has been constructed (for example from a static initializer in
// another file) would see empty tables -- confirm nothing does that.
static CdataQuoter cdataQuoter;

// Returns true when data passes isValidUtf8() and contains no control
// characters other than \n, \r and \t.  Byte 127 (delete) counts as a
// control character.
bool XmlNode::binarySafe(std::string const &data)
{
  if (!isValidUtf8(data))
    return false;
  const std::string::size_type length = data.size();
  for (std::string::size_type index = 0; index < length; ++index)
    {
      const unsigned char ch = data[index];
      const bool allowedControl = (ch == '\n') || (ch == '\r') || (ch == '\t');
      if ((ch < ' ') && !allowedControl)
        return false;
      if (ch == 127)
        // Treat delete just like any other control character.
        return false;
    }
  return true;
}

// Returns this node and everything below it as XML text.  recommendedName
// is passed on to addToString().
std::string XmlNode::asString(const std::string &recommendedName) const
{
  // Everything is appended to this one string.
  //
  // Surprisingly, reserving space up front (e.g. reserve(14000)) does not
  // help by any measurable amount.  In other versions of this code, when we
  // were creating a lot more strings, reserve was very helpful.
  std::string serialized;
  addToString(serialized, recommendedName);
  return serialized;
}

// Element name handed to ordered children by addToString(); a child only
// uses it when it has no name of its own.
static const std::string s_NODE("NODE");

void XmlNode::addToString(std::string &output, const std::string &recommendedName) const
{
  std::string realName;
  if (name.empty())
    {
      realName = recommendedName;
    }
  else
    {
      realName = name;
    }
  // We should really check to make sure that the name is valid.  Due to a bug
  // we sometimes saw a name that was "2", or similar.  The Delphi code seems
  // to ignore this.  The Java code threw an exception.
  output += '<';
  output += realName;
  for (PropertyList::const_iterator property = properties.begin();
       property != properties.end();
       property++)
    {
      output += ' ';
      output += property->first;
      output.append("=\"", 2);
      xmlQuote(output, property->second);
      output += '"';
    }
  if (namedChildren.empty() && orderedChildren.empty() && text.empty())
    {
      output.append(" />", 3);
    }
  else
    {
      output += '>';
      for (std::map< std::string, XmlNode >::const_iterator namedChild = 
	     namedChildren.begin();
	   namedChild != namedChildren.end();
	   namedChild++)
	{
	  namedChild->second.addToString(output, namedChild->first);
	}
      for (std::vector< XmlNode >::const_iterator orderedChild =
	     orderedChildren.begin();
	   orderedChild != orderedChildren.end();
	   orderedChild++)
	{
	  orderedChild->addToString(output, s_NODE);
	}
      if (useCdata)
	cdataQuoter.quote(output, text);
      else
	xmlQuote(output, text);
      output.append("</", 2);
      output += realName;
      output += '>';
    }
}

// Empties the name, the text, the properties and both child collections.
// NOTE(review): useCdata is not touched here -- confirm that a cleared node
// is meant to keep its previous quoting mode.
void XmlNode::clear()
{
  // Plain strings first ...
  name.clear();
  text.clear();
  // ... then the containers.
  properties.clear();
  namedChildren.clear();
  orderedChildren.clear();
}

// Returns true when the node has no name, no text, no properties and no
// children of either kind.
bool XmlNode::empty() const
{
  if (!name.empty())
    return false;
  if (!properties.empty())
    return false;
  if (!namedChildren.empty() || !orderedChildren.empty())
    return false;
  return text.empty();
}

#ifdef __UNIT_TEST_

// g++ -Wall -ggdb -D__UNIT_TEST_ XmlSupport.C MiscSupport.C

#include <iostream>

// Adds one test case under parent.  The case carries value twice, once as
// a child with the default quoting and once as a child with CDATA quoting,
// plus a "name" property and a "mode" property that records what
// XmlNode::binarySafe() says about value.
void test(XmlNode &parent, std::string name, std::string value)
{
  // presumably parent[-1] appends a new ordered child -- see the XmlNode
  // declaration to confirm.
  XmlNode &entry = parent[-1];
  entry.properties["name"] = name;

  entry["default"].text = value;

  XmlNode &cdataChild = entry["cdata"];
  cdataChild.text = value;
  cdataChild.useCdata = true;

  const char *const mode = XmlNode::binarySafe(value) ? "text" : "binary";
  entry.properties["mode"] = mode;
}

int main(int, char **)
{
  setlocale(LC_ALL, "");
  XmlNode main;
  test(main, "empty", "");
  test(main, "simple", "simple");
  test(main, "special", "&\"<>");
  test(main, "real top list", "<API><TOPLIST SHORT_FORM=\"form=1&amp;show0=D_Symbol&amp;col_ver=1&amp;sort=MaxRV&amp;X_NYSE=on&amp;X_ARCA=on&amp;X_AMEX=on&amp;XN=on\" TYPE=\"info\" WINDOW=\"TL1\" WINDOW_NAME=""><COLUMNS><c_D_Symbol CODE=\"D_Symbol\" DESCRIPTION=\"Symbol\" FORMAT=\"\" TEXT_FIELD=\"1\" TEXT_HEADER=\"1\" de_DESCRIPTION=\"Kürzel\" /></COLUMNS><SORT_BY FIELD=\"RV\" /></TOPLIST></API>");
  test(main, "XML", "<API><FIRST NAME=\"philip\">philip</FIRST><MORE>&amp;&lt</MORE></API>");
  std::string toTest;
  for (int i = 32; i < 127; i++)
    toTest += (char)i;
  test(main, "ascii", toTest);
    toTest += "¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
  test(main, "utf8", toTest);
  toTest.clear();
  for (int i = 1; i < 256; i++)
    toTest += (char)i;
  test(main, "all bytes but null", toTest);
  toTest.clear();
  for (int i = '~'; i < 165; i++)
    toTest += (char)i;
  test(main, "past ascii", toTest);
  toTest = '\0' + toTest;
  test(main, "null & past ascii", toTest);
  test(main, "cdata", "<![CDATA[<MAIN><TEST /></MAIN>]]>");
  for (int i = 0; i < 256; i++)
    test(main, "byte " + ntoa(i), std::string(1, i));
  std::cout<<"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"<<main.asString()<<std::endl;
}

#endif