#define DEFINE_GLOBALS
#include "streme-utils.h"
#include "red-black-tree.h"
#include "motif-db.h"

// Output directory name installed in options->output_dirname by
// process_command_line() before the command line options are applied.
static char *DEFAULT_OUTPUT_DIRNAME = "enr_out";

/*****************************************************************************
 * Print a usage message with an optional reason for failure, then exit.
 *
 * format  printf-style reason for the failure, or NULL to print only the
 *         usage text; any further arguments are consumed by <format>.
 *
 * Everything is written to stderr and the process terminates with
 * EXIT_FAILURE, so this function never returns to its caller.
 *****************************************************************************/
static void usage(char *format, ...) {
  va_list argp;
  // NOTE: the text below is used as a printf format.  Every conversion in it
  // must have exactly one matching argument, in the same order, in the
  // fprintf() call at the end of this function.  The lines describing the
  // output options are commented out, so they take no arguments.
  char *usage =
    "\n"
    "Usage: enr [options]\n"
    "\n"
    "   Options:\n"
    "     --p <filename>           positive sequence file name (required)\n"
    "     --m <filename>           motif file name (required, may be repeated)\n"
    "     --objfun de|cd           objective function (<objfun>)\n"
    "                                de : Differential Enrichment\n"
    "                                cd : Central Distance\n"
    "                              default: de\n"
    "     --n <filename>           negative sequence file name;\n"
    "                              defaults: if --n is not given, then STREME\n"
    "                              creates negative sequences as follows:\n"
    "                                <objfun> = de, shuffle positive sequences\n"
    "                                         = cd, no negative sequences allowed\n"
    "     --kmer <kmer>            shuffle positive sequences preserving k-mers\n"
    "                              and the positions of the non-core characters\n"
    "                              default: %d (DNA), %d (RNA), %d (Prot), %d (custom)\n"
    //"     --o <output_dir>         output directory; default: '%s'\n"
    //"     --oc <output_dir>        allow overwriting; default: '%s'\n"
    //"     --text                   output text only; overrides --o and --oc;\n"
    //"                              default: create text, HTML and XML files in <output_dir>\n"
    "     --dna                    sequences use standard DNA alphabet (default)\n"
    "     --rna                    sequences use standard RNA alphabet\n"
    "     --protein                sequences use standard protein alphabet\n"
    "     --alph <alph_file>       sequences use alphabet defined in <alph_file>;\n"
    "                              converts to uppercase unless both cases in core\n"
    "     --hofract <hofract>      fraction of sequences in hold-out set;\n"
    "                              default: %g\n"
    "     --seed <seed>            random seed for shuffling sequences;\n"
    "                              default: %d\n"
    "     --bfile <bfile>          background model file; default: motif file freqs\n"
    "     --pseudocount <pc>       pseudocount for creating PWMs from motifs;\n"
    "                              default: %g\n"
    "     --inc <pattern>          name pattern to select as motif; may be\n"
    "                              repeated; default: all motifs are used\n"
    "     --exc <pattern>          name pattern to exclude as motif; may be\n"
    "                              repeated; default: all motifs are used\n"
    "     --verbosity 1|2|3|4|5    level of diagnostic output (default: %d)\n"
    "     --version                print the program version and exit\n"
    "\n"
    ;
  // Print the caller's reason first, set off by blank lines.
  if (format) {
    fprintf(stderr, "\n");
    va_start(argp, format);
    vfprintf(stderr, format, argp);
    va_end(argp);
    fprintf(stderr, "\n");
  }
  // Arguments in the order of the conversions above: four %d (k-mer
  // defaults), %g (hold-out fraction), %d (seed), %g (pseudocount),
  // %d (verbosity).  The casts keep the %g arguments doubles however the
  // default macros happen to be written.
  fprintf(stderr, usage,
    DEFAULT_DNA_KMER, DEFAULT_RNA_KMER, DEFAULT_PROT_KMER, DEFAULT_CUSTOM_KMER,
    (double) DEFAULT_HOFRACT, DEFAULT_SEED,
    (double) DEFAULT_PSEUDOCOUNT, DEFAULT_VERBOSITY);
  fflush(stderr);
  exit(EXIT_FAILURE);
} // usage

// Numeric identifiers for the long options; each one is used as the 'val'
// member of the matching entry in the option table and as a case label when
// the options are processed.
// The values are assigned implicitly starting at 0.  Keep the list to at most
// 62 entries, as otherwise an enumerator would take the value of '?'.
enum Opts {
  OPT_M,            // --m
  OPT_P,            // --p
  OPT_N,            // --n
  OPT_KMER,         // --kmer
  OPT_O,            // --o
  OPT_OC,           // --oc
  OPT_TEXT,         // --text
  OPT_OBJFUN,       // --objfun
  OPT_DNA,          // --dna
  OPT_RNA,          // --rna
  OPT_PROTEIN,      // --protein
  OPT_ALPH,         // --alph
  OPT_BFILE,        // --bfile
  OPT_PSEUDOCOUNT,  // --pseudocount
  OPT_INC,          // --inc
  OPT_EXC,          // --exc
  OPT_HOFRACT,      // --hofract
  OPT_SEED,         // --seed
  OPT_VERBOSITY,    // --verbosity
  OPT_VERSION       // --version
};

/***********************************************************************
 Process command line options and return the command line string.

 argc, argv  the program arguments as received by main()
 options     OUT: zeroed, filled with defaults, then updated from the
             options found in argv

 Returns a newly malloc'ed string holding the base name of argv[0]
 followed by the remaining arguments, separated by single spaces; the
 caller owns it.  Because <options> is zeroed on entry, anything the
 caller wants to keep in it (including the returned string) must be
 stored after this call returns.

 Missing required options and invalid values are reported through
 usage(), which terminates the program.
 ***********************************************************************/
static char *process_command_line(int argc, char* argv[], STREME_OPTIONS_T *options) {
  int i;
  struct option streme_options[] = {
    {"m", required_argument, NULL, OPT_M},
    {"p", required_argument, NULL, OPT_P},
    {"n", required_argument, NULL, OPT_N},
    {"kmer", required_argument, NULL, OPT_KMER},
    {"o", required_argument, NULL, OPT_O},
    {"oc", required_argument, NULL, OPT_OC},
    {"text", no_argument, NULL, OPT_TEXT},
    {"objfun", required_argument, NULL, OPT_OBJFUN},
    {"dna", no_argument, NULL, OPT_DNA},
    {"rna", no_argument, NULL, OPT_RNA},
    {"protein", no_argument, NULL, OPT_PROTEIN},
    {"alph", required_argument, NULL, OPT_ALPH},
    {"hofract", required_argument, NULL, OPT_HOFRACT},
    {"seed", required_argument, NULL, OPT_SEED},
    {"bfile", required_argument, NULL, OPT_BFILE},
    {"pseudocount", required_argument, NULL, OPT_PSEUDOCOUNT},
    {"inc", required_argument, NULL, OPT_INC},
    {"exc", required_argument, NULL, OPT_EXC},
    {"verbosity", required_argument, NULL, OPT_VERBOSITY},
    {"version", no_argument, NULL, OPT_VERSION},
    {NULL, 0, NULL, 0} //boundary indicator
  };

  // Set options to defaults.
  memset(options, 0, sizeof(STREME_OPTIONS_T));
  options->output_dirname = DEFAULT_OUTPUT_DIRNAME;
  options->allow_clobber = True;
  options->text_only = false;
  options->posfile = NULL;
  options->negfile = NULL;
  options->objfun = DEFAULT_OBJFUN;
  options->kmer = -1;                   // flags no option given
  options->alphabet_type = DEFAULT_ALPHABET_TYPE;
  options->alph = NULL;
  options->alph_file = NULL;
  options->minwidth = DEFAULT_MINWIDTH;
  options->maxwidth = DEFAULT_MAXWIDTH;
  options->hofract = DEFAULT_HOFRACT;
  options->seed = DEFAULT_SEED;
  options->motif_sources = arraylst_create();
  options->bg_source = NULL;
  options->pseudocount = DEFAULT_PSEUDOCOUNT;
  options->include_patterns = arraylst_create();
  options->exclude_patterns = arraylst_create();

  // Process arguments.
  // String-valued options store optarg itself (a pointer into argv), not a copy.
  // Numeric options are converted with atoi()/atof(), which do not report
  // malformed input: a non-numeric value is silently read as 0.
  while (1) {
    int opt = getopt_long_only(argc, argv, "", streme_options, NULL);
    if (opt == -1) break;
    switch (opt) {
      case OPT_O: // Set output directory with no clobber
        options->output_dirname = optarg;
        options->allow_clobber = false;
        break;
      case OPT_OC: // Set output directory with clobber
        options->output_dirname = optarg;
        options->allow_clobber = True;
        break;
      case OPT_TEXT: // Output text only
        options->text_only = True;
        break;
      case OPT_M: // Motif file; may be repeated
        arraylst_add(optarg, options->motif_sources);
        break;
      case OPT_P: // Positive sequence file
        options->posfile = optarg;
        break;
      case OPT_N: // Negative sequence file
        options->negfile = optarg;
        break;
      case OPT_OBJFUN: // Objective function: "de" or "cd"
        if (! strcmp(optarg, "de")) {
          options->objfun = DE;
        } else if (! strcmp(optarg, "cd")) {
          options->objfun = CD;
        } else {
          usage("Unknown value for --objfun (%s)\n", optarg);
        }
        break;
      case OPT_KMER:
        options->kmer = atoi(optarg);
        break;
      case OPT_DNA:
        options->alphabet_type = Dna;
        break;
      case OPT_RNA:
        options->alphabet_type = Rna;
        break;
      case OPT_PROTEIN:
        options->alphabet_type = Protein;
        break;
      case OPT_ALPH: // Custom alphabet; the file is loaded further below
        options->alphabet_type = Custom;
        options->alph_file = optarg;
        break;
      case OPT_HOFRACT:
        options->hofract = atof(optarg);
        break;
      case OPT_SEED:
        options->seed = atoi(optarg);
        break;
      case OPT_BFILE:
        options->bg_source = optarg;
        break;
      case OPT_PSEUDOCOUNT:
        options->pseudocount = atof(optarg);
        break;
      case OPT_INC: // Motif name pattern to include; may be repeated
        arraylst_add(optarg, options->include_patterns);
        break;
      case OPT_EXC: // Motif name pattern to exclude; may be repeated
        arraylst_add(optarg, options->exclude_patterns);
        break;
      case OPT_VERBOSITY:
        verbosity = atoi(optarg);
        break;
      case OPT_VERSION:
        fprintf(stdout, VERSION "\n");
        exit(EXIT_SUCCESS);
        break;
      case '?': // getopt has already reported the problem
        usage(NULL);
        break;
      default: // just in case we forget to handle a option
        die("Unhandled option %d", opt);
    }
  }

  // Check that the input is valid.
  if (options->posfile == NULL) {
    usage("You must supply a FASTA file with the positive sequences.");
  }
  if (options->objfun == CD && options->negfile != NULL) {
    DEBUG_FMT(NORMAL_VERBOSE, "# Warning: ignoring negative sequence file (%s) with --objfun cd.\n", options->negfile);
    options->negfile = NULL;
  }
  // Set the negfile = posfile if it is null and we need it.
  if (options->objfun != CD && options->negfile == NULL) {
    options->negfile = options->posfile;
  }
  if (arraylst_size(options->motif_sources) == 0) {
    usage("You must provide at least one motif file using --m.");
  }
  // Set the MEME-style alphabet and set defaults if they were not given for --kmer.
  if (options->alphabet_type == Dna) {
    options->alph = alph_dna();
    if (options->kmer == -1) options->kmer = DEFAULT_DNA_KMER;
  } else if (options->alphabet_type == Rna) {
    options->alph = alph_rna();
    if (options->kmer == -1) options->kmer = DEFAULT_RNA_KMER;
  } else if (options->alphabet_type == Protein) {
    options->alph = alph_protein();
    if (options->kmer == -1) options->kmer = DEFAULT_PROT_KMER;
  } else if (options->alphabet_type == Custom) {
    options->alph = alph_load(options->alph_file, True); // load custom alphabet
    // Die if a MEME-style alphabet did not load successfully.
    if (options->alph == NULL) exit(EXIT_FAILURE);
    if (options->kmer == -1) options->kmer = DEFAULT_CUSTOM_KMER;
  }
  // Check the verbosity level.
  if (verbosity < 1 || verbosity > 5) {
    usage("The verbosity level must be in the range [1, ..., 5]. verbosity = %d", verbosity);
  }

  // Rebuild the command line as one string: the base name of the program
  // followed by every argument, with one space between words.
  // basename() is called once and its result reused for both passes.
  char *progname = (argc > 0) ? basename(argv[0]) : NULL;
  size_t line_length = 0;
  for (i = 0; i < argc; i++) line_length += strlen(i == 0 ? progname : argv[i]);
  // One byte per word covers the separating spaces; the extra byte guarantees
  // room for the terminating '\0' even if there are no words at all.
  char *commandline = malloc(line_length + (size_t)argc + 1);
  if (commandline == NULL) die("Unable to allocate memory for the command line string.");
  size_t nextpos = 0;
  for (i = 0; i < argc; i++) {
    // been here before? put in a space before adding the next word
    if (nextpos > 0) commandline[nextpos++] = ' ';
    char *nextword = (i == 0) ? progname : argv[i];
    size_t wordlen = strlen(nextword);
    memcpy(&commandline[nextpos], nextword, wordlen);
    nextpos += wordlen;
  }
  commandline[nextpos] = '\0';
  return(commandline);
} // process_command_line

//
// Release the storage held by the program options and the sequence set,
// then check for leaks.
//
//   options   the program options; its alphabet and its three lists
//             (motif sources, include patterns, exclude patterns) are released
//   multiseq  the sequence set; its sequence buffer and then the set
//             itself are released
//
void cleanup(STREME_OPTIONS_T *options, Multiseq *multiseq) {
  DEBUG_MSG(NORMAL_VERBOSE, "# Freeing storage...\n");

  // Options: the alphabet first, then each list.  NULL is passed as the
  // first argument of arraylst_destroy() for every list.
  alph_release(options->alph);
  arraylst_destroy(NULL, options->motif_sources);
  arraylst_destroy(NULL, options->include_patterns);
  arraylst_destroy(NULL, options->exclude_patterns);

  // Sequences: the buffer goes before the structure that points to it.
  FREESPACE_TLB(multiseq->sequence);
  freemultiseq(multiseq);

  // Leak check comes last, after everything above has been released.
  mmcheckspaceleak();
} // cleanup

/*************************************************************************
 * Read in the motif databases.
 *
 * Loads every file named in options->motif_sources with
 * read_motifs_and_background(), records the width of the widest motif in
 * *max_width and, unless separate_namespaces is set, excludes any motif
 * whose ID has already been seen in this or an earlier database.
 *
 * Returns a list holding one MOTIF_DB_T per motif source, in the order of
 * options->motif_sources.  Calls die() if the databases hold no motifs.
 *************************************************************************/
ARRAYLST_T *enr_load_motifs_and_background(
  STREME_OPTIONS_T *options,
  bool separate_namespaces,     // keep motif DB namespaces separate
  bool xalph,                   // convert motifs to alphabet specified in options
  ARRAY_T **background,         // OUT the background
  int *max_width                // OUT maximum motif width
)
{
  int db_i, num_motifs = 0;
  ARRAYLST_T *dbs = arraylst_create_sized(arraylst_size(options->motif_sources));
  // IDs of all motifs accepted so far; only consulted when the namespaces
  // of the databases are merged.
  RBTREE_T *motif_names = rbtree_create(rbtree_strcmp, NULL, NULL, NULL, NULL);

  ALPH_T *alph = options->alph;
  assert(alph != NULL);

  bool stdin_used = false;      // set if path is "-"
  *max_width = 0;               // maximum motif width
  for (db_i = 0; db_i < arraylst_size(options->motif_sources); db_i++) {

    // Get the name of the next motif db.
    char *motif_source = (char*) arraylst_get(db_i, options->motif_sources);
    DEBUG_FMT(NORMAL_VERBOSE, "# Loading motifs from file '%s'\n", motif_source);

    // Load the motifs from this file.
    MOTIF_DB_T *db = read_motifs_and_background(
      db_i,                     // id of DB file
      motif_source,             // motif file name (or special word)
      "Query motifs",           // type of database for error messages
      NULL,                     // get one motif by name
      NULL,                     // get one motif by index
      options->include_patterns,// get set of motifs by name (or NULL)
      options->exclude_patterns,// exclude this set of motifs by name (or NULL)
      true,                     // allow motifs with zero probability entries
      false,                    // don't create RC copies, appended
      options->pseudocount,     // multiply times background model
      false,                    // set_trim
      0,                        // trim_bit_threshold
      &(options->bg_source),    // background file; may be changed
      true,                     // make bg symmetrical if alph complementable
      background,               // will be set if id==0
      options->posfile,         // sequence file name
      alph,                     // sequence alphabet
      xalph,                    // set motif conversion alphabet
      false,                    // don't remove extension from name
      false,                    // don't remove ".meme" extension from name
      false,                    // don't replace underscores in name
      &stdin_used               // IN/OUT check and set if path is "-"
    );

    // Get maximum motif width and check for motif name uniqueness across all DBs.
    // The motif index has its own name so that it cannot be confused with
    // the database index of the enclosing loop.
    int warn_type = NO_WARNING;
    int motif_i;
    for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) {
      MOTIF_T *motif = arraylst_get(motif_i, db->motifs);
      if (! separate_namespaces) {
        // Look the ID up, inserting it if it is new; <created> reports
        // whether an insertion took place.  The node itself is not needed.
        bool created;
        (void) rbtree_lookup(motif_names, get_motif_id(motif), true, &created);
        if (!created) {
          clump_motif_db_warning(&warn_type, DUPLICATE_WARNING, "Warning: The following "
            "duplicate motifs in '%s' were excluded:\n  ", motif_source, get_motif_id(motif));
          // NOTE(review): the motif is destroyed here but its pointer stays
          // in db->motifs and is still counted by arraylst_size() below.
          // Callers passing separate_namespaces == false should confirm that
          // they never touch these entries.
          destroy_motif(motif);
          db->loaded--;
          db->excluded++;
          continue;
        }
      } // namespaces
      // Get width of widest motif.
      int w = get_motif_length(motif);
      if (w > *max_width) *max_width = w;
    }

    // Add DBs to the list of DBs
    arraylst_add(db, dbs);

    // Number of motifs found.
    num_motifs += arraylst_size(db->motifs);

    // Finish the line of clumped warnings, if any were printed for this DB.
    if (warn_type && verbosity >= NORMAL_VERBOSE) fprintf(stderr, "\n");
  } // motif_files

  // Check that we found suitable motifs.
  if (num_motifs == 0) die("No acceptable motifs found.");

  // Cleanup
  rbtree_destroy(motif_names);

  return(dbs);
} // enr_load_motifs_and_background

/*************************************************************************
 * Convert motif to STREME model format.
 * Only sets the STREME PSPM matrix from the MEME motif frequency matrix,
 * together with the model's alphabet, alphabet size and width; the score
 * threshold is set to 0 and every other field of the model is zeroed.
 *
 * motif     the MEME motif to convert
 * multiseq  not used by this function
 *
 * Returns a newly allocated model that the caller releases with free().
 * Exits the program if the model cannot be allocated.
 *************************************************************************/
Model *motif2streme(
  MOTIF_T *motif,
  Multiseq *multiseq
) {
  int i, j;
  (void) multiseq;      // unused; kept so that the interface does not change

  // calloc() rather than malloc(): fields that are not assigned below then
  // start out as zero instead of holding indeterminate values.
  Model *model = calloc(1, sizeof *model);
  if (model == NULL) {
    fprintf(stderr, "Unable to allocate memory for a STREME model.\n");
    exit(EXIT_FAILURE);
  }
  model->alph = get_motif_alph(motif);
  model->alen = alph_size_core(model->alph);
  model->width = get_motif_length(motif);
  // PSPM
  // The motif matrix is read as (position, letter) while probs is indexed
  // as [letter][position].
  // NOTE(review): nothing here checks that alen and width fit within the
  // dimensions of model->probs -- confirm against the Model definition.
  for (i=0; i<model->alen; i++)
    for (j=0; j<model->width; j++)
      model->probs[i][j] = get_matrix_cell(j, i, get_motif_freqs(motif));
  model->score_threshold = 0;
  return(model);
} // motif2streme

/*
  Compute and print STREME objective function scores for each motif in the input.
*/
int main(int argc, char *argv[])
{
  int i, db_i, motif_i, num_motifs;
  STREME_OPTIONS_T options;
  ARRAY_T *background = NULL;           // the motif background
  ARRAYLST_T *dbs = NULL;               // the motif DBs
  Model **models = NULL;

  // command line processing
  options.commandline = process_command_line(argc, argv, &options);

  srand_mt(options.seed);		// random seed for shuffling sequences
  set_randfunc((randfunc_t) random_mt); // for ushuffle
  INITPOWER2TABLE();			// needed by get_seqno_list()

  // Initialize the alphabet.
  Uint alen = initialize_st_alphabet(options.alph);
  BOOL do_rc = alph_has_complement(options.alph);

  // Load the motifs and the background model.
  int max_width;
  dbs = enr_load_motifs_and_background(
    &options,
    true,               // keep motif DB namespaces separate
    false,              // convert motifs to the user-specified alphabet
    &background,        // OUT background model
    &max_width          // OUT maximum motif width
  );

  // Input the sequences, converting non-core characters to SEPARATOR.
  Multiseq *test_multiseq = NULL;
  Multiseq *multiseq = read_pos_neg_seqs(&options, do_rc, &test_multiseq);

  // Print the header.
  fprintf(stdout, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", 
    "ID", "ALT_ID", "POS_MATCHES", "NEG_MATCHES", "SCORE_THR", "RATIO", "PVALUE", "LOG10_PVALUE");

  // Score each motif and print the results as TSV.
  for (db_i = 0, i = 1; db_i < arraylst_size(options.motif_sources); db_i++) {
    MOTIF_DB_T *db = arraylst_get(db_i, dbs);
    num_motifs = arraylst_size(db->motifs); // number of motifs in this DB
    for (motif_i = 0; motif_i < num_motifs; motif_i++) {
      MOTIF_T *motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs);
      // Create the PSSM.
      Model *model = motif2streme(motif, multiseq);
      // Score the motif.
      score_model_pssm(&options, multiseq, model, True, True, NONE, False);
      // Print the result.
      double m1, e1, prec=1; \
      exp10_logx((model)->train_log_pvalue/log(10), m1, e1, prec); \
      fprintf(stdout, "%s\t%s\t%d\t%d\t%.2f\t%.3f\t%3.1fe%+04.0f\t%.2f\n", 
        get_motif_id(motif), get_motif_id2(motif), 
        model->train_pos_count, model->train_neg_count, model->score_threshold, model->train_ratio, m1,e1, model->train_log_pvalue);
      fflush(stdout);
      // Free the model.
      free(model);
    }
  }

  // Clean up.
  cleanup(&options, multiseq);

  return(EXIT_SUCCESS);
} // main
